{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:PIVXBVBAOLKWUQPKSIGOF4TZJ3","short_pith_number":"pith:PIVXBVBA","schema_version":"1.0","canonical_sha256":"7a2b70d42072d56a41ea920ce2f2794ede3ea8ab088cff4c6d9b13aae29c9460","source":{"kind":"arxiv","id":"2601.04805","version":2},"attestation_state":"computed","paper":{"title":"Thinking-Based Non-Thinking: Solving the Reward Hacking Problem in Training Hybrid Reasoning Models via Reinforcement Learning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Boyan Wang, Fanyu Meng, Jiaheng Liu, Jing Huo, Junlan Feng, Linjian Meng, Runqing Miao, Siyuan Gan, Tianpei Yang, Yang Gao, Yuyao Zhang","submitted_at":"2026-01-08T10:38:41Z","abstract_excerpt":"Large reasoning models (LRMs) have attracted much attention due to their exceptional performance. However, their performance mainly stems from thinking, a long Chain of Thought (CoT), which significantly increase computational overhead. To address this overthinking problem, existing work focuses on using reinforcement learning (RL) to train hybrid reasoning models that automatically decide whether to engage in thinking or not based on the complexity of the query. Unfortunately, using RL will suffer the the reward hacking problem, e.g., the model engages in thinking but is judged as not doing s"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2601.04805","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-01-08T10:38:41Z","cross_cats_sorted":[],"title_canon_sha256":"e0c71903d4878f268be809643cb56054467197833dec6ac734fde257a6a5c62c","abstract_canon_sha256":"0f9925be7a07d13e8c10a7d6fc85772d607ec78b535766ce4b78735db2434cc2"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T01:05:12.455517Z","signature_b64":"42DPYbqXOIfFhHKXXvM93uJ6V1/t45bUudF7FHxjL4UlLSZ7UPzTTaEevfNw0FKIuH8eYouhwaP3RZ9xoIaCCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7a2b70d42072d56a41ea920ce2f2794ede3ea8ab088cff4c6d9b13aae29c9460","last_reissued_at":"2026-06-09T01:05:12.454962Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T01:05:12.454962Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Thinking-Based Non-Thinking: Solving the Reward Hacking Problem in Training Hybrid Reasoning Models via Reinforcement Learning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Boyan Wang, Fanyu Meng, Jiaheng Liu, Jing Huo, Junlan Feng, Linjian Meng, Runqing Miao, Siyuan Gan, Tianpei Yang, Yang Gao, Yuyao Zhang","submitted_at":"2026-01-08T10:38:41Z","abstract_excerpt":"Large reasoning models (LRMs) have attracted much attention due to their exceptional performance. However, their performance mainly stems from thinking, a long Chain of Thought (CoT), which significantly increase computational overhead. To address this overthinking problem, existing work focuses on using reinforcement learning (RL) to train hybrid reasoning models that automatically decide whether to engage in thinking or not based on the complexity of the query. Unfortunately, using RL will suffer the the reward hacking problem, e.g., the model engages in thinking but is judged as not doing s"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.04805","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.04805/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2601.04805","created_at":"2026-06-09T01:05:12.455027+00:00"},{"alias_kind":"arxiv_version","alias_value":"2601.04805v2","created_at":"2026-06-09T01:05:12.455027+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.04805","created_at":"2026-06-09T01:05:12.455027+00:00"},{"alias_kind":"pith_short_12","alias_value":"PIVXBVBAOLKW","created_at":"2026-06-09T01:05:12.455027+00:00"},{"alias_kind":"pith_short_16","alias_value":"PIVXBVBAOLKWUQPK","created_at":"2026-06-09T01:05:12.455027+00:00"},{"alias_kind":"pith_short_8","alias_value":"PIVXBVBA","created_at":"2026-06-09T01:05:12.455027+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PIVXBVBAOLKWUQPKSIGOF4TZJ3","json":"https://pith.science/pith/PIVXBVBAOLKWUQPKSIGOF4TZJ3.json","graph_json":"https://pith.science/api/pith-number/PIVXBVBAOLKWUQPKSIGOF4TZJ3/graph.json","events_json":"https://pith.science/api/pith-number/PIVXBVBAOLKWUQPKSIGOF4TZJ3/events.json","paper":"https://pith.science/paper/PIVXBVBA"},"agent_actions":{"view_html":"https://pith.science/pith/PIVXBVBAOLKWUQPKSIGOF4TZJ3","download_json":"https://pith.science/pith/PIVXBVBAOLKWUQPKSIGOF4TZJ3.json","view_paper":"https://pith.science/paper/PIVXBVBA","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2601.04805&json=true","fetch_graph":"https://pith.science/api/pith-number/PIVXBVBAOLKWUQPKSIGOF4TZJ3/graph.json","fetch_events":"https://pith.science/api/pith-number/PIVXBVBAOLKWUQPKSIGOF4TZJ3/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PIVXBVBAOLKWUQPKSIGOF4TZJ3/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PIVXBVBAOLKWUQPKSIGOF4TZJ3/action/storage_attestation","attest_author":"https://pith.science/pith/PIVXBVBAOLKWUQPKSIGOF4TZJ3/action/author_attestation","sign_citation":"https://pith.science/pith/PIVXBVBAOLKWUQPKSIGOF4TZJ3/action/citation_signature","submit_replication":"https://pith.science/pith/PIVXBVBAOLKWUQPKSIGOF4TZJ3/action/replication_record"}},"created_at":"2026-06-09T01:05:12.455027+00:00","updated_at":"2026-06-09T01:05:12.455027+00:00"}