{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:M3OIQ5MS6SYSDRRIORQJRRTGHL","short_pith_number":"pith:M3OIQ5MS","canonical_record":{"source":{"id":"2509.21882","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-09-26T05:06:25Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"6f36e13162ef68d97956f49b23689294f4f70281f9e34ddbb08ddb6a4023b3cc","abstract_canon_sha256":"9ec84cad11b0802376973a89f9ce57bb8bc16d2e2018971b8964bbd670ed54fb"},"schema_version":"1.0"},"canonical_sha256":"66dc887592f4b121c628746098c6663af6225570419face1f7fef42442a90a32","source":{"kind":"arxiv","id":"2509.21882","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2509.21882","created_at":"2026-05-27T01:04:51Z"},{"alias_kind":"arxiv_version","alias_value":"2509.21882v3","created_at":"2026-05-27T01:04:51Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.21882","created_at":"2026-05-27T01:04:51Z"},{"alias_kind":"pith_short_12","alias_value":"M3OIQ5MS6SYS","created_at":"2026-05-27T01:04:51Z"},{"alias_kind":"pith_short_16","alias_value":"M3OIQ5MS6SYSDRRI","created_at":"2026-05-27T01:04:51Z"},{"alias_kind":"pith_short_8","alias_value":"M3OIQ5MS","created_at":"2026-05-27T01:04:51Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:M3OIQ5MS6SYSDRRIORQJRRTGHL","target":"record","payload":{"canonical_record":{"source":{"id":"2509.21882","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-09-26T05:06:25Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"6f36e13162ef68d97956f49b23689294f4f70281f9e34ddbb08ddb6a4023b3cc","abstract_canon_sha256":"9ec84cad11b0802376973a89f9ce57bb8bc16d2e2018971b8964bbd670ed54fb"},"schema_version":"1.0"},"canonical_sha256":"66dc887592f4b121c628746098c6663af6225570419face1f7fef42442a90a32","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T01:04:51.234818Z","signature_b64":"bHpSPW3+/g3wzRfFqARb8ZDamdkb+AJUgBFVyipE6aJbWmWyYoU0I/95mJJDEG2A59of5CkLPEMTzJaJB5m9CQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"66dc887592f4b121c628746098c6663af6225570419face1f7fef42442a90a32","last_reissued_at":"2026-05-27T01:04:51.234020Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T01:04:51.234020Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2509.21882","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T01:04:51Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"01wzh4v1zVs4XlOgeX03RAHy1AaLGuWdxaQT1/oyLcIRbSUa0AoQBFAW1pVgwiURPhyLJDCRJJHcbJlEU7ygAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T19:39:37.159482Z"},"content_sha256":"ae15c83744c39c1b152b69f1b7ffa07fed1a5fc8212c733dfcd6b030dddae886","schema_version":"1.0","event_id":"sha256:ae15c83744c39c1b152b69f1b7ffa07fed1a5fc8212c733dfcd6b030dddae886"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:M3OIQ5MS6SYSDRRIORQJRRTGHL","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Position: The Hidden Costs and Measurement Gaps of Reinforcement Learning with Verifiable Rewards","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Many reported RLVR gains on math and code tasks shrink or vanish once budgets, prompts, and contamination are controlled.","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Aaron Tu, Amin Saberi, Bing Hu, Fang Wu, Ge Liu, Hanqun Cao, Heli Qi, Huaxiu Yao, Jure Leskovec, Li Erran Li, Nan Liu, Naoto Yokoya, Peng Xia, Qingcheng Zeng, Rui Yang, Shayan Talaei, Weihao Xuan, Wenqi Shi, Xiangru Tang, Xu Huang, Yejin Choi, Yijia Xiao, Yinxi Li, Yuchen Zhuang","submitted_at":"2025-09-26T05:06:25Z","abstract_excerpt":"Reinforcement learning with verifiable rewards (RLVR) is a practical, scalable way to improve large language models on math, code, and other structured tasks. However, we argue that many headline RLVR gains are not yet well validated because reports often conflate policy improvement with three confounds: (i) budget mismatch between RLVR and baseline evaluations, (ii) attempt inflation and calibration drift that convert abstentions into confident answers, and (iii) benchmark data contamination. Using budget-matched reproductions and partial-prompt contamination probes, we find that several wide"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Several widely cited gaps shrink substantially or disappear once budgets, prompts, and dataset versions are matched, and contaminated sets are treated as memorization probes rather than evidence of reasoning.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the budget-matched reproductions and partial-prompt contamination probes are representative of the headline results in the broader RLVR literature and that the three listed confounds are the dominant sources of overstated gains.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"The paper identifies confounds in RLVR evaluations that inflate apparent gains and proposes a minimum standard for budget-matched, contamination-aware assessment with calibration tracking.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Many reported RLVR gains on math and code tasks shrink or vanish once budgets, prompts, and contamination are controlled.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"86e4902016138906607718b94accc86fbd439c3b0c0b47ac38b8abe588b224ba"},"source":{"id":"2509.21882","kind":"arxiv","version":3},"verdict":{"id":"96c87c7e-e286-4c52-8400-4e8c5be4f38d","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-18T14:20:09.771666Z","strongest_claim":"Several widely cited gaps shrink substantially or disappear once budgets, prompts, and dataset versions are matched, and contaminated sets are treated as memorization probes rather than evidence of reasoning.","one_line_summary":"The paper identifies confounds in RLVR evaluations that inflate apparent gains and proposes a minimum standard for budget-matched, contamination-aware assessment with calibration tracking.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the budget-matched reproductions and partial-prompt contamination probes are representative of the headline results in the broader RLVR literature and that the three listed confounds are the dominant sources of overstated gains.","pith_extraction_headline":"Many reported RLVR gains on math and code tasks shrink or vanish once budgets, prompts, and contamination are controlled."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2509.21882/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"96c87c7e-e286-4c52-8400-4e8c5be4f38d"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T01:04:51Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"04HjY73srbkOJ2ie1sGuXmAqnhKWs5NqxmQeR29rWXD87fAznb3H+sx+6HHS6cY4J4/ajgJgULyOuolvIDzVCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T19:39:37.160036Z"},"content_sha256":"aa186340665113abbdf296a277f3bfd0fb2ae417d7df683525433a9d1f998fad","schema_version":"1.0","event_id":"sha256:aa186340665113abbdf296a277f3bfd0fb2ae417d7df683525433a9d1f998fad"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/M3OIQ5MS6SYSDRRIORQJRRTGHL/bundle.json","state_url":"https://pith.science/pith/M3OIQ5MS6SYSDRRIORQJRRTGHL/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/M3OIQ5MS6SYSDRRIORQJRRTGHL/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-09T19:39:37Z","links":{"resolver":"https://pith.science/pith/M3OIQ5MS6SYSDRRIORQJRRTGHL","bundle":"https://pith.science/pith/M3OIQ5MS6SYSDRRIORQJRRTGHL/bundle.json","state":"https://pith.science/pith/M3OIQ5MS6SYSDRRIORQJRRTGHL/state.json","well_known_bundle":"https://pith.science/.well-known/pith/M3OIQ5MS6SYSDRRIORQJRRTGHL/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:M3OIQ5MS6SYSDRRIORQJRRTGHL","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"9ec84cad11b0802376973a89f9ce57bb8bc16d2e2018971b8964bbd670ed54fb","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-09-26T05:06:25Z","title_canon_sha256":"6f36e13162ef68d97956f49b23689294f4f70281f9e34ddbb08ddb6a4023b3cc"},"schema_version":"1.0","source":{"id":"2509.21882","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2509.21882","created_at":"2026-05-27T01:04:51Z"},{"alias_kind":"arxiv_version","alias_value":"2509.21882v3","created_at":"2026-05-27T01:04:51Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.21882","created_at":"2026-05-27T01:04:51Z"},{"alias_kind":"pith_short_12","alias_value":"M3OIQ5MS6SYS","created_at":"2026-05-27T01:04:51Z"},{"alias_kind":"pith_short_16","alias_value":"M3OIQ5MS6SYSDRRI","created_at":"2026-05-27T01:04:51Z"},{"alias_kind":"pith_short_8","alias_value":"M3OIQ5MS","created_at":"2026-05-27T01:04:51Z"}],"graph_snapshots":[{"event_id":"sha256:aa186340665113abbdf296a277f3bfd0fb2ae417d7df683525433a9d1f998fad","target":"graph","created_at":"2026-05-27T01:04:51Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Several widely cited gaps shrink substantially or disappear once budgets, prompts, and dataset versions are matched, and contaminated sets are treated as memorization probes rather than evidence of reasoning."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the budget-matched reproductions and partial-prompt contamination probes are representative of the headline results in the broader RLVR literature and that the three listed confounds are the dominant sources of overstated gains."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"The paper identifies confounds in RLVR evaluations that inflate apparent gains and proposes a minimum standard for budget-matched, contamination-aware assessment with calibration tracking."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Many reported RLVR gains on math and code tasks shrink or vanish once budgets, prompts, and contamination are controlled."}],"snapshot_sha256":"86e4902016138906607718b94accc86fbd439c3b0c0b47ac38b8abe588b224ba"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2509.21882/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Reinforcement learning with verifiable rewards (RLVR) is a practical, scalable way to improve large language models on math, code, and other structured tasks. However, we argue that many headline RLVR gains are not yet well validated because reports often conflate policy improvement with three confounds: (i) budget mismatch between RLVR and baseline evaluations, (ii) attempt inflation and calibration drift that convert abstentions into confident answers, and (iii) benchmark data contamination. Using budget-matched reproductions and partial-prompt contamination probes, we find that several wide","authors_text":"Aaron Tu, Amin Saberi, Bing Hu, Fang Wu, Ge Liu, Hanqun Cao, Heli Qi, Huaxiu Yao, Jure Leskovec, Li Erran Li, Nan Liu, Naoto Yokoya, Peng Xia, Qingcheng Zeng, Rui Yang, Shayan Talaei, Weihao Xuan, Wenqi Shi, Xiangru Tang, Xu Huang, Yejin Choi, Yijia Xiao, Yinxi Li, Yuchen Zhuang","cross_cats":["cs.AI"],"headline":"Many reported RLVR gains on math and code tasks shrink or vanish once budgets, prompts, and contamination are controlled.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-09-26T05:06:25Z","title":"Position: The Hidden Costs and Measurement Gaps of Reinforcement Learning with Verifiable Rewards"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2509.21882","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-18T14:20:09.771666Z","id":"96c87c7e-e286-4c52-8400-4e8c5be4f38d","model_set":{"reader":"grok-4.3"},"one_line_summary":"The paper identifies confounds in RLVR evaluations that inflate apparent gains and proposes a minimum standard for budget-matched, contamination-aware assessment with calibration tracking.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Many reported RLVR gains on math and code tasks shrink or vanish once budgets, prompts, and contamination are controlled.","strongest_claim":"Several widely cited gaps shrink substantially or disappear once budgets, prompts, and dataset versions are matched, and contaminated sets are treated as memorization probes rather than evidence of reasoning.","weakest_assumption":"That the budget-matched reproductions and partial-prompt contamination probes are representative of the headline results in the broader RLVR literature and that the three listed confounds are the dominant sources of overstated gains."}},"verdict_id":"96c87c7e-e286-4c52-8400-4e8c5be4f38d"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:ae15c83744c39c1b152b69f1b7ffa07fed1a5fc8212c733dfcd6b030dddae886","target":"record","created_at":"2026-05-27T01:04:51Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"9ec84cad11b0802376973a89f9ce57bb8bc16d2e2018971b8964bbd670ed54fb","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-09-26T05:06:25Z","title_canon_sha256":"6f36e13162ef68d97956f49b23689294f4f70281f9e34ddbb08ddb6a4023b3cc"},"schema_version":"1.0","source":{"id":"2509.21882","kind":"arxiv","version":3}},"canonical_sha256":"66dc887592f4b121c628746098c6663af6225570419face1f7fef42442a90a32","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"66dc887592f4b121c628746098c6663af6225570419face1f7fef42442a90a32","first_computed_at":"2026-05-27T01:04:51.234020Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-27T01:04:51.234020Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"bHpSPW3+/g3wzRfFqARb8ZDamdkb+AJUgBFVyipE6aJbWmWyYoU0I/95mJJDEG2A59of5CkLPEMTzJaJB5m9CQ==","signature_status":"signed_v1","signed_at":"2026-05-27T01:04:51.234818Z","signed_message":"canonical_sha256_bytes"},"source_id":"2509.21882","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:ae15c83744c39c1b152b69f1b7ffa07fed1a5fc8212c733dfcd6b030dddae886","sha256:aa186340665113abbdf296a277f3bfd0fb2ae417d7df683525433a9d1f998fad"],"state_sha256":"b9bb80d290123ec2acedc24ada2799cff5d85932e110c889c8a202ae4bee53e8"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"CPw9a2hmW+n0Eypm8tMufGEHySchejXog2lP+3Bbea3GCLnxJ6RANWwsgZCykV8twn/6HVFHo7m4Uq7lMNjLAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-09T19:39:37.163557Z","bundle_sha256":"9db47485b327c3c561f7b9a9103ff9e3e74aeefe8f20da9607bdbcfbd10b8cde"}}