{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:PGE2VZGHPUN44NAJLGDSDZV7XP","short_pith_number":"pith:PGE2VZGH","schema_version":"1.0","canonical_sha256":"7989aae4c77d1bce3409598721e6bfbbc2d4d2962527ac132d904dfd7bdbe444","source":{"kind":"arxiv","id":"2606.04923","version":1},"attestation_state":"computed","paper":{"title":"Reproducing, Analyzing, and Detecting Reward Hacking in Rubric-Based Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Hao Peng, Juanzi Li, Shuo Hou, Xiaozhi Wang, Xuekang Wang, Zhuoyuan Hao","submitted_at":"2026-06-03T14:18:23Z","abstract_excerpt":"Rubric-based reinforcement learning (RL) uses an LLM-as-a-Judge (LaaJ) to score model outputs according to rubrics as rewards. However, policy models may exploit latent biases in the judge, leading to reward hacking and ineffective or unsafe training outcomes. In real-world rubric-based RL, such hacking behaviors are often subtle and entangled with multiple judge biases, making them difficult to analyze, detect, and mitigate. In this paper, we introduce CHERRL, a controllable hacking environment for rubric-based RL. By injecting known biases into LaaJ, CHERRL enables stable reproduction of rew"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.04923","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-06-03T14:18:23Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"f72845742faa938f63c34813f75bd8006542bb9a7f73436fbe219c19c5b06fee","abstract_canon_sha256":"e66bd9dd42c961e5c4aa19a1599552151069ec823d78618ac1a2fee66e51e705"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-04T01:09:55.867878Z","signature_b64":"pBXBpQ8jQY1G7AM6eCE9CZ53BLeXK6V875zStLcoEijOOoMDAPuUzaSCDgbNzFivw4+/TYlOPuVAgeVgF52fCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7989aae4c77d1bce3409598721e6bfbbc2d4d2962527ac132d904dfd7bdbe444","last_reissued_at":"2026-06-04T01:09:55.867051Z","signature_status":"signed_v1","first_computed_at":"2026-06-04T01:09:55.867051Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Reproducing, Analyzing, and Detecting Reward Hacking in Rubric-Based Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Hao Peng, Juanzi Li, Shuo Hou, Xiaozhi Wang, Xuekang Wang, Zhuoyuan Hao","submitted_at":"2026-06-03T14:18:23Z","abstract_excerpt":"Rubric-based reinforcement learning (RL) uses an LLM-as-a-Judge (LaaJ) to score model outputs according to rubrics as rewards. However, policy models may exploit latent biases in the judge, leading to reward hacking and ineffective or unsafe training outcomes. In real-world rubric-based RL, such hacking behaviors are often subtle and entangled with multiple judge biases, making them difficult to analyze, detect, and mitigate. In this paper, we introduce CHERRL, a controllable hacking environment for rubric-based RL. By injecting known biases into LaaJ, CHERRL enables stable reproduction of rew"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.04923","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.04923/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.04923","created_at":"2026-06-04T01:09:55.867182+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.04923v1","created_at":"2026-06-04T01:09:55.867182+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.04923","created_at":"2026-06-04T01:09:55.867182+00:00"},{"alias_kind":"pith_short_12","alias_value":"PGE2VZGHPUN4","created_at":"2026-06-04T01:09:55.867182+00:00"},{"alias_kind":"pith_short_16","alias_value":"PGE2VZGHPUN44NAJ","created_at":"2026-06-04T01:09:55.867182+00:00"},{"alias_kind":"pith_short_8","alias_value":"PGE2VZGH","created_at":"2026-06-04T01:09:55.867182+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PGE2VZGHPUN44NAJLGDSDZV7XP","json":"https://pith.science/pith/PGE2VZGHPUN44NAJLGDSDZV7XP.json","graph_json":"https://pith.science/api/pith-number/PGE2VZGHPUN44NAJLGDSDZV7XP/graph.json","events_json":"https://pith.science/api/pith-number/PGE2VZGHPUN44NAJLGDSDZV7XP/events.json","paper":"https://pith.science/paper/PGE2VZGH"},"agent_actions":{"view_html":"https://pith.science/pith/PGE2VZGHPUN44NAJLGDSDZV7XP","download_json":"https://pith.science/pith/PGE2VZGHPUN44NAJLGDSDZV7XP.json","view_paper":"https://pith.science/paper/PGE2VZGH","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.04923&json=true","fetch_graph":"https://pith.science/api/pith-number/PGE2VZGHPUN44NAJLGDSDZV7XP/graph.json","fetch_events":"https://pith.science/api/pith-number/PGE2VZGHPUN44NAJLGDSDZV7XP/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PGE2VZGHPUN44NAJLGDSDZV7XP/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PGE2VZGHPUN44NAJLGDSDZV7XP/action/storage_attestation","attest_author":"https://pith.science/pith/PGE2VZGHPUN44NAJLGDSDZV7XP/action/author_attestation","sign_citation":"https://pith.science/pith/PGE2VZGHPUN44NAJLGDSDZV7XP/action/citation_signature","submit_replication":"https://pith.science/pith/PGE2VZGHPUN44NAJLGDSDZV7XP/action/replication_record"}},"created_at":"2026-06-04T01:09:55.867182+00:00","updated_at":"2026-06-04T01:09:55.867182+00:00"}