{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2022:PQQC7F4H7VGFOO4ZGUIJFAOAYL","short_pith_number":"pith:PQQC7F4H","schema_version":"1.0","canonical_sha256":"7c202f9787fd4c573b9935109281c0c2f54a705f54b072d7ce644bb3c15a73c0","source":{"kind":"arxiv","id":"2201.03544","version":2},"attestation_state":"computed","paper":{"title":"The Effects of Reward Misspecification: Mapping and Mitigating Misaligned Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.LG","authors_text":"Alexander Pan, Jacob Steinhardt, Kush Bhatia","submitted_at":"2022-01-10T18:58:52Z","abstract_excerpt":"Reward hacking -- where RL agents exploit gaps in misspecified reward functions -- has been widely observed, but not yet systematically studied. To understand how reward hacking arises, we construct four RL environments with misspecified rewards. We investigate reward hacking as a function of agent capabilities: model capacity, action space resolution, observation space noise, and training time. More capable agents often exploit reward misspecifications, achieving higher proxy reward and lower true reward than less capable agents. Moreover, we find instances of phase transitions: capability th"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2201.03544","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2022-01-10T18:58:52Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"095490091c403c86f8c23378791367d7633e83c21fbfa36d10be2655d5908b8e","abstract_canon_sha256":"6913c36b9ab8b86c32aae9efedbf89d259fbf5449156aa83678a4066709e0c76"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T07:03:14.354677Z","signature_b64":"gtbrrXowFli5ge/xaNPRtC4XDe8rr3G3FxBR3hbMFKEQ5Zvi380boKnbIfp+tTqMBlvk9qonWPm+Ar0nTRHXBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7c202f9787fd4c573b9935109281c0c2f54a705f54b072d7ce644bb3c15a73c0","last_reissued_at":"2026-05-21T07:03:14.352943Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T07:03:14.352943Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"The Effects of Reward Misspecification: Mapping and Mitigating Misaligned Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.LG","authors_text":"Alexander Pan, Jacob Steinhardt, Kush Bhatia","submitted_at":"2022-01-10T18:58:52Z","abstract_excerpt":"Reward hacking -- where RL agents exploit gaps in misspecified reward functions -- has been widely observed, but not yet systematically studied. To understand how reward hacking arises, we construct four RL environments with misspecified rewards. We investigate reward hacking as a function of agent capabilities: model capacity, action space resolution, observation space noise, and training time. More capable agents often exploit reward misspecifications, achieving higher proxy reward and lower true reward than less capable agents. Moreover, we find instances of phase transitions: capability th"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2201.03544","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2201.03544/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2201.03544","created_at":"2026-05-21T07:03:14.353018+00:00"},{"alias_kind":"arxiv_version","alias_value":"2201.03544v2","created_at":"2026-05-21T07:03:14.353018+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2201.03544","created_at":"2026-05-21T07:03:14.353018+00:00"},{"alias_kind":"pith_short_12","alias_value":"PQQC7F4H7VGF","created_at":"2026-05-21T07:03:14.353018+00:00"},{"alias_kind":"pith_short_16","alias_value":"PQQC7F4H7VGFOO4Z","created_at":"2026-05-21T07:03:14.353018+00:00"},{"alias_kind":"pith_short_8","alias_value":"PQQC7F4H","created_at":"2026-05-21T07:03:14.353018+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":23,"internal_anchor_count":23,"sample":[{"citing_arxiv_id":"2605.20202","citing_title":"Under Pressure: Emotional Framing Induces Measurable Behavioral Shifts and Structured Internal Geometry in Small Language Models","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2503.11926","citing_title":"Monitoring Reasoning Models for Misbehavior and the Risks of Promoting Obfuscation","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20744","citing_title":"Hack-Verifiable Environments: Towards Evaluating Reward Hacking at Scale","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21384","citing_title":"SpecBench: Measuring Reward Hacking in Long-Horizon Coding Agents","ref_index":40,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16339","citing_title":"Preference Instability in Reward Models: Detection and Mitigation via Sparse Autoencoders","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16035","citing_title":"Who Owns This Agent? Tracing AI Agents Back to Their Owners","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2501.09732","citing_title":"Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18583","citing_title":"Overeager Coding Agents: Measuring Out-of-Scope Actions on Benign Tasks","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2210.10760","citing_title":"Scaling Laws for Reward Model Overoptimization","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2509.20265","citing_title":"Failure Modes of Maximum Entropy RLHF","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2401.05561","citing_title":"TrustLLM: Trustworthiness in Large Language Models","ref_index":54,"is_internal_anchor":true},{"citing_arxiv_id":"2408.00724","citing_title":"Inference Scaling Laws: An Empirical Analysis of Compute-Optimal Inference for Problem-Solving with Language Models","ref_index":269,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02341","citing_title":"LLM Reasoning with Process Rewards for Outcome-Guided Steps","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2301.05217","citing_title":"Progress measures for grokking via mechanistic interpretability","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12809","citing_title":"Correcting Influence: Unboxing LLM Outputs with Orthogonal Latent Spaces","ref_index":223,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13554","citing_title":"Self-Supervised On-Policy Reinforcement Learning via Contrastive Proximal Policy Optimisation","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26360","citing_title":"Uncertainty-Aware Reward Discounting for Mitigating Reward Hacking","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08378","citing_title":"Reinforcement Learning for Scalable and Trustworthy Intelligent Systems","ref_index":81,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24966","citing_title":"Risk Reporting for Developers' Internal AI Model Use","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23338","citing_title":"A Systematic Survey of Security Threats and Defenses in LLM-Based AI Agents: A Layered Attack Surface Framework","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07754","citing_title":"The Art of (Mis)alignment: How Fine-Tuning Methods Effectively Misalign and Realign LLMs in Post-Training","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06392","citing_title":"Qualixar OS: A Universal Operating System for AI Agent Orchestration","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2207.05221","citing_title":"Language Models (Mostly) Know What They Know","ref_index":46,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PQQC7F4H7VGFOO4ZGUIJFAOAYL","json":"https://pith.science/pith/PQQC7F4H7VGFOO4ZGUIJFAOAYL.json","graph_json":"https://pith.science/api/pith-number/PQQC7F4H7VGFOO4ZGUIJFAOAYL/graph.json","events_json":"https://pith.science/api/pith-number/PQQC7F4H7VGFOO4ZGUIJFAOAYL/events.json","paper":"https://pith.science/paper/PQQC7F4H"},"agent_actions":{"view_html":"https://pith.science/pith/PQQC7F4H7VGFOO4ZGUIJFAOAYL","download_json":"https://pith.science/pith/PQQC7F4H7VGFOO4ZGUIJFAOAYL.json","view_paper":"https://pith.science/paper/PQQC7F4H","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2201.03544&json=true","fetch_graph":"https://pith.science/api/pith-number/PQQC7F4H7VGFOO4ZGUIJFAOAYL/graph.json","fetch_events":"https://pith.science/api/pith-number/PQQC7F4H7VGFOO4ZGUIJFAOAYL/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PQQC7F4H7VGFOO4ZGUIJFAOAYL/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PQQC7F4H7VGFOO4ZGUIJFAOAYL/action/storage_attestation","attest_author":"https://pith.science/pith/PQQC7F4H7VGFOO4ZGUIJFAOAYL/action/author_attestation","sign_citation":"https://pith.science/pith/PQQC7F4H7VGFOO4ZGUIJFAOAYL/action/citation_signature","submit_replication":"https://pith.science/pith/PQQC7F4H7VGFOO4ZGUIJFAOAYL/action/replication_record"}},"created_at":"2026-05-21T07:03:14.353018+00:00","updated_at":"2026-05-21T07:03:14.353018+00:00"}