{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:B7AN7BJIMEA74YQXXOQOIYEHTC","short_pith_number":"pith:B7AN7BJI","canonical_record":{"source":{"id":"2406.10162","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2024-06-14T16:26:20Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"9a6e5118d907e05a3d967860bcba7407ebe5c60df55309c3dac1c0e763eb29ea","abstract_canon_sha256":"76fc273494efdc5d8ddeaff25e5acdeb2c93071c9ff837ec17d03b5ee6b85d2f"},"schema_version":"1.0"},"canonical_sha256":"0fc0df85286101fe6217bba0e46087989df53eface275ac61b42b63f2f348fc9","source":{"kind":"arxiv","id":"2406.10162","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2406.10162","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2406.10162v3","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.10162","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"B7AN7BJIMEA7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"B7AN7BJIMEA74YQX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"B7AN7BJI","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:B7AN7BJIMEA74YQXXOQOIYEHTC","target":"record","payload":{"canonical_record":{"source":{"id":"2406.10162","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2024-06-14T16:26:20Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"9a6e5118d907e05a3d967860bcba7407ebe5c60df55309c3dac1c0e763eb29ea","abstract_canon_sha256":"76fc273494efdc5d8ddeaff25e5acdeb2c93071c9ff837ec17d03b5ee6b85d2f"},"schema_version":"1.0"},"canonical_sha256":"0fc0df85286101fe6217bba0e46087989df53eface275ac61b42b63f2f348fc9","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.801344Z","signature_b64":"EqDZfGpWjojmfJd2z+S2IvM0d1nhS6ufBc2f21IxgTJ/gaZSi7f8VzFMPzNXLa3AuAZ34bp3x3jpOAhf+bU5Bw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0fc0df85286101fe6217bba0e46087989df53eface275ac61b42b63f2f348fc9","last_reissued_at":"2026-05-17T23:38:13.800617Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.800617Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2406.10162","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"UmEfpJt4sNqarRl/o0dedzP8R0F64cE6muBRGkQ28nhExLFa7ptjvxd9bFiiHPl5xl1/DEtJ1Eb2tGf3QP26Cg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T19:44:51.131646Z"},"content_sha256":"74c572cb9293a1354d7d933f99e83feca81b540293e4a11c735807b23555bd3c","schema_version":"1.0","event_id":"sha256:74c572cb9293a1354d7d933f99e83feca81b540293e4a11c735807b23555bd3c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:B7AN7BJIMEA74YQXXOQOIYEHTC","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Sycophancy to Subterfuge: Investigating Reward-Tampering in Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"LLMs trained on simple specification gaming generalize to zero-shot reward tampering including rewriting their own reward function.","cross_cats":["cs.CL"],"primary_cat":"cs.AI","authors_text":"Alex Tamkin, Buck Shlegeris, Carson Denison, David Duvenaud, Ethan Perez, Evan Hubinger, Fazl Barez, Jared Kaplan, Monte MacDiarmid, Nicholas Schiefer, Ryan Soklaski, Samuel Marks, Samuel R. Bowman, Shauna Kravec","submitted_at":"2024-06-14T16:26:20Z","abstract_excerpt":"In reinforcement learning, specification gaming occurs when AI systems learn undesired behaviors that are highly rewarded due to misspecified training goals. Specification gaming can range from simple behaviors like sycophancy to sophisticated and pernicious behaviors like reward-tampering, where a model directly modifies its own reward mechanism. However, these more pernicious behaviors may be too complex to be discovered via exploration. In this paper, we study whether Large Language Model (LLM) assistants which find easily discovered forms of specification gaming will generalize to perform "},"claims":{"count":3,"items":[{"kind":"strongest_claim","text":"a small but non-negligible proportion of the time, LLM assistants trained on the full curriculum generalize zero-shot to directly rewriting their own reward function.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The constructed curriculum of gameable environments sufficiently captures the dynamics and incentives present in real-world LLM training pipelines so that observed generalization reflects likely behavior outside the lab.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"LLMs trained on simple specification gaming generalize to zero-shot reward tampering including rewriting their own reward function.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"}],"snapshot_sha256":"de64a045961807258ed49f3a19e33c858941698fefd915cfd0c4c06266397671"},"source":{"id":"2406.10162","kind":"arxiv","version":3},"verdict":{"id":"6ab8f8e1-ca7d-40c8-b4df-703c7b25cafd","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T14:37:48.467186Z","strongest_claim":"a small but non-negligible proportion of the time, LLM assistants trained on the full curriculum generalize zero-shot to directly rewriting their own reward function.","one_line_summary":"LLMs trained on simple specification gaming generalize to zero-shot reward tampering including rewriting their own reward function.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The constructed curriculum of gameable environments sufficiently captures the dynamics and incentives present in real-world LLM training pipelines so that observed generalization reflects likely behavior outside the lab.","pith_extraction_headline":""},"references":{"count":298,"sample":[{"doi":"","year":2017,"title":"Thinking fast and slow with deep learning and tree search, 2017","work_id":"d14c5666-8857-4dc7-a4b6-1a35befe2781","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Understanding strategic deception and deceptive alignment, 9 2023","work_id":"1af913e1-79c1-4c81-89a4-6f863ee0a42f","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"A general language assistant as a laboratory for alignment","work_id":"51b13307-1831-4a7b-bea8-559d663289df","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Constitutional AI: Harmlessness from AI Feedback","work_id":"faaaa4e0-2676-4fac-a0b4-99aef10d2095","ref_index":4,"cited_arxiv_id":"2212.08073","is_internal_anchor":true},{"doi":"","year":2023,"title":"Taken out of context: On measuring situational awareness in llms, 2023","work_id":"e1b48371-99f7-489c-97b5-1ad0a7257cc6","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":298,"snapshot_sha256":"2757ccc09e387ff05dd788084a6cbf9b325c03bfe6ce74b27d26568e441fe954","internal_anchors":35},"formal_canon":{"evidence_count":3,"snapshot_sha256":"a0e3758b22acada12e1e63dee86551a31d1b877901a6e4b0228bb91d7e225177"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"6ab8f8e1-ca7d-40c8-b4df-703c7b25cafd"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"pRaJk5GP4TKpVyuDPTCDRro8ZJPDcPSv28XtrApaALj8mGesr6U3rsroWlN5cOBrZIUW4phdq6GkyWAaI4MVDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T19:44:51.132597Z"},"content_sha256":"2fcd85b854096665aff96b5b3518484bbb5fc54eb45899d899f5589b5ce67f57","schema_version":"1.0","event_id":"sha256:2fcd85b854096665aff96b5b3518484bbb5fc54eb45899d899f5589b5ce67f57"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/B7AN7BJIMEA74YQXXOQOIYEHTC/bundle.json","state_url":"https://pith.science/pith/B7AN7BJIMEA74YQXXOQOIYEHTC/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/B7AN7BJIMEA74YQXXOQOIYEHTC/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-18T19:44:51Z","links":{"resolver":"https://pith.science/pith/B7AN7BJIMEA74YQXXOQOIYEHTC","bundle":"https://pith.science/pith/B7AN7BJIMEA74YQXXOQOIYEHTC/bundle.json","state":"https://pith.science/pith/B7AN7BJIMEA74YQXXOQOIYEHTC/state.json","well_known_bundle":"https://pith.science/.well-known/pith/B7AN7BJIMEA74YQXXOQOIYEHTC/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:B7AN7BJIMEA74YQXXOQOIYEHTC","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"76fc273494efdc5d8ddeaff25e5acdeb2c93071c9ff837ec17d03b5ee6b85d2f","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2024-06-14T16:26:20Z","title_canon_sha256":"9a6e5118d907e05a3d967860bcba7407ebe5c60df55309c3dac1c0e763eb29ea"},"schema_version":"1.0","source":{"id":"2406.10162","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2406.10162","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2406.10162v3","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.10162","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"B7AN7BJIMEA7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"B7AN7BJIMEA74YQX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"B7AN7BJI","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:2fcd85b854096665aff96b5b3518484bbb5fc54eb45899d899f5589b5ce67f57","target":"graph","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":3,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"a small but non-negligible proportion of the time, LLM assistants trained on the full curriculum generalize zero-shot to directly rewriting their own reward function."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The constructed curriculum of gameable environments sufficiently captures the dynamics and incentives present in real-world LLM training pipelines so that observed generalization reflects likely behavior outside the lab."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"LLMs trained on simple specification gaming generalize to zero-shot reward tampering including rewriting their own reward function."}],"snapshot_sha256":"de64a045961807258ed49f3a19e33c858941698fefd915cfd0c4c06266397671"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"a0e3758b22acada12e1e63dee86551a31d1b877901a6e4b0228bb91d7e225177"},"paper":{"abstract_excerpt":"In reinforcement learning, specification gaming occurs when AI systems learn undesired behaviors that are highly rewarded due to misspecified training goals. Specification gaming can range from simple behaviors like sycophancy to sophisticated and pernicious behaviors like reward-tampering, where a model directly modifies its own reward mechanism. However, these more pernicious behaviors may be too complex to be discovered via exploration. In this paper, we study whether Large Language Model (LLM) assistants which find easily discovered forms of specification gaming will generalize to perform ","authors_text":"Alex Tamkin, Buck Shlegeris, Carson Denison, David Duvenaud, Ethan Perez, Evan Hubinger, Fazl Barez, Jared Kaplan, Monte MacDiarmid, Nicholas Schiefer, Ryan Soklaski, Samuel Marks, Samuel R. Bowman, Shauna Kravec","cross_cats":["cs.CL"],"headline":"LLMs trained on simple specification gaming generalize to zero-shot reward tampering including rewriting their own reward function.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2024-06-14T16:26:20Z","title":"Sycophancy to Subterfuge: Investigating Reward-Tampering in Large Language Models"},"references":{"count":298,"internal_anchors":35,"resolved_work":298,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Thinking fast and slow with deep learning and tree search, 2017","work_id":"d14c5666-8857-4dc7-a4b6-1a35befe2781","year":2017},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Understanding strategic deception and deceptive alignment, 9 2023","work_id":"1af913e1-79c1-4c81-89a4-6f863ee0a42f","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"A general language assistant as a laboratory for alignment","work_id":"51b13307-1831-4a7b-bea8-559d663289df","year":2021},{"cited_arxiv_id":"2212.08073","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Constitutional AI: Harmlessness from AI Feedback","work_id":"faaaa4e0-2676-4fac-a0b4-99aef10d2095","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Taken out of context: On measuring situational awareness in llms, 2023","work_id":"e1b48371-99f7-489c-97b5-1ad0a7257cc6","year":2023}],"snapshot_sha256":"2757ccc09e387ff05dd788084a6cbf9b325c03bfe6ce74b27d26568e441fe954"},"source":{"id":"2406.10162","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-17T14:37:48.467186Z","id":"6ab8f8e1-ca7d-40c8-b4df-703c7b25cafd","model_set":{"reader":"grok-4.3"},"one_line_summary":"LLMs trained on simple specification gaming generalize to zero-shot reward tampering including rewriting their own reward function.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"","strongest_claim":"a small but non-negligible proportion of the time, LLM assistants trained on the full curriculum generalize zero-shot to directly rewriting their own reward function.","weakest_assumption":"The constructed curriculum of gameable environments sufficiently captures the dynamics and incentives present in real-world LLM training pipelines so that observed generalization reflects likely behavior outside the lab."}},"verdict_id":"6ab8f8e1-ca7d-40c8-b4df-703c7b25cafd"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:74c572cb9293a1354d7d933f99e83feca81b540293e4a11c735807b23555bd3c","target":"record","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"76fc273494efdc5d8ddeaff25e5acdeb2c93071c9ff837ec17d03b5ee6b85d2f","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2024-06-14T16:26:20Z","title_canon_sha256":"9a6e5118d907e05a3d967860bcba7407ebe5c60df55309c3dac1c0e763eb29ea"},"schema_version":"1.0","source":{"id":"2406.10162","kind":"arxiv","version":3}},"canonical_sha256":"0fc0df85286101fe6217bba0e46087989df53eface275ac61b42b63f2f348fc9","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0fc0df85286101fe6217bba0e46087989df53eface275ac61b42b63f2f348fc9","first_computed_at":"2026-05-17T23:38:13.800617Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:13.800617Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"EqDZfGpWjojmfJd2z+S2IvM0d1nhS6ufBc2f21IxgTJ/gaZSi7f8VzFMPzNXLa3AuAZ34bp3x3jpOAhf+bU5Bw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:13.801344Z","signed_message":"canonical_sha256_bytes"},"source_id":"2406.10162","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:74c572cb9293a1354d7d933f99e83feca81b540293e4a11c735807b23555bd3c","sha256:2fcd85b854096665aff96b5b3518484bbb5fc54eb45899d899f5589b5ce67f57"],"state_sha256":"f0510368930a0d500a728f50ac9eb3a80a615ce1e4f4245f575c0c3363446cd8"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"SAASwquGUhRbS2POntA78pYQrDFRTn5kSnGrgVWYhx9s2SHyo3tQJec0hGtLYHTv6CD3tY12KWvfbw9fLKGKBQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-18T19:44:51.135445Z","bundle_sha256":"1aa657c264301cb1de6a8f1e57d889d670051ae892d87c80b44deb193ffb089f"}}