{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:IPCJLYTXHABXCBJAVSZU3LTGLN","short_pith_number":"pith:IPCJLYTX","schema_version":"1.0","canonical_sha256":"43c495e2773803710520acb34dae665b5d771af3ccfab5b7bf0dbe8f152e98e8","source":{"kind":"arxiv","id":"2506.04018","version":3},"attestation_state":"computed","paper":{"title":"AgentMisalignment: Measuring the Propensity for Misaligned Behaviour in LLM-Based Agents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL","cs.CY","cs.LG"],"primary_cat":"cs.AI","authors_text":"Akshat Naik, Edward James Young, Emma Goun\\'e, Francisco Javier Campos Zabala, Guillermo Bosch, Jason Ross Brown, Patrick Quinn","submitted_at":"2025-06-04T14:46:47Z","abstract_excerpt":"As Large Language Model (LLM) agents become more widespread, associated misalignment risks increase. While prior research has studied agents' ability to produce harmful outputs or follow malicious instructions, it remains unclear how likely agents are to spontaneously pursue unintended goals in realistic deployments. In this work, we approach misalignment as a conflict between the internal goals pursued by the model and the goals intended by its deployer. We introduce a misalignment propensity benchmark, \\textsc{AgentMisalignment}, a benchmark suite designed to evaluate the propensity of LLM a"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2506.04018","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-06-04T14:46:47Z","cross_cats_sorted":["cs.CL","cs.CY","cs.LG"],"title_canon_sha256":"89c507d37bb5e5d3d2a6d1cd9fbde08db83fffc0f3aa5eb22dc416468be6577b","abstract_canon_sha256":"72c1e83319f205daaee13ea82fa256d113eaaff6a0bc23fc47e3504527e4dfb4"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T03:13:45.789443Z","signature_b64":"peqYH4+KB0Gfr3DQQIy+HB4p+AS8ZcBN2V7D6VUq3+cDCuHgHXdGps0jTLsp3ri5cs6kovDjPCShp2ay0gWRCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"43c495e2773803710520acb34dae665b5d771af3ccfab5b7bf0dbe8f152e98e8","last_reissued_at":"2026-06-23T03:13:45.788787Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T03:13:45.788787Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"AgentMisalignment: Measuring the Propensity for Misaligned Behaviour in LLM-Based Agents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL","cs.CY","cs.LG"],"primary_cat":"cs.AI","authors_text":"Akshat Naik, Edward James Young, Emma Goun\\'e, Francisco Javier Campos Zabala, Guillermo Bosch, Jason Ross Brown, Patrick Quinn","submitted_at":"2025-06-04T14:46:47Z","abstract_excerpt":"As Large Language Model (LLM) agents become more widespread, associated misalignment risks increase. While prior research has studied agents' ability to produce harmful outputs or follow malicious instructions, it remains unclear how likely agents are to spontaneously pursue unintended goals in realistic deployments. In this work, we approach misalignment as a conflict between the internal goals pursued by the model and the goals intended by its deployer. We introduce a misalignment propensity benchmark, \\textsc{AgentMisalignment}, a benchmark suite designed to evaluate the propensity of LLM a"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2506.04018","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2506.04018/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2506.04018","created_at":"2026-06-23T03:13:45.788843+00:00"},{"alias_kind":"arxiv_version","alias_value":"2506.04018v3","created_at":"2026-06-23T03:13:45.788843+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2506.04018","created_at":"2026-06-23T03:13:45.788843+00:00"},{"alias_kind":"pith_short_12","alias_value":"IPCJLYTXHABX","created_at":"2026-06-23T03:13:45.788843+00:00"},{"alias_kind":"pith_short_16","alias_value":"IPCJLYTXHABXCBJA","created_at":"2026-06-23T03:13:45.788843+00:00"},{"alias_kind":"pith_short_8","alias_value":"IPCJLYTX","created_at":"2026-06-23T03:13:45.788843+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":4,"sample":[{"citing_arxiv_id":"2605.22643","citing_title":"Boiling the Frog: A Multi-Turn Benchmark for Agentic Safety","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2605.23565","citing_title":"Understanding Goal Generalisation in Sequential Reinforcement Learning","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22643","citing_title":"Boiling the Frog: A Multi-Turn Benchmark for Agentic Safety","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16282","citing_title":"Taxonomy and Consistency Analysis of Safety Benchmarks for AI Agents","ref_index":39,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/IPCJLYTXHABXCBJAVSZU3LTGLN","json":"https://pith.science/pith/IPCJLYTXHABXCBJAVSZU3LTGLN.json","graph_json":"https://pith.science/api/pith-number/IPCJLYTXHABXCBJAVSZU3LTGLN/graph.json","events_json":"https://pith.science/api/pith-number/IPCJLYTXHABXCBJAVSZU3LTGLN/events.json","paper":"https://pith.science/paper/IPCJLYTX"},"agent_actions":{"view_html":"https://pith.science/pith/IPCJLYTXHABXCBJAVSZU3LTGLN","download_json":"https://pith.science/pith/IPCJLYTXHABXCBJAVSZU3LTGLN.json","view_paper":"https://pith.science/paper/IPCJLYTX","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2506.04018&json=true","fetch_graph":"https://pith.science/api/pith-number/IPCJLYTXHABXCBJAVSZU3LTGLN/graph.json","fetch_events":"https://pith.science/api/pith-number/IPCJLYTXHABXCBJAVSZU3LTGLN/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/IPCJLYTXHABXCBJAVSZU3LTGLN/action/timestamp_anchor","attest_storage":"https://pith.science/pith/IPCJLYTXHABXCBJAVSZU3LTGLN/action/storage_attestation","attest_author":"https://pith.science/pith/IPCJLYTXHABXCBJAVSZU3LTGLN/action/author_attestation","sign_citation":"https://pith.science/pith/IPCJLYTXHABXCBJAVSZU3LTGLN/action/citation_signature","submit_replication":"https://pith.science/pith/IPCJLYTXHABXCBJAVSZU3LTGLN/action/replication_record"}},"created_at":"2026-06-23T03:13:45.788843+00:00","updated_at":"2026-06-23T03:13:45.788843+00:00"}