{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:QEC4DBZTVZ5F5NMQ4GJM37YFOX","short_pith_number":"pith:QEC4DBZT","schema_version":"1.0","canonical_sha256":"8105c18733ae7a5eb590e192cdff0575c8d543360f290fe158914b29a95e08ae","source":{"kind":"arxiv","id":"2606.08243","version":1},"attestation_state":"computed","paper":{"title":"Building Comparative Motivation Profiles with Instrumental Interventions","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"David Vella Zarb, Jinghua Ou, Rustem Turtayev, Shi Feng, Taywon Min","submitted_at":"2026-06-06T16:01:52Z","abstract_excerpt":"Safety evaluations often infer latent motivations from behavioral patterns, but the construct validity of these inferences is unclear. We study this problem in alignment faking, where models comply with training objectives more often when they infer training pressure. This behavior is commonly interpreted as strategic self-preservation, but it may also reflect sensitivity to the model's inference about the expectation of researchers conducting the evaluation. We introduce a symmetric intervention framework for distinguishing these competing hypotheses. Instead of directly intervening on \"schem"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.08243","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-06T16:01:52Z","cross_cats_sorted":[],"title_canon_sha256":"5ee9322d6bc95a770281af83a48d1e918b494576db253158b71069387cd4d02c","abstract_canon_sha256":"7814bd1356a0adbbb58f88a0ce4c8dbf35c4477d1dcb2b1bbae7cd37f88eec99"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T01:05:31.222273Z","signature_b64":"FCfxmiGd3kSL+VNBgY2zEvyGV/UtxxKQWVzr6GP0gknM4S4iC94xOMSQvkwkOzrhcTUEkR9RZvrmCQ4T7HLqCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8105c18733ae7a5eb590e192cdff0575c8d543360f290fe158914b29a95e08ae","last_reissued_at":"2026-06-09T01:05:31.221848Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T01:05:31.221848Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Building Comparative Motivation Profiles with Instrumental Interventions","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"David Vella Zarb, Jinghua Ou, Rustem Turtayev, Shi Feng, Taywon Min","submitted_at":"2026-06-06T16:01:52Z","abstract_excerpt":"Safety evaluations often infer latent motivations from behavioral patterns, but the construct validity of these inferences is unclear. We study this problem in alignment faking, where models comply with training objectives more often when they infer training pressure. This behavior is commonly interpreted as strategic self-preservation, but it may also reflect sensitivity to the model's inference about the expectation of researchers conducting the evaluation. We introduce a symmetric intervention framework for distinguishing these competing hypotheses. Instead of directly intervening on \"schem"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.08243","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.08243/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.08243","created_at":"2026-06-09T01:05:31.221915+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.08243v1","created_at":"2026-06-09T01:05:31.221915+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.08243","created_at":"2026-06-09T01:05:31.221915+00:00"},{"alias_kind":"pith_short_12","alias_value":"QEC4DBZTVZ5F","created_at":"2026-06-09T01:05:31.221915+00:00"},{"alias_kind":"pith_short_16","alias_value":"QEC4DBZTVZ5F5NMQ","created_at":"2026-06-09T01:05:31.221915+00:00"},{"alias_kind":"pith_short_8","alias_value":"QEC4DBZT","created_at":"2026-06-09T01:05:31.221915+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/QEC4DBZTVZ5F5NMQ4GJM37YFOX","json":"https://pith.science/pith/QEC4DBZTVZ5F5NMQ4GJM37YFOX.json","graph_json":"https://pith.science/api/pith-number/QEC4DBZTVZ5F5NMQ4GJM37YFOX/graph.json","events_json":"https://pith.science/api/pith-number/QEC4DBZTVZ5F5NMQ4GJM37YFOX/events.json","paper":"https://pith.science/paper/QEC4DBZT"},"agent_actions":{"view_html":"https://pith.science/pith/QEC4DBZTVZ5F5NMQ4GJM37YFOX","download_json":"https://pith.science/pith/QEC4DBZTVZ5F5NMQ4GJM37YFOX.json","view_paper":"https://pith.science/paper/QEC4DBZT","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.08243&json=true","fetch_graph":"https://pith.science/api/pith-number/QEC4DBZTVZ5F5NMQ4GJM37YFOX/graph.json","fetch_events":"https://pith.science/api/pith-number/QEC4DBZTVZ5F5NMQ4GJM37YFOX/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/QEC4DBZTVZ5F5NMQ4GJM37YFOX/action/timestamp_anchor","attest_storage":"https://pith.science/pith/QEC4DBZTVZ5F5NMQ4GJM37YFOX/action/storage_attestation","attest_author":"https://pith.science/pith/QEC4DBZTVZ5F5NMQ4GJM37YFOX/action/author_attestation","sign_citation":"https://pith.science/pith/QEC4DBZTVZ5F5NMQ4GJM37YFOX/action/citation_signature","submit_replication":"https://pith.science/pith/QEC4DBZTVZ5F5NMQ4GJM37YFOX/action/replication_record"}},"created_at":"2026-06-09T01:05:31.221915+00:00","updated_at":"2026-06-09T01:05:31.221915+00:00"}