{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:WHGWTFRFDI2AWRZHCUSOQEBAEM","short_pith_number":"pith:WHGWTFRF","schema_version":"1.0","canonical_sha256":"b1cd6996251a340b47271524e81020232c81bdcf72ad7614ef1c7ed175f774e5","source":{"kind":"arxiv","id":"2606.09735","version":1},"attestation_state":"computed","paper":{"title":"The Neutral Mask: How RLHF Provides Shallow Alignment while Leaving Partisan Structure Intact in a Large Language Model","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Wendy K. Tam","submitted_at":"2026-06-08T17:00:31Z","abstract_excerpt":"The ambition behind alignment training is to make large language models safe and useful. The primary mechanism, reinforcement learning from human feedback (RLHF), shapes the behavior of deployed language models by aligning them with ``human values.'' Yet the process is opaque. What values are being encoded; whose values are they; and how does RLHF encode them? A growing body of evidence suggests that RLHF produces only functional compliance rather than deep alignment. We offer a mechanistic case study of this phenomenon for partisan political orientation with a comparison of the internal repre"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.09735","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-08T17:00:31Z","cross_cats_sorted":[],"title_canon_sha256":"42037c903cc2b9f740580ad649823461ea95fc2df2a8daecd187c0441da21fda","abstract_canon_sha256":"7c20e29c68f14fbd5069087ee2cc8b26dd52969ea33d109632a8327d10394bf9"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T02:09:06.767842Z","signature_b64":"cyOON/egGydoyLejt6ujwZPIhjispkjmis5DSe/E+qXzyZtd8nBkmImExlZgoEc77T3WA/5bSgH8xcEz2aYPBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b1cd6996251a340b47271524e81020232c81bdcf72ad7614ef1c7ed175f774e5","last_reissued_at":"2026-06-09T02:09:06.766904Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T02:09:06.766904Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"The Neutral Mask: How RLHF Provides Shallow Alignment while Leaving Partisan Structure Intact in a Large Language Model","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Wendy K. Tam","submitted_at":"2026-06-08T17:00:31Z","abstract_excerpt":"The ambition behind alignment training is to make large language models safe and useful. The primary mechanism, reinforcement learning from human feedback (RLHF), shapes the behavior of deployed language models by aligning them with ``human values.'' Yet the process is opaque. What values are being encoded; whose values are they; and how does RLHF encode them? A growing body of evidence suggests that RLHF produces only functional compliance rather than deep alignment. We offer a mechanistic case study of this phenomenon for partisan political orientation with a comparison of the internal repre"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.09735","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.09735/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.09735","created_at":"2026-06-09T02:09:06.767109+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.09735v1","created_at":"2026-06-09T02:09:06.767109+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.09735","created_at":"2026-06-09T02:09:06.767109+00:00"},{"alias_kind":"pith_short_12","alias_value":"WHGWTFRFDI2A","created_at":"2026-06-09T02:09:06.767109+00:00"},{"alias_kind":"pith_short_16","alias_value":"WHGWTFRFDI2AWRZH","created_at":"2026-06-09T02:09:06.767109+00:00"},{"alias_kind":"pith_short_8","alias_value":"WHGWTFRF","created_at":"2026-06-09T02:09:06.767109+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WHGWTFRFDI2AWRZHCUSOQEBAEM","json":"https://pith.science/pith/WHGWTFRFDI2AWRZHCUSOQEBAEM.json","graph_json":"https://pith.science/api/pith-number/WHGWTFRFDI2AWRZHCUSOQEBAEM/graph.json","events_json":"https://pith.science/api/pith-number/WHGWTFRFDI2AWRZHCUSOQEBAEM/events.json","paper":"https://pith.science/paper/WHGWTFRF"},"agent_actions":{"view_html":"https://pith.science/pith/WHGWTFRFDI2AWRZHCUSOQEBAEM","download_json":"https://pith.science/pith/WHGWTFRFDI2AWRZHCUSOQEBAEM.json","view_paper":"https://pith.science/paper/WHGWTFRF","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.09735&json=true","fetch_graph":"https://pith.science/api/pith-number/WHGWTFRFDI2AWRZHCUSOQEBAEM/graph.json","fetch_events":"https://pith.science/api/pith-number/WHGWTFRFDI2AWRZHCUSOQEBAEM/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WHGWTFRFDI2AWRZHCUSOQEBAEM/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WHGWTFRFDI2AWRZHCUSOQEBAEM/action/storage_attestation","attest_author":"https://pith.science/pith/WHGWTFRFDI2AWRZHCUSOQEBAEM/action/author_attestation","sign_citation":"https://pith.science/pith/WHGWTFRFDI2AWRZHCUSOQEBAEM/action/citation_signature","submit_replication":"https://pith.science/pith/WHGWTFRFDI2AWRZHCUSOQEBAEM/action/replication_record"}},"created_at":"2026-06-09T02:09:06.767109+00:00","updated_at":"2026-06-09T02:09:06.767109+00:00"}