{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:E47PO7DKKC6L4TXC722UOYQWYD","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"38a4aa43ee0c482cd8359e4cb2f19cc5baf3af1556bd16839fa03b30f8790762","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-11T11:19:03Z","title_canon_sha256":"00793d84fef45565cf4a29d94d7fc2f91518d84b5e545caf5ddacdc29806f9d4"},"schema_version":"1.0","source":{"id":"2606.13209","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.13209","created_at":"2026-06-12T01:09:46Z"},{"alias_kind":"arxiv_version","alias_value":"2606.13209v1","created_at":"2026-06-12T01:09:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.13209","created_at":"2026-06-12T01:09:46Z"},{"alias_kind":"pith_short_12","alias_value":"E47PO7DKKC6L","created_at":"2026-06-12T01:09:46Z"},{"alias_kind":"pith_short_16","alias_value":"E47PO7DKKC6L4TXC","created_at":"2026-06-12T01:09:46Z"},{"alias_kind":"pith_short_8","alias_value":"E47PO7DK","created_at":"2026-06-12T01:09:46Z"}],"graph_snapshots":[{"event_id":"sha256:d0723cafa46275ef2d41b85b5a2f8b5d34bca24efd29fb7c1927cf5703f661aa","target":"graph","created_at":"2026-06-12T01:09:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.13209/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Reward models are a key component of reinforcement learning from human feedback (RLHF), aligning language models toward both helpful and harmless behaviour. However, the internal mechanisms underlying these objectives and their conflicts remain poorly understood. We study alignment tension in reward models trained under helpfulness-only, harmlessness-only, and mixed-objective settings. We find that mixed-objective models often underperform single-objective models, indicating interference between objectives. Using activation-based methods, we identify neurons associated with each objective and ","authors_text":"Eshaan Tanwar, Pepa Atanasova","cross_cats":["cs.CL"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-11T11:19:03Z","title":"Understanding helpfulness and harmless tension in reward models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.13209","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:ff4cced3256a2dac26464d46666393216527defbcc366674f07222110bb79c54","target":"record","created_at":"2026-06-12T01:09:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"38a4aa43ee0c482cd8359e4cb2f19cc5baf3af1556bd16839fa03b30f8790762","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-11T11:19:03Z","title_canon_sha256":"00793d84fef45565cf4a29d94d7fc2f91518d84b5e545caf5ddacdc29806f9d4"},"schema_version":"1.0","source":{"id":"2606.13209","kind":"arxiv","version":1}},"canonical_sha256":"273ef77c6a50bcbe4ee2feb5476216c0d1c259843e4f140c3c02f580a299085d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"273ef77c6a50bcbe4ee2feb5476216c0d1c259843e4f140c3c02f580a299085d","first_computed_at":"2026-06-12T01:09:46.870180Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-12T01:09:46.870180Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"roKX12qLxe54jtp8XMQQCA/bISs8xeEYSB+WTnFCpq1AuQV1xSXgcVEP/fUDypll6PEBji56ZNGzFjLsg8mUDQ==","signature_status":"signed_v1","signed_at":"2026-06-12T01:09:46.870816Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.13209","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:ff4cced3256a2dac26464d46666393216527defbcc366674f07222110bb79c54","sha256:d0723cafa46275ef2d41b85b5a2f8b5d34bca24efd29fb7c1927cf5703f661aa"],"state_sha256":"8a937c8a6d8b3f6ee31de4a82906d35a5ce6d2acf89d6b294c138859269aa04e"}