{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:ASOSH5O4COI5T7JTL2PJNQ2KKB","short_pith_number":"pith:ASOSH5O4","schema_version":"1.0","canonical_sha256":"049d23f5dc1391d9fd335e9e96c34a504cfaad669efd46fe7758cd1e2b42f82a","source":{"kind":"arxiv","id":"2512.15134","version":2},"attestation_state":"computed","paper":{"title":"From Isolation to Entanglement: When Do Interpretability Methods Identify and Disentangle Known Concepts?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Aaron Mueller, Andrew Lee, Dhanya Sridhar, Ekdeep Singh Lubana, Patrik Reizinger, Shruti Joshi","submitted_at":"2025-12-17T06:54:08Z","abstract_excerpt":"A goal of interpretability is to recover disentangled representations of latent concepts (features) from the activations of neural networks. The quality of features is typically evaluated in isolation, and under implicit independence assumptions that may not hold in practice. Thus, it is unclear to what extent common featurization methods such as sparse autoencoders (SAEs) and probes disentangle one concept from another. We propose a multi-concept evaluation setting using concepts including sentiment, domain, voice, and tense. We evaluate how well featurizers produce disentangled representatio"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2512.15134","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-12-17T06:54:08Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"6b0194b57b8c4a0e03864a56d9870660e9692b5660b0e3782987e25f2c67bf93","abstract_canon_sha256":"7c7bcf313bc17798175b3bb03ff5cdd9e95e67fa570a0c53c6ee81c6d128a23c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-12T01:08:19.671523Z","signature_b64":"v+6kA0+mJ/U70bGA7kL/VI1OvuxMEuSUtzOwNBoHJEVVRFbU3Y2uwuE2PwOb8vETLkAOzBZjMGbykHOX8UpYDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"049d23f5dc1391d9fd335e9e96c34a504cfaad669efd46fe7758cd1e2b42f82a","last_reissued_at":"2026-06-12T01:08:19.670409Z","signature_status":"signed_v1","first_computed_at":"2026-06-12T01:08:19.670409Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"From Isolation to Entanglement: When Do Interpretability Methods Identify and Disentangle Known Concepts?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Aaron Mueller, Andrew Lee, Dhanya Sridhar, Ekdeep Singh Lubana, Patrik Reizinger, Shruti Joshi","submitted_at":"2025-12-17T06:54:08Z","abstract_excerpt":"A goal of interpretability is to recover disentangled representations of latent concepts (features) from the activations of neural networks. The quality of features is typically evaluated in isolation, and under implicit independence assumptions that may not hold in practice. Thus, it is unclear to what extent common featurization methods such as sparse autoencoders (SAEs) and probes disentangle one concept from another. We propose a multi-concept evaluation setting using concepts including sentiment, domain, voice, and tense. We evaluate how well featurizers produce disentangled representatio"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2512.15134","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2512.15134/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2512.15134","created_at":"2026-06-12T01:08:19.670573+00:00"},{"alias_kind":"arxiv_version","alias_value":"2512.15134v2","created_at":"2026-06-12T01:08:19.670573+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.15134","created_at":"2026-06-12T01:08:19.670573+00:00"},{"alias_kind":"pith_short_12","alias_value":"ASOSH5O4COI5","created_at":"2026-06-12T01:08:19.670573+00:00"},{"alias_kind":"pith_short_16","alias_value":"ASOSH5O4COI5T7JT","created_at":"2026-06-12T01:08:19.670573+00:00"},{"alias_kind":"pith_short_8","alias_value":"ASOSH5O4","created_at":"2026-06-12T01:08:19.670573+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"2605.22531","citing_title":"Disentanglement Beyond Generative Models with Riemannian ICA","ref_index":62,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09967","citing_title":"Tensor Product Representation Probes Reveal Shared Structure Across Linear Directions","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05030","citing_title":"Phase-Associative Memory: Sequence Modeling in Complex Hilbert Space","ref_index":57,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ASOSH5O4COI5T7JTL2PJNQ2KKB","json":"https://pith.science/pith/ASOSH5O4COI5T7JTL2PJNQ2KKB.json","graph_json":"https://pith.science/api/pith-number/ASOSH5O4COI5T7JTL2PJNQ2KKB/graph.json","events_json":"https://pith.science/api/pith-number/ASOSH5O4COI5T7JTL2PJNQ2KKB/events.json","paper":"https://pith.science/paper/ASOSH5O4"},"agent_actions":{"view_html":"https://pith.science/pith/ASOSH5O4COI5T7JTL2PJNQ2KKB","download_json":"https://pith.science/pith/ASOSH5O4COI5T7JTL2PJNQ2KKB.json","view_paper":"https://pith.science/paper/ASOSH5O4","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2512.15134&json=true","fetch_graph":"https://pith.science/api/pith-number/ASOSH5O4COI5T7JTL2PJNQ2KKB/graph.json","fetch_events":"https://pith.science/api/pith-number/ASOSH5O4COI5T7JTL2PJNQ2KKB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ASOSH5O4COI5T7JTL2PJNQ2KKB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ASOSH5O4COI5T7JTL2PJNQ2KKB/action/storage_attestation","attest_author":"https://pith.science/pith/ASOSH5O4COI5T7JTL2PJNQ2KKB/action/author_attestation","sign_citation":"https://pith.science/pith/ASOSH5O4COI5T7JTL2PJNQ2KKB/action/citation_signature","submit_replication":"https://pith.science/pith/ASOSH5O4COI5T7JTL2PJNQ2KKB/action/replication_record"}},"created_at":"2026-06-12T01:08:19.670573+00:00","updated_at":"2026-06-12T01:08:19.670573+00:00"}