{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:NEVAX5SDZXXLEG7I4U25MUOMTM","short_pith_number":"pith:NEVAX5SD","schema_version":"1.0","canonical_sha256":"692a0bf643cdeeb21be8e535d651cc9b0eecb559fef478e74b5f9fc3bdbc95d9","source":{"kind":"arxiv","id":"2606.21197","version":1},"attestation_state":"computed","paper":{"title":"Extraction and Analysis of Multimodal Concepts in Vision Language Models through Sparse Autoencoders","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CV","authors_text":"Jae Hee Lee, Sergio Lanza, Stefan Wermter","submitted_at":"2026-06-19T08:08:43Z","abstract_excerpt":"Vision Language Models (VLMs) have demonstrated impressive performance in tasks requiring joint understanding of images and text, such as image captioning and Visual Question Answering (VQA), but our understanding of their internal processes remains limited. Recently, Sparse Autoencoders (SAEs) have emerged as a promising tool to support the interpretation of concepts encoded in VLMs. However, most SAE-based approaches focus only on textual or visual concepts separately, ignoring multimodal concepts.\n  This limitation hinders a comprehensive understanding of VLMs, since concepts that integrate"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.21197","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-06-19T08:08:43Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"d5ba30a2d712f54a17f5306dc9bb420524bde0088a8c795c1a6ff3101876c178","abstract_canon_sha256":"4b4e7ca17a0313d3fb9b68d0deb7d981d3026f5a98be20d35970e702bc528404"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T01:12:33.122216Z","signature_b64":"iVnkBVjuTTup6PiyxG1esKBQ5GCLeqdtrh7PAAfaU1O6YyR2kakvoChER1J9fLDI+sRjR16Mg3wDw/D+UbxOBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"692a0bf643cdeeb21be8e535d651cc9b0eecb559fef478e74b5f9fc3bdbc95d9","last_reissued_at":"2026-06-23T01:12:33.121738Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T01:12:33.121738Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Extraction and Analysis of Multimodal Concepts in Vision Language Models through Sparse Autoencoders","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CV","authors_text":"Jae Hee Lee, Sergio Lanza, Stefan Wermter","submitted_at":"2026-06-19T08:08:43Z","abstract_excerpt":"Vision Language Models (VLMs) have demonstrated impressive performance in tasks requiring joint understanding of images and text, such as image captioning and Visual Question Answering (VQA), but our understanding of their internal processes remains limited. Recently, Sparse Autoencoders (SAEs) have emerged as a promising tool to support the interpretation of concepts encoded in VLMs. However, most SAE-based approaches focus only on textual or visual concepts separately, ignoring multimodal concepts.\n  This limitation hinders a comprehensive understanding of VLMs, since concepts that integrate"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.21197","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.21197/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.21197","created_at":"2026-06-23T01:12:33.121802+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.21197v1","created_at":"2026-06-23T01:12:33.121802+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.21197","created_at":"2026-06-23T01:12:33.121802+00:00"},{"alias_kind":"pith_short_12","alias_value":"NEVAX5SDZXXL","created_at":"2026-06-23T01:12:33.121802+00:00"},{"alias_kind":"pith_short_16","alias_value":"NEVAX5SDZXXLEG7I","created_at":"2026-06-23T01:12:33.121802+00:00"},{"alias_kind":"pith_short_8","alias_value":"NEVAX5SD","created_at":"2026-06-23T01:12:33.121802+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/NEVAX5SDZXXLEG7I4U25MUOMTM","json":"https://pith.science/pith/NEVAX5SDZXXLEG7I4U25MUOMTM.json","graph_json":"https://pith.science/api/pith-number/NEVAX5SDZXXLEG7I4U25MUOMTM/graph.json","events_json":"https://pith.science/api/pith-number/NEVAX5SDZXXLEG7I4U25MUOMTM/events.json","paper":"https://pith.science/paper/NEVAX5SD"},"agent_actions":{"view_html":"https://pith.science/pith/NEVAX5SDZXXLEG7I4U25MUOMTM","download_json":"https://pith.science/pith/NEVAX5SDZXXLEG7I4U25MUOMTM.json","view_paper":"https://pith.science/paper/NEVAX5SD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.21197&json=true","fetch_graph":"https://pith.science/api/pith-number/NEVAX5SDZXXLEG7I4U25MUOMTM/graph.json","fetch_events":"https://pith.science/api/pith-number/NEVAX5SDZXXLEG7I4U25MUOMTM/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/NEVAX5SDZXXLEG7I4U25MUOMTM/action/timestamp_anchor","attest_storage":"https://pith.science/pith/NEVAX5SDZXXLEG7I4U25MUOMTM/action/storage_attestation","attest_author":"https://pith.science/pith/NEVAX5SDZXXLEG7I4U25MUOMTM/action/author_attestation","sign_citation":"https://pith.science/pith/NEVAX5SDZXXLEG7I4U25MUOMTM/action/citation_signature","submit_replication":"https://pith.science/pith/NEVAX5SDZXXLEG7I4U25MUOMTM/action/replication_record"}},"created_at":"2026-06-23T01:12:33.121802+00:00","updated_at":"2026-06-23T01:12:33.121802+00:00"}