{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:FIGDRADZACIXODEOA27ECC3M5V","short_pith_number":"pith:FIGDRADZ","schema_version":"1.0","canonical_sha256":"2a0c3880790091770c8e06be410b6ced7290f1dd6c347abb0678b3be193f7b78","source":{"kind":"arxiv","id":"2605.19792","version":1},"attestation_state":"computed","paper":{"title":"Mechanisms of Object Localization in Vision-Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Gemma Roig, Martina G. Vilas, Timothy Schauml\\\"offel","submitted_at":"2026-05-19T12:56:32Z","abstract_excerpt":"Visually-grounded language models (VLMs) are highly effective in linking visual and textual information, yet they often struggle with basic classification and localization tasks. While classification mechanisms have been studied more extensively, the processes that support object localization remain poorly understood. In this work, we investigate two representative families, LLaVA-1.5 and InternVL-3.5, using a suite of mechanistic interpretability tools, including token ablations, attention knockout, and causal mediation analysis. We find that localization is driven by a containerization mecha"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.19792","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-19T12:56:32Z","cross_cats_sorted":[],"title_canon_sha256":"2a5f9862eb5496445685e38a03b2c9caa9f1c14f61d43859d0e2032ddcc8665e","abstract_canon_sha256":"02ad13f9187d0f702711251542f2349c5c92a340048ff9e15f3627e179e08ad2"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T01:06:14.263873Z","signature_b64":"LBXwGbglTWjX3oGrXmwG661W91t33gWHtXvmwEsoNyHtuxxiaRO8uYWte28sbtfWfKsSutcRkhAXDxR7H7hmBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2a0c3880790091770c8e06be410b6ced7290f1dd6c347abb0678b3be193f7b78","last_reissued_at":"2026-05-20T01:06:14.262886Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T01:06:14.262886Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Mechanisms of Object Localization in Vision-Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Gemma Roig, Martina G. Vilas, Timothy Schauml\\\"offel","submitted_at":"2026-05-19T12:56:32Z","abstract_excerpt":"Visually-grounded language models (VLMs) are highly effective in linking visual and textual information, yet they often struggle with basic classification and localization tasks. While classification mechanisms have been studied more extensively, the processes that support object localization remain poorly understood. In this work, we investigate two representative families, LLaVA-1.5 and InternVL-3.5, using a suite of mechanistic interpretability tools, including token ablations, attention knockout, and causal mediation analysis. We find that localization is driven by a containerization mecha"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.19792","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.19792/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.19792","created_at":"2026-05-20T01:06:14.263031+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.19792v1","created_at":"2026-05-20T01:06:14.263031+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.19792","created_at":"2026-05-20T01:06:14.263031+00:00"},{"alias_kind":"pith_short_12","alias_value":"FIGDRADZACIX","created_at":"2026-05-20T01:06:14.263031+00:00"},{"alias_kind":"pith_short_16","alias_value":"FIGDRADZACIXODEO","created_at":"2026-05-20T01:06:14.263031+00:00"},{"alias_kind":"pith_short_8","alias_value":"FIGDRADZ","created_at":"2026-05-20T01:06:14.263031+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/FIGDRADZACIXODEOA27ECC3M5V","json":"https://pith.science/pith/FIGDRADZACIXODEOA27ECC3M5V.json","graph_json":"https://pith.science/api/pith-number/FIGDRADZACIXODEOA27ECC3M5V/graph.json","events_json":"https://pith.science/api/pith-number/FIGDRADZACIXODEOA27ECC3M5V/events.json","paper":"https://pith.science/paper/FIGDRADZ"},"agent_actions":{"view_html":"https://pith.science/pith/FIGDRADZACIXODEOA27ECC3M5V","download_json":"https://pith.science/pith/FIGDRADZACIXODEOA27ECC3M5V.json","view_paper":"https://pith.science/paper/FIGDRADZ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.19792&json=true","fetch_graph":"https://pith.science/api/pith-number/FIGDRADZACIXODEOA27ECC3M5V/graph.json","fetch_events":"https://pith.science/api/pith-number/FIGDRADZACIXODEOA27ECC3M5V/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/FIGDRADZACIXODEOA27ECC3M5V/action/timestamp_anchor","attest_storage":"https://pith.science/pith/FIGDRADZACIXODEOA27ECC3M5V/action/storage_attestation","attest_author":"https://pith.science/pith/FIGDRADZACIXODEOA27ECC3M5V/action/author_attestation","sign_citation":"https://pith.science/pith/FIGDRADZACIXODEOA27ECC3M5V/action/citation_signature","submit_replication":"https://pith.science/pith/FIGDRADZACIXODEOA27ECC3M5V/action/replication_record"}},"created_at":"2026-05-20T01:06:14.263031+00:00","updated_at":"2026-05-20T01:06:14.263031+00:00"}