{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:ZKQYAIYYBY36EK4TASZHRCSJTW","short_pith_number":"pith:ZKQYAIYY","schema_version":"1.0","canonical_sha256":"caa18023180e37e22b9304b2788a499d975e82980cd94bd7543d11a9f685b848","source":{"kind":"arxiv","id":"2606.19552","version":1},"attestation_state":"computed","paper":{"title":"LaViSA: A Language and Vision Structural Ambiguity Benchmark","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Koichiro Yoshino, Lee Sangmyeong, Shun Inadumi","submitted_at":"2026-06-17T19:51:00Z","abstract_excerpt":"Structural ambiguity arises when a single sentence admits multiple valid interpretations due to its syntactic structure, posing a fundamental challenge for language understanding. Visual scenes serve as useful cues for resolving such ambiguity, and Vision and Language Models (VLMs) need to be capable of deriving possible semantic interpretations from visual scenes. We introduce Language and Vision Structural Ambiguity (LaViSA), a benchmark designed to evaluate the ability of VLMs to resolve structural ambiguity leveraging visual scenes. LaViSA consists of ambiguous sentences, their disambiguat"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.19552","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-17T19:51:00Z","cross_cats_sorted":[],"title_canon_sha256":"d73f3db9788b448f8dde63341b2c1ca7df638d1ab02f0387687db84b29250151","abstract_canon_sha256":"7ce8960395b23a357802e620dc2c769b36a4383df282fed38fce0cef703bddf1"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:12:28.752444Z","signature_b64":"Zpb03Vw2NVNmdlvbS4t9K8fYi890Jcl2RfJrVu0uP4YHY4Ed7J5cp7BuozkeflpFdvf8DguTjMA+d2Zp3Gt/BQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"caa18023180e37e22b9304b2788a499d975e82980cd94bd7543d11a9f685b848","last_reissued_at":"2026-06-19T16:12:28.752074Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:12:28.752074Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"LaViSA: A Language and Vision Structural Ambiguity Benchmark","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Koichiro Yoshino, Lee Sangmyeong, Shun Inadumi","submitted_at":"2026-06-17T19:51:00Z","abstract_excerpt":"Structural ambiguity arises when a single sentence admits multiple valid interpretations due to its syntactic structure, posing a fundamental challenge for language understanding. Visual scenes serve as useful cues for resolving such ambiguity, and Vision and Language Models (VLMs) need to be capable of deriving possible semantic interpretations from visual scenes. We introduce Language and Vision Structural Ambiguity (LaViSA), a benchmark designed to evaluate the ability of VLMs to resolve structural ambiguity leveraging visual scenes. LaViSA consists of ambiguous sentences, their disambiguat"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.19552","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.19552/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.19552","created_at":"2026-06-19T16:12:28.752137+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.19552v1","created_at":"2026-06-19T16:12:28.752137+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.19552","created_at":"2026-06-19T16:12:28.752137+00:00"},{"alias_kind":"pith_short_12","alias_value":"ZKQYAIYYBY36","created_at":"2026-06-19T16:12:28.752137+00:00"},{"alias_kind":"pith_short_16","alias_value":"ZKQYAIYYBY36EK4T","created_at":"2026-06-19T16:12:28.752137+00:00"},{"alias_kind":"pith_short_8","alias_value":"ZKQYAIYY","created_at":"2026-06-19T16:12:28.752137+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ZKQYAIYYBY36EK4TASZHRCSJTW","json":"https://pith.science/pith/ZKQYAIYYBY36EK4TASZHRCSJTW.json","graph_json":"https://pith.science/api/pith-number/ZKQYAIYYBY36EK4TASZHRCSJTW/graph.json","events_json":"https://pith.science/api/pith-number/ZKQYAIYYBY36EK4TASZHRCSJTW/events.json","paper":"https://pith.science/paper/ZKQYAIYY"},"agent_actions":{"view_html":"https://pith.science/pith/ZKQYAIYYBY36EK4TASZHRCSJTW","download_json":"https://pith.science/pith/ZKQYAIYYBY36EK4TASZHRCSJTW.json","view_paper":"https://pith.science/paper/ZKQYAIYY","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.19552&json=true","fetch_graph":"https://pith.science/api/pith-number/ZKQYAIYYBY36EK4TASZHRCSJTW/graph.json","fetch_events":"https://pith.science/api/pith-number/ZKQYAIYYBY36EK4TASZHRCSJTW/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ZKQYAIYYBY36EK4TASZHRCSJTW/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ZKQYAIYYBY36EK4TASZHRCSJTW/action/storage_attestation","attest_author":"https://pith.science/pith/ZKQYAIYYBY36EK4TASZHRCSJTW/action/author_attestation","sign_citation":"https://pith.science/pith/ZKQYAIYYBY36EK4TASZHRCSJTW/action/citation_signature","submit_replication":"https://pith.science/pith/ZKQYAIYYBY36EK4TASZHRCSJTW/action/replication_record"}},"created_at":"2026-06-19T16:12:28.752137+00:00","updated_at":"2026-06-19T16:12:28.752137+00:00"}