{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:WCZCW6CGUOLLC5I6HYOF7TDI5X","short_pith_number":"pith:WCZCW6CG","schema_version":"1.0","canonical_sha256":"b0b22b7846a396b1751e3e1c5fcc68ede9e8841e21e0449badf7ef3201478b77","source":{"kind":"arxiv","id":"2605.13080","version":1},"attestation_state":"computed","paper":{"title":"Learning to See What You Need: Gaze Attention for Multimodal Large Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Multimodal LLMs can match or exceed full dense attention by dynamically restricting focus to a small number of task-relevant gaze regions and using up to 90 percent fewer visual key-value entries.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Byeongho Heo, Dongyoon Han, Geonmo Gu, Jaegul Choo, Junha Song, Sangdoo Yun","submitted_at":"2026-05-13T06:54:09Z","abstract_excerpt":"When humans describe a visual scene, they do not process the entire image uniformly; instead, they selectively fixate on regions relevant to their intended description. In contrast, current multimodal large language models (MLLMs) attend to all visual tokens at each generation step, leading to diluted focus and unnecessary computational overhead. In this work, we introduce Gaze Attention, a novel mechanism that enables MLLMs to selectively attend to task-relevant visual regions during generation. Specifically, we spatially group visual embeddings-stored as key-value caches-into compact gaze re"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2605.13080","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T06:54:09Z","cross_cats_sorted":[],"title_canon_sha256":"b87ba34be4779e971d0208fedc4745687a3fcfe54e350e1e099b5b6c8fe747f8","abstract_canon_sha256":"d5061722eb07690452f1db69d5249046c32d2804a577bf13591dc33afc8bd85e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:08:58.706530Z","signature_b64":"XxS51XXkV4ggZ2loboyX27S/V0UczfTiH+1/kDEcUnR0qxNmba475GeaTr9JqbGp19PB1P3F4WZugREcQi8wCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b0b22b7846a396b1751e3e1c5fcc68ede9e8841e21e0449badf7ef3201478b77","last_reissued_at":"2026-05-18T03:08:58.705781Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:08:58.705781Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Learning to See What You Need: Gaze Attention for Multimodal Large Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Multimodal LLMs can match or exceed full dense attention by dynamically restricting focus to a small number of task-relevant gaze regions and using up to 90 percent fewer visual key-value entries.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Byeongho Heo, Dongyoon Han, Geonmo Gu, Jaegul Choo, Junha Song, Sangdoo Yun","submitted_at":"2026-05-13T06:54:09Z","abstract_excerpt":"When humans describe a visual scene, they do not process the entire image uniformly; instead, they selectively fixate on regions relevant to their intended description. In contrast, current multimodal large language models (MLLMs) attend to all visual tokens at each generation step, leading to diluted focus and unnecessary computational overhead. In this work, we introduce Gaze Attention, a novel mechanism that enables MLLMs to selectively attend to task-relevant visual regions during generation. Specifically, we spatially group visual embeddings-stored as key-value caches-into compact gaze re"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Gaze Attention matches or surpasses dense-attention baselines, while using up to 90% fewer visual KV entries in the attention computation.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That spatially grouping visual embeddings into compact gaze regions, dynamically selecting them via lightweight descriptors, and appending learnable context tokens is sufficient to preserve all task-critical information without performance loss.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Gaze Attention groups visual embeddings into selectable regions and dynamically restricts attention to task-relevant ones, matching dense baselines with up to 90% fewer visual KV entries via added context tokens.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Multimodal LLMs can match or exceed full dense attention by dynamically restricting focus to a small number of task-relevant gaze regions and using up to 90 percent fewer visual key-value entries.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"baf235a64af321b271a91cf2848977661d22ccf008c00eda2368a80cd7de658d"},"source":{"id":"2605.13080","kind":"arxiv","version":1},"verdict":{"id":"8039f2d3-1736-470c-9e77-4a4dceb70bbd","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T20:09:46.446658Z","strongest_claim":"Gaze Attention matches or surpasses dense-attention baselines, while using up to 90% fewer visual KV entries in the attention computation.","one_line_summary":"Gaze Attention groups visual embeddings into selectable regions and dynamically restricts attention to task-relevant ones, matching dense baselines with up to 90% fewer visual KV entries via added context tokens.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That spatially grouping visual embeddings into compact gaze regions, dynamically selecting them via lightweight descriptors, and appending learnable context tokens is sufficient to preserve all task-critical information without performance loss.","pith_extraction_headline":"Multimodal LLMs can match or exceed full dense attention by dynamically restricting focus to a small number of task-relevant gaze regions and using up to 90 percent fewer visual key-value entries."},"references":{"count":194,"sample":[{"doi":"","year":null,"title":"Visual Instruction Tuning , author=. NeurIPS , year=","work_id":"33367c0f-af90-4a3a-9451-241f0933bca8","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Improved baselines with visual instruction tuning , author=. CVPR , year=","work_id":"00fda102-447d-45fb-bd35-f1175364de39","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"LLaVA-NeXT-Interleave: Tackling Multi-image, Video, and 3D in Large Multimodal Models","work_id":"746ec475-051a-45ef-a6ed-2de8118466e4","ref_index":3,"cited_arxiv_id":"2407.07895","is_internal_anchor":true},{"doi":"","year":null,"title":"Llava-onevision: Easy visual task transfer , author=. TMLR , year=","work_id":"d74fca3b-7e11-491c-9bdd-d88919233f24","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Learning transferable visual models from natural language supervision , author=. ICML , year=","work_id":"bccf13fa-668d-4445-905c-f275e43b4e6c","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":194,"snapshot_sha256":"22f2654608ff53b62187e2a89b8c46d74bc1fe83ab9330ef71166f8dd5ae36fd","internal_anchors":40},"formal_canon":{"evidence_count":2,"snapshot_sha256":"e980bb96ce847e15aa9ec5b8b216cbf77fdb22c0a78bce39f840e21546ab7aaf"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.13080","created_at":"2026-05-18T03:08:58.705893+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.13080v1","created_at":"2026-05-18T03:08:58.705893+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13080","created_at":"2026-05-18T03:08:58.705893+00:00"},{"alias_kind":"pith_short_12","alias_value":"WCZCW6CGUOLL","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"WCZCW6CGUOLLC5I6","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"WCZCW6CG","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WCZCW6CGUOLLC5I6HYOF7TDI5X","json":"https://pith.science/pith/WCZCW6CGUOLLC5I6HYOF7TDI5X.json","graph_json":"https://pith.science/api/pith-number/WCZCW6CGUOLLC5I6HYOF7TDI5X/graph.json","events_json":"https://pith.science/api/pith-number/WCZCW6CGUOLLC5I6HYOF7TDI5X/events.json","paper":"https://pith.science/paper/WCZCW6CG"},"agent_actions":{"view_html":"https://pith.science/pith/WCZCW6CGUOLLC5I6HYOF7TDI5X","download_json":"https://pith.science/pith/WCZCW6CGUOLLC5I6HYOF7TDI5X.json","view_paper":"https://pith.science/paper/WCZCW6CG","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.13080&json=true","fetch_graph":"https://pith.science/api/pith-number/WCZCW6CGUOLLC5I6HYOF7TDI5X/graph.json","fetch_events":"https://pith.science/api/pith-number/WCZCW6CGUOLLC5I6HYOF7TDI5X/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WCZCW6CGUOLLC5I6HYOF7TDI5X/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WCZCW6CGUOLLC5I6HYOF7TDI5X/action/storage_attestation","attest_author":"https://pith.science/pith/WCZCW6CGUOLLC5I6HYOF7TDI5X/action/author_attestation","sign_citation":"https://pith.science/pith/WCZCW6CGUOLLC5I6HYOF7TDI5X/action/citation_signature","submit_replication":"https://pith.science/pith/WCZCW6CGUOLLC5I6HYOF7TDI5X/action/replication_record"}},"created_at":"2026-05-18T03:08:58.705893+00:00","updated_at":"2026-05-18T03:08:58.705893+00:00"}