{"total":11,"items":[{"citing_arxiv_id":"2605.07817","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GazeVLM: Active Vision via Internal Attention Control for Multimodal Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-08T14:49:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GazeVLM introduces internal gaze tokens that allow VLMs to dynamically suppress irrelevant visual features and simulate foveal attention for improved high-resolution multimodal reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24583","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Improving Vision-language Models with Perception-centric Process Reward Models","primary_cat":"cs.CV","submitted_at":"2026-04-27T15:08:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Perceval is a perception-centric PRM that detects token-level perceptual errors in VLMs, supporting token-advantage RL training and iterative test-time scaling for improved reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22875","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SketchVLM: Vision language models can annotate images to explain thoughts and guide users","primary_cat":"cs.CV","submitted_at":"2026-04-23T22:33:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SketchVLM lets VLMs generate non-destructive SVG annotations on input images to visually explain answers, raising visual reasoning accuracy by up to 28.5 points and annotation quality by 1.48x over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18562","ref_index":213,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AnchorSeg: Language Grounded Query Banks for Reasoning Segmentation","primary_cat":"cs.CV","submitted_at":"2026-04-20T17:49:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AnchorSeg uses ordered query banks of latent reasoning tokens plus a spatial anchor token and a Token-Mask Cycle Consistency loss to achieve 67.7% gIoU and 68.1% cIoU on the ReasonSeg benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10219","ref_index":82,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Cognitive Pivot Points and Visual Anchoring: Unveiling and Rectifying Hallucinations in Multimodal Reasoning Models","primary_cat":"cs.AI","submitted_at":"2026-04-11T13:59:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Multimodal reasoning models hallucinate at high-entropy cognitive bifurcation points due to loss of visual semantic anchoring, and the V-STAR training paradigm with HVAR rewards and FRM reflection mitigates this by reinforcing visual 
attention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09167","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MAG-3D: Multi-Agent Grounded Reasoning for 3D Understanding","primary_cat":"cs.CV","submitted_at":"2026-04-10T09:51:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MAG-3D is a training-free multi-agent framework that coordinates planning, grounding, and coding agents with off-the-shelf VLMs to achieve grounded 3D reasoning and state-of-the-art benchmark results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08456","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Entropy-Gradient Grounding: Training-Free Evidence Retrieval in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-09T16:51:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Entropy-gradient grounding uses model uncertainty to retrieve evidence regions in VLMs, improving performance on detail-critical and compositional tasks across multiple architectures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08065","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Multimodal Latent Reasoning via Predictive Embeddings","primary_cat":"cs.LG","submitted_at":"2026-04-09T10:27:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pearl learns predictive embeddings from multimodal tool trajectories in latent space to enable efficient reasoning that matches or exceeds supervised fine-tuning and reconstruction-based methods without explicit tool invocation at inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06912","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Q-Zoom: Query-Aware Adaptive Perception for Efficient Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-08T10:12:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Q-Zoom achieves up to 4.39x inference speedup in high-resolution MLLM scenarios via query-aware gating and region localization, matching or exceeding baseline accuracy on document and high-res benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.07062","ref_index":149,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Seed1.5-VL Technical Report","primary_cat":"cs.CV","submitted_at":"2025-05-11T17:28:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Seed1.5-VL is a compact multimodal model that sets new records on dozens of vision-language benchmarks and outperforms prior systems on agent-style tasks.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Results are shown in tables 8 and 9, where we report Seed1.5-VL's performance under the thinking mode. GUI Grounding. 
GUI grounding refers to the model's ability to understand and localize interface elements-a fundamental skill for vision-based agents. We evaluate this capability on ScreenSpot Pro [72], which focuses on expert-annotated tasks in professional settings, and ScreenSpot v2 [149], which covers grounding across 25 Capability Benchmark Seed1.5-VL thinking Seed1.5-VL non-thinking Prior SOTA Short video MotionBench [48] 68.4 68.4 62.8 GLM-4V MVBench [73] 74.4 74.3 76.4 InternVL-2.5 TOMATO [117] 44.7 44.2 46.9∗ Gemini 2.5 Pro TVBench [19] 63.6 61.5 62.6∗ Gemini 2.5 Pro Dream-1K [139] 43.9 42.6 42.0 Tarsier2 TempCompass [82] 83."},{"citing_arxiv_id":"2407.07726","ref_index":147,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PaliGemma: A versatile 3B VLM for transfer","primary_cat":"cs.CV","submitted_at":"2024-07-10T14:57:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"PaliGemma is an open 3B VLM based on SigLIP and Gemma that achieves strong performance on nearly 40 diverse open-world tasks including benchmarks, remote-sensing, and segmentation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}
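Every record in "items" shares the same schema (citing_arxiv_id, ref_index, novelty_score, verdict, optional context_text, etc.). Below is a minimal Python sketch of how such a listing could be consumed; the file name citations.json and the filtering criteria are illustrative assumptions, not part of the source data.

# Minimal sketch: load the citation listing above and query it by its own
# metadata fields. Assumes the JSON has been saved to "citations.json"
# (hypothetical file name; the field names come from the listing itself).
import json

with open("citations.json", "r", encoding="utf-8") as f:
    listing = json.load(f)

items = listing["items"]  # one record per citing paper

# Example query: citing papers with novelty_score >= 6.0, newest first.
notable = sorted(
    (it for it in items if it["novelty_score"] >= 6.0),
    key=lambda it: it["submitted_at"],
    reverse=True,
)
for it in notable:
    print(f'{it["citing_arxiv_id"]}  [{it["primary_cat"]}]  {it["paper_title"]}')

Run against the listing above, this prints the seven records with novelty_score of 6.0 or higher, ordered by submission date; swapping the predicate (e.g. on "verdict" or "context_count") follows the same pattern.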