{"total":25,"items":[{"citing_arxiv_id":"2605.12481","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ToolCUA: Towards Optimal GUI-Tool Path Orchestration for Computer Use Agents","primary_cat":"cs.AI","submitted_at":"2026-05-12T17:57:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ToolCUA introduces a trajectory scaling pipeline and staged RL to optimize GUI-tool switching, reaching 46.85% accuracy on OSWorld-MCP for a 66% relative gain over baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12374","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-12T16:41:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GAP aligns visual latent reasoning in MLLMs at feature, context, and capacity levels, yielding the best aggregate perception and reasoning scores on Qwen2.5-VL 7B among supervised variants while providing task-relevant visual signal at inference time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11856","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"UniVLR: Unifying Text and Vision in Visual Latent Reasoning for Multimodal LLMs","primary_cat":"cs.CV","submitted_at":"2026-05-12T09:40:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UniVLR unifies textual and visual reasoning in multimodal LLMs by compressing reasoning traces and auxiliary images into visual latent tokens for direct inference without interleaved text CoT.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09860","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When to Re-Commit: Temporal Abstraction Discovery for Long-Horizon Vision-Language Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-11T01:43:13+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"State-conditioned commitment depth in a vision-language policy Pareto-dominates fixed-depth baselines on Sliding Puzzle and Sokoban, raising solve rates by up to 12.5 points while using 25% fewer actions and beating larger models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08735","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CollabVR: Collaborative Video Reasoning with Vision-Language and Video Generation Models","primary_cat":"cs.CV","submitted_at":"2026-05-09T06:39:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CollabVR improves video reasoning performance by coupling vision-language models and video generation models in a closed-loop step-level collaboration that detects and repairs generation failures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07106","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Retrieve, Integrate, and Synthesize: Spatial-Semantic Grounded Latent Visual Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-08T01:33:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RIS improves MLLM latent visual reasoning by retrieving spatial-semantic evidence, integrating it via attention bottlenecks, and synthesizing it with language transition tokens, yielding gains on V*, HRBench, MMVP, and BLINK benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07019","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LensVLM: Selective Context Expansion for Compressed Visual Representation of Text","primary_cat":"cs.CV","submitted_at":"2026-05-07T23:03:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LensVLM trains VLMs to scan compressed rendered text images and selectively expand task-relevant regions, achieving 4.3x compression with near full-text accuracy and outperforming baselines up to 10.1x on text QA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06121","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Pest-Thinker: Learning to Think and Reason like Entomologists via Reinforcement Learning","primary_cat":"cs.CV","submitted_at":"2026-05-07T12:30:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Pest-Thinker is a reinforcement learning framework that improves MLLMs' expert-level reasoning on pest morphology via synthesized CoT trajectories, GRPO optimization, and an LLM-judged feature reward on new benchmarks QFSD and AgriInsect.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02630","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AutoFocus: Uncertainty-Aware Active Visual Search for GUI Grounding","primary_cat":"cs.CV","submitted_at":"2026-05-04T14:18:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AutoFocus converts token perplexity into an anisotropic Gaussian uncertainty field to drive region proposals and shape-aware zooming for improved GUI grounding in VLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02378","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Enhancing Multimodal In-Context Learning via Inductive-Deductive Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-04T09:18:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A framework with similarity-based visual token compression, dynamic attention rebalancing, and explicit inductive-deductive chain-of-thought improves multimodal ICL performance across eight benchmarks for open-source VLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00814","ref_index":64,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Persistent Visual Memory: Sustaining Perception for Deep Generation in LVLMs","primary_cat":"cs.CV","submitted_at":"2026-05-01T17:54:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PVM adds a parallel branch to LVLMs that directly supplies visual embeddings to prevent attention decay over long generated sequences, yielding accuracy gains on reasoning tasks with minimal overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00444","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Scaling Video Understanding via Compact Latent Multi-Agent Collaboration","primary_cat":"cs.CV","submitted_at":"2026-05-01T06:24:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MACF decouples agent perception budgets from overall video length using latent token collaboration to scale video understanding in MLLMs beyond current limits.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22245","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Listening with Time: Precise Temporal Awareness for Long-Form Audio Understanding","primary_cat":"eess.AS","submitted_at":"2026-04-24T05:40:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LAT-Audio introduces a global-to-local reasoning approach with TWA-CoT that outperforms prior models on temporal tasks for audio up to 30 minutes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21409","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"S1-VL: Scientific Multimodal Reasoning Model with Thinking-with-Images","primary_cat":"cs.CV","submitted_at":"2026-04-23T08:23:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"S1-VL combines structured scientific reasoning with iterative image manipulation via code execution to reach state-of-the-art results on visual and scientific reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20806","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OMIBench: Benchmarking Olympiad-Level Multi-Image Reasoning in Large Vision-Language Model","primary_cat":"cs.CV","submitted_at":"2026-04-22T17:37:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OMIBench benchmark reveals that current LVLMs achieve at most 50% on Olympiad problems requiring reasoning across multiple images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20570","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Exploring Spatial Intelligence from a Generative Perspective","primary_cat":"cs.CV","submitted_at":"2026-04-22T13:50:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Fine-tuning multimodal models on a new synthetic spatial benchmark improves generative spatial compliance on real and synthetic tasks and transfers to better spatial understanding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16858","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Q-DeepSight: Incentivizing Thinking with Images for Image Quality Assessment and Refinement","primary_cat":"cs.CV","submitted_at":"2026-04-18T06:10:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Q-DeepSight proposes a think-with-image multimodal CoT framework trained via RL with perceptual curriculum rewards and evidence gradient filtering to achieve SOTA IQA performance and enable training-free perceptual refinement in image generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12890","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Towards Long-horizon Agentic Multimodal Search","primary_cat":"cs.CV","submitted_at":"2026-04-14T15:40:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LMM-Searcher uses file-based visual UIDs and a fetch tool plus 12K synthesized trajectories to fine-tune a multimodal agent that scales to 100-turn horizons and reaches SOTA among open-source models on MM-BrowseComp and MMSearch-Plus.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11025","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Test-time Scaling over Perception: Resolving the Grounding Paradox in Thinking with Images","primary_cat":"cs.CV","submitted_at":"2026-04-13T05:49:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TTSP resolves the Grounding Paradox by treating perception as a scalable test-time process that generates, filters, and iteratively refines multiple visual exploration traces, outperforming baselines on high-resolution and multimodal reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08879","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GRASP: Grounded CoT Reasoning with Dual-Stage Optimization for Multimodal Sarcasm Target Identification","primary_cat":"cs.CL","submitted_at":"2026-04-10T02:38:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GRASP improves multimodal sarcasm target identification by anchoring visual regions in grounded chain-of-thought reasoning and using dual-stage optimization on a new balanced dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08545","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Act Wisely: Cultivating Meta-Cognitive Tool Use in Agentic Multimodal Models","primary_cat":"cs.CV","submitted_at":"2026-04-09T17:59:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HDPO reframes tool efficiency as a conditional objective within accurate trajectories, enabling Metis to reduce tool invocations by orders of magnitude while raising reasoning accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06777","ref_index":55,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Walk the Talk: Bridging the Reasoning-Action Gap for Thinking with Images via Multimodal Agentic Policy Optimization","primary_cat":"cs.CV","submitted_at":"2026-04-08T07:48:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAPO improves multimodal chain-of-thought reasoning by requiring explicit textual descriptions of visual tool results and using a novel advantage estimator that combines semantic alignment with task rewards.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04500","ref_index":63,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Saliency-R1: Enforcing Interpretable and Faithful Vision-language Reasoning via Saliency-map Alignment Reward","primary_cat":"cs.CV","submitted_at":"2026-04-06T07:51:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Saliency-R1 uses a novel saliency map technique and GRPO with human bounding-box overlap as reward to improve VLM reasoning faithfulness and interpretability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04017","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GeoBrowse: A Geolocation Benchmark for Agentic Tool Use with Expert-Annotated Reasoning Traces","primary_cat":"cs.CL","submitted_at":"2026-04-05T08:29:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GeoBrowse is a two-level geolocation benchmark combining visual cue composition with knowledge-intensive multi-hop queries, paired with the GATE agent workflow that outperforms no-tool, search-only, and image-only baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03660","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TableVision: A Large-Scale Benchmark for Spatially Grounded Reasoning over Complex Hierarchical Tables","primary_cat":"cs.AI","submitted_at":"2026-04-04T09:26:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TableVision benchmark shows explicit spatial grounding recovers MLLM reasoning on hierarchical tables, delivering 12.3% accuracy improvement through a decoupled perception-reasoning framework.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}