{"total":17,"items":[{"citing_arxiv_id":"2605.12481","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ToolCUA: Towards Optimal GUI-Tool Path Orchestration for Computer Use Agents","primary_cat":"cs.AI","submitted_at":"2026-05-12T17:57:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ToolCUA introduces a trajectory scaling pipeline and staged RL to optimize GUI-tool switching, reaching 46.85% accuracy on OSWorld-MCP for a 66% relative gain over baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10172","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"V-ABS: Action-Observer Driven Beam Search for Dynamic Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-11T08:21:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"V-ABS is an action-observer beam search method with entropy-based adaptive weighting and an 80k-sample SFT dataset that delivers 19.7% average gains on visual reasoning tasks for MLLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02730","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Perceptual Flow Network for Visually Grounded Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-04T15:31:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PFlowNet decouples perception from reasoning, integrates multi-dimensional rewards with vicinal geometric shaping via variational RL, and reports new SOTA results on V* Bench (90.6%) and MME-RealWorld-lite (67.0%).","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02378","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Enhancing Multimodal In-Context Learning via Inductive-Deductive Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-04T09:18:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A framework with similarity-based visual token compression, dynamic attention rebalancing, and explicit inductive-deductive chain-of-thought improves multimodal ICL performance across eight benchmarks for open-source VLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02130","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Where Things Are to What They Are For: Benchmarking Spatial-Functional Intelligence in Multimodal LLMs","primary_cat":"cs.CV","submitted_at":"2026-05-04T01:19:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SFI-Bench shows current multimodal LLMs struggle to integrate spatial memory with functional reasoning and external knowledge in video tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20146","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SAKE: Self-aware Knowledge Exploitation-Exploration for Grounded Multimodal Named Entity 
Recognition","primary_cat":"cs.IR","submitted_at":"2026-04-22T03:17:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SAKE is an agentic framework for GMNER that uses uncertainty-based self-awareness and reinforcement learning to balance internal knowledge exploitation with adaptive external exploration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19264","ref_index":82,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DR-MMSearchAgent: Deepening Reasoning in Multimodal Search Agents","primary_cat":"cs.CV","submitted_at":"2026-04-21T09:28:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DR-MMSearchAgent derives batch-wide trajectory advantages and uses differentiated Gaussian rewards to prevent premature collapse in multimodal agents, outperforming MMSearch-R1 by 8.4% on FVQA-test.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18320","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EVE: Verifiable Self-Evolution of MLLMs via Executable Visual Transformations","primary_cat":"cs.CV","submitted_at":"2026-04-20T14:20:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"EVE enables verifiable self-evolution of MLLMs by using a Challenger-Solver architecture to generate dynamic executable visual transformations that produce VQA problems with absolute execution-verified ground truth.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16858","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Q-DeepSight: Incentivizing Thinking with Images for Image Quality Assessment and Refinement","primary_cat":"cs.CV","submitted_at":"2026-04-18T06:10:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Q-DeepSight proposes a think-with-image multimodal CoT framework trained via RL with perceptual curriculum rewards and evidence gradient filtering to achieve SOTA IQA performance and enable training-free perceptual refinement in image generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14029","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"POINTS-Seeker: Towards Training a Multimodal Agentic Search Model from Scratch","primary_cat":"cs.CV","submitted_at":"2026-04-15T16:09:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"POINTS-Seeker-8B is an 8B multimodal model trained from scratch for agentic search that uses seeding and visual-space history folding to outperform prior models on six visual reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12890","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Towards Long-horizon Agentic Multimodal 
Search","primary_cat":"cs.CV","submitted_at":"2026-04-14T15:40:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LMM-Searcher uses file-based visual UIDs and a fetch tool plus 12K synthesized trajectories to fine-tune a multimodal agent that scales to 100-turn horizons and reaches SOTA among open-source models on MM-BrowseComp and MMSearch-Plus.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08545","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Act Wisely: Cultivating Meta-Cognitive Tool Use in Agentic Multimodal Models","primary_cat":"cs.CV","submitted_at":"2026-04-09T17:59:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HDPO reframes tool efficiency as a conditional objective within accurate trajectories, enabling Metis to reduce tool invocations by orders of magnitude while raising reasoning accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08539","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OpenVLThinkerV2: A Generalist Multimodal Reasoning Model for Multi-domain Visual Tasks","primary_cat":"cs.CV","submitted_at":"2026-04-09T17:59:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OpenVLThinkerV2 applies a new Gaussian GRPO training objective with response and entropy shaping to outperform prior open-source and proprietary models on 18 visual reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07900","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AnomalyAgent: Agentic Industrial Anomaly Synthesis via Tool-Augmented Reinforcement Learning","primary_cat":"cs.CV","submitted_at":"2026-04-09T07:17:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AnomalyAgent uses tool-augmented reinforcement learning with self-reflection to generate realistic industrial anomalies, achieving better metrics than zero-shot methods on MVTec-AD.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06777","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Walk the Talk: Bridging the Reasoning-Action Gap for Thinking with Images via Multimodal Agentic Policy Optimization","primary_cat":"cs.CV","submitted_at":"2026-04-08T07:48:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAPO improves multimodal chain-of-thought reasoning by requiring explicit textual descriptions of visual tool results and using a novel advantage estimator that combines semantic alignment with task rewards.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03660","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TableVision: A Large-Scale Benchmark for Spatially Grounded Reasoning over Complex Hierarchical 
Tables","primary_cat":"cs.AI","submitted_at":"2026-04-04T09:26:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TableVision benchmark shows explicit spatial grounding recovers MLLM reasoning on hierarchical tables, delivering 12.3% accuracy improvement through a decoupled perception-reasoning framework.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02794","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CharTool: Tool-Integrated Visual Reasoning for Chart Understanding","primary_cat":"cs.AI","submitted_at":"2026-04-03T07:02:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CharTool equips MLLMs with cropping and code tools plus agentic RL on DuoChart data to raise chart-reasoning accuracy by up to 9.78 percent on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}