{"total":21,"items":[{"citing_arxiv_id":"2606.00579","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Sandboxed Coding Agents are Competitive Omni-modal Task Solvers","primary_cat":"cs.CL","submitted_at":"2026-05-30T07:04:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Sandboxed coding agents with text+image access match or outperform native omnimodal models on audio-video benchmarks by converting tasks into code-driven retrieval and processing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28741","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Self-Prophetic Decoding to Unlock Visual Search in LVLMs","primary_cat":"cs.CV","submitted_at":"2026-05-27T17:01:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SeProD is a plug-and-play self-prophetic decoding framework that combines pre- and post-training LVLM capabilities via probability-based sampling to improve coherent visual search and multi-step reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27959","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ROVER: Routing Object-Centric Visual Evidence for Grounded Multi-Image Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-27T04:52:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ROVER introduces a learnable routing plugin for object-centric visual evidence in MLLMs via token triplets and differential attention, reporting gains on MM-GCoT and VideoEspresso when integrated into Qwen2.5-VL-7B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20743","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Draw2Think: Harnessing Geometry Reasoning through Constraint Engine Interaction","primary_cat":"cs.CV","submitted_at":"2026-05-20T05:46:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Draw2Think recasts geometric reasoning as agentic interaction with a constraint engine, achieving 95.9% predicate-level construction fidelity and up to 16.4% accuracy gains on solid geometry tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15181","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Plans to Pixels: Learning to Plan and Orchestrate for Open-Ended Image Editing","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:58:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A planner-orchestrator system learns long-horizon image editing by maximizing outcome-based rewards from a vision-language judge and refining plans from successful trajectories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09860","ref_index":22,"ref_count":3,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When to Re-Commit: Temporal Abstraction Discovery for Long-Horizon Vision-Language Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-11T01:43:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Learns state-conditioned commitment depth in a 7B vision-language policy that jointly predicts actions and replan intervals, outperforming fixed-depth baselines and larger models on Sliding Puzzle and Sokoban while providing a theoretical dominance result.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[20] Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Liang Wang, Weizhu Chen, et al. Lora: Low-rank adaptation of large language models.Iclr, 1(2):3, 2022. [21] Yushi Hu, Weijia Shi, Xingyu Fu, Dan Roth, Mari Ostendorf, Luke Zettlemoyer, Noah A Smith, and Ranjay Krishna. Visual sketchpad: Sketching as a visual chain of thought for multimodal language models, 2024. URLhttps://arxiv.org/abs/2406.09403. [22] Physical Intelligence, Kevin Black, Noah Brown, James Darpinian, Karan Dhabalia, Danny Driess, Adnan Esmail, Michael Equi, Chelsea Finn, Niccolo Fusai, Manuel Y . Galliker, Dibya Ghosh, Lachy Groom, Karol Hausman, Brian Ichter, Szymon Jakubczak, Tim Jones, Liyiming Ke, Devin LeBlanc, Sergey Levine, Adrian Li-Bell, Mohith Mothukuri, Suraj Nair, Karl Pertsch,"},{"citing_arxiv_id":"2605.03950","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"UnAC: Adaptive Visual Prompting with Abstraction and Stepwise Checking for Complex Multimodal Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-05T16:36:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"UnAC improves LMM performance on visual reasoning benchmarks by combining adaptive visual prompting, image abstraction, and gradual self-checking.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19945","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Visual Reasoning through Tool-supervised Reinforcement Learning","primary_cat":"cs.CV","submitted_at":"2026-04-21T19:48:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ToolsRL trains MLLMs via a tool-specific then accuracy-focused RL curriculum to master visual tools for complex reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17800","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReFineVLA: Multimodal Reasoning-Aware Generalist Robotic Policies via Teacher-Guided Fine-Tuning","primary_cat":"cs.RO","submitted_at":"2026-04-20T04:46:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ReFineVLA adds teacher-generated reasoning steps to VLA training and reports state-of-the-art success rates on SimplerEnv WidowX and Google Robot benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09167","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MAG-3D: Multi-Agent Grounded Reasoning for 3D Understanding","primary_cat":"cs.CV","submitted_at":"2026-04-10T09:51:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MAG-3D is a training-free multi-agent framework that coordinates planning, grounding, and coding agents with off-the-shelf VLMs to achieve grounded 3D reasoning and state-of-the-art benchmark results.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"ing tool invocations and their returned observations to support multi-step tool- augmented reasoning [11]. More broadly, such designs follow the general line of tool-using and memory-aware agent frameworks for interleaving reasoning with actions and managing intermediate context [37,48]. In 2D-VLMs, tool usage is often framed as a cycle of active perception, where models iteratively \"ob- serve-think-act\" [16,19,26,40,43-45,55]. Extending this framework to video, tool-augmented LVLMs have been developed to externalize spatiotemporal ev- idence through modules such as tracking, temporal grounding, and video seg- mentation [13,14,50,51,53]. These modules help capture visual scenes, enabling models to reason across spatial dimensions in 2D and both spatial and tempo- ral dimensions in video."},{"citing_arxiv_id":"2512.12623","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reasoning Within the Mind: Dynamic Multimodal Interleaving in Latent Space","primary_cat":"cs.CV","submitted_at":"2025-12-14T10:07:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DMLR performs dynamic visual-textual interleaving in latent space using confidence-guided latent policy gradient optimization and a dynamic visual injection strategy, yielding improved multimodal reasoning on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.10941","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mull-Tokens: Modality-Agnostic Latent Thinking","primary_cat":"cs.CV","submitted_at":"2025-12-11T18:59:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mull-Tokens are modality-agnostic latent tokens that enable free-form multimodal thinking and deliver up to 16% gains on spatial reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.24251","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Latent Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2025-09-29T03:52:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Latent Visual Reasoning enables autoregressive generation of latent visual states that reconstruct critical image tokens, yielding gains on perception-heavy VQA benchmarks such as 71.67% on MMVP.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.01955","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"How Well Does GPT-4o Understand Vision? Evaluating Multimodal Foundation Models on Standard Computer Vision Tasks","primary_cat":"cs.CV","submitted_at":"2025-07-02T17:59:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Multimodal foundation models achieve respectable but sub-specialist performance on semantic vision tasks and weaker results on geometric tasks when evaluated through prompt chaining on established benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.06211","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PuzzleWorld: A Benchmark for Multimodal, Open-Ended Reasoning in Puzzlehunts","primary_cat":"cs.CL","submitted_at":"2025-06-06T16:17:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PuzzleWorld benchmark reveals state-of-the-art AI models solve only 18% of complex puzzlehunt problems with 40% stepwise accuracy, matching novices but trailing enthusiasts, while fine-tuning on traces yields modest gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.23678","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Grounded Reinforcement Learning for Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2025-05-29T17:20:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ViGoRL introduces visually grounded RL that anchors reasoning steps to image coordinates and uses multi-turn zooming to outperform standard RL and supervised baselines on spatial and GUI reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.22020","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CoT-VLA: Visual Chain-of-Thought Reasoning for Vision-Language-Action Models","primary_cat":"cs.CV","submitted_at":"2025-03-27T22:23:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoT-VLA is a 7B VLA that generates future visual frames autoregressively as planning goals before actions, outperforming prior VLAs by 17% on real-world tasks and 6% in simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.17352","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OpenVLThinker: Complex Vision-Language Reasoning via Iterative SFT-RL Cycles","primary_cat":"cs.CV","submitted_at":"2025-03-21T17:52:43+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Iterative SFT-RL cycles enable a 7B LVLM to develop sophisticated visual chain-of-thought reasoning and improve performance on math and general reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.12605","ref_index":77,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Multimodal Chain-of-Thought Reasoning: A Comprehensive Survey","primary_cat":"cs.CV","submitted_at":"2025-03-16T18:39:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"The paper provides the first comprehensive survey of multimodal chain-of-thought reasoning, including foundational concepts, a taxonomy of methodologies, application analyses, challenges, and future directions.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"IPVR [51]; Multimodal-CoT [29]; CoT-PT [52]; PromptCoT [53]; VCoT [54];PCoT [55]; MM-CoT [56]; HoT [57]; CoTDet [58]; DDCoT [59]; CPSeg [60];Gen2Sim [61]; CoI [62]; MC-CoT [63]; CCoT [64]; LoT [65]; DPMM-CoT [66];GCoT [23]; CoCoT [67]; KAM-CoT [68]; PKRD-CoT [69]; CoS [70]; CoA [71];Det-CoT [72]; BDoG [73]; TextCoT [74]; CoRAG [75]; Cantor [76];Visual Sketchpad [77]; IoT [78]; PS-CoT [79]; G-CoT [80]; STIC [81];SNSE-CoT [82]; CoE [83]; DCoT [84]; Layoutllm-t2i [85]; Creatilayout [86];visual-o1 [87]; R-CoT [88]; LLaV A-CoT [9]; VIC [89]; RelationLMM [90];Insight-V [91]; LLaV A-Aurora [92]; AR-MCTS [93]; Mulberry [94]; Virgo [95];Socratic [96]; LlamaV-o1 [97]; MV oT [30]; PARM++ [34]; URSA [98];Multimodal Open R1 [99]; AStar [100]; R1-OneVision [101]; SoT [102]"},{"citing_arxiv_id":"2501.07542","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Imagine while Reasoning in Space: Multimodal Visualization-of-Thought","primary_cat":"cs.CL","submitted_at":"2025-01-13T18:23:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MVoT lets multimodal models create coherent images during chain-of-thought reasoning via a token discrepancy loss, yielding competitive or better results than text-only CoT on dynamic spatial tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.04509","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ErrorRadar: Benchmarking Complex Mathematical Reasoning of Multimodal Large Language Models Via Error Detection","primary_cat":"cs.CL","submitted_at":"2024-10-06T14:59:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"ErrorRadar is a new benchmark of 2,500 multimodal K-12 math problems for MLLM error step identification and categorization, where GPT-4o trails human experts by ~10%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}