{"total":20,"items":[{"citing_arxiv_id":"2605.13328","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"What Limits Vision-and-Language Navigation ?","primary_cat":"cs.RO","submitted_at":"2026-05-13T10:41:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StereoNav reaches new benchmark highs on R2R-CE and RxR-CE and improves real-robot reliability by supplying persistent target-location priors and stereo-derived geometry that stay stable under lighting changes and blur.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08412","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SYNCR: A Cross-Video Reasoning Benchmark with Synthetic Grounding","primary_cat":"cs.CV","submitted_at":"2026-05-08T19:20:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SYNCR benchmark shows leading MLLMs reach only 52.5% average accuracy on cross-video reasoning tasks against an 89.5% human baseline, with major weaknesses in physical and spatial reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07355","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TTF: Temporal Token Fusion for Efficient Video-Language Model","primary_cat":"cs.CV","submitted_at":"2026-05-08T07:08:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TTF fuses temporally redundant visual tokens via local similarity search in a plug-and-play way, cutting ~67% tokens on Qwen3-VL-8B while retaining 99.5% accuracy with minimal overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01662","ref_index":63,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Video Active Perception: Effective Inference-Time Long-Form Video Understanding with Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-03T01:30:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VAP is a training-free active-perception method that improves zero-shot long-form video QA performance and frame efficiency up to 5.6x in VLMs by selecting keyframes that differ from priors generated by a text-conditioned video model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23198","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"StoryTR: Narrative-Centric Video Temporal Retrieval with Theory of Mind Reasoning","primary_cat":"cs.AI","submitted_at":"2026-04-25T08:09:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"StoryTR is a new benchmark and agentic data pipeline that adds explicit Theory of Mind reasoning chains to train smaller video retrieval models, yielding a 15% relative IoU gain over larger baselines on narrative content.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16893","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"EasyVideoR1: Easier RL for Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-04-18T07:56:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"EasyVideoR1 delivers an optimized RL pipeline for video understanding in large vision-language models, achieving 1.47x throughput gains and aligned results on 22 benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14149","ref_index":80,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"One Token per Highly Selective Frame: Towards Extreme Compression for Long Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-04-15T17:59:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"XComp reaches extreme video compression (one token per selective frame) via learnable progressive token compression and question-conditioned frame selection, lifting LVBench accuracy from 42.9 percent to 46.2 percent after tuning on 2.5 percent of standard data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10060","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Mosaic: Cross-Modal Clustering for Efficient Video Understanding","primary_cat":"cs.PF","submitted_at":"2026-04-11T06:54:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Mosaic uses cross-modal clusters as the unit for KVCache organization in VLMs to achieve up to 1.38x speedup in streaming long-video understanding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08077","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AdaSpark: Adaptive Sparsity for Efficient Long-Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-04-09T10:48:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AdaSpark delivers up to 57% FLOP reduction in Video-LLMs for long videos through adaptive cube- and token-level sparsity without apparent loss in performance on hour-scale benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05079","ref_index":62,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SVAgent: Storyline-Guided Long Video Understanding via Cross-Modal Multi-Agent Collaboration","primary_cat":"cs.CV","submitted_at":"2026-04-06T18:30:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SVAgent improves long video question answering by constructing storylines via multi-agent collaboration and aligning cross-modal predictions for more robust, human-like reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.18265","ref_index":187,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","primary_cat":"cs.CV","submitted_at":"2025-08-25T17:58:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InternVL3.5 advances open-source multimodal models with Cascade RL for +16% reasoning gains and ViR for 4x inference speedup, with the 241B model reaching SOTA among open-source MLLMs on multimodal, reasoning, and agentic tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"understanding [5, 81, 140, 147, 180, 187], often comes with ever increasing computational costs, which have become a crucial bottleneck of real-world applications. In this work, we introduce InternVL3.5, an advanced family of InternVL series [13, 14, 15, 37, 79, 80, 187] with stronger capabilities in versatility, reasoning, and efficiency. Compared to InternVL3 [187], InternVL3.5 achieves superior performance through our proposed Cascade RL framework, which enhances reasoning capabilities in an efficient, scalable, and stable manner. Cascade RL consists of two complementary substages: an offline RL stage [142], which efficiently achieves satisfactory performance, and an online RL stage [183], which carefully refines the output distribution and further push the performance upper bound of the model."},{"citing_arxiv_id":"2505.07062","ref_index":179,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Seed1.5-VL Technical Report","primary_cat":"cs.CV","submitted_at":"2025-05-11T17:28:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Seed1.5-VL is a compact multimodal model that sets new records on dozens of vision-language benchmarks and outperforms prior systems on agent-style tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Gonzalez, and Ion Stoica. Judging llm-as-a-judge with mt-bench and chatbot arena, 2023. URLhttps://arxiv.org/abs/2306.05685. [178] Junjie Zhou, Yan Shu, Bo Zhao, Boya Wu, Shitao Xiao, Xi Yang, Yongping Xiong, Bo Zhang, Tiejun Huang, and Zheng Liu. Mlvu: A comprehensive benchmark for multi-task long video understanding.arXiv preprint arXiv:2406.04264, 2024. [179] Yiyang Zhou, Chenhang Cui, Rafael Rafailov, Chelsea Finn, and Huaxiu Yao. Aligning modalities in vision large language models via preference fine-tuning.arXiv preprint arXiv:2402.11411, 2024. [180] Daniel M Ziegler, Nisan Stiennon, Jeffrey Wu, Tom B Brown, Alec Radford, Dario Amodei, Paul Christiano, and Geoffrey Irving. Fine-tuning language models from human preferences."},{"citing_arxiv_id":"2504.10479","ref_index":155,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models","primary_cat":"cs.CV","submitted_at":"2025-04-14T17:59:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InternVL3-78B sets a new open-source SOTA of 72.2 on MMMU via native joint multimodal pre-training, V2PE, MPO, and test-time scaling while remaining competitive with proprietary models.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"InternVL series. 3.2 Multimodal Reasoning and Mathematics To comprehensively evaluate the multimodal reasoning and mathematical capabilities of InternVL3, we conduct experiments on a series of benchmarks, including MMMU [141] for multidisciplinary reasoning, MathVista [80], MathVision [119], MathVerse [146] for mathematical reasoning, as well as DynaMath [155], WeMath [99] and LogicVista [131] for complementary evaluation on logical reasoning. As shown in Table 2, InternVL3 exhibits strong performance across all tested benchmarks. Specifically, on the MMMU benchmark, InternVL3-based models consistently outperform smaller-scale competitors. For instance, with increasing model size, InternVL3-78B reaches a score over 72 on MMMU, indicating robust understanding"},{"citing_arxiv_id":"2504.05299","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SmolVLM: Redefining small and efficient multimodal models","primary_cat":"cs.AI","submitted_at":"2025-04-07T17:58:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SmolVLM-256M outperforms a 300-times larger model using under 1 GB GPU memory, while the 2.2B version matches state-of-the-art VLMs at half the memory cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.05236","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Unified Reward Model for Multimodal Understanding and Generation","primary_cat":"cs.CV","submitted_at":"2025-03-07T08:36:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UnifiedReward is the first unified reward model that jointly assesses multimodal understanding and generation to provide better preference signals for aligning vision models via DPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.13826","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Video-MMMU: Evaluating Knowledge Acquisition from Multi-Discipline Professional Videos","primary_cat":"cs.CV","submitted_at":"2025-01-23T16:51:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Video-MMMU benchmark shows large multimodal models exhibit steep performance drops on higher cognitive tasks when learning from professional videos and lag significantly behind humans in knowledge acquisition.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.13106","ref_index":133,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding","primary_cat":"cs.CV","submitted_at":"2025-01-22T18:59:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"VideoLLaMA3 uses a vision-centric training paradigm and token-reduction design to reach competitive results on image and video benchmarks.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"We also run evaluations on MMVU [131] which includes both the task types mentioned above. Long Video Understanding. To further examine the capacity of VideoLLaMA3 to process and comprehend long-form video content, we assess performance on three long-video understanding (LVU) benchmarks: (1) MLVU [132]: diverse long-video understanding tasks for videos ranging from 3 minutes to more than 2 hours, (2) LongVideoBench [133]: video reasoning over the referred context within long video-language interleaved inputs, and (3) LVBench [134]: extreme long video understanding. Video Temporal Reasoning. To assess the temporal awareness and reasoning capabilities of VideoLLaMA3, we conduct evaluations on the following tasks: (1) Temporal Perception and Reasoning tasks, including"},{"citing_arxiv_id":"2410.02713","ref_index":208,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LLaVA-Video: Video Instruction Tuning With Synthetic Data","primary_cat":"cs.CV","submitted_at":"2024-10-03T17:36:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLaVA-Video-178K is a new synthetic video instruction dataset that, when combined with existing data to train LLaVA-Video, produces strong results on video understanding benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.03326","ref_index":170,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LLaVA-OneVision: Easy Visual Task Transfer","primary_cat":"cs.CV","submitted_at":"2024-08-06T17:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLaVA-OneVision is the first single open LMM to simultaneously achieve strong performance in single-image, multi-image, and video scenarios with cross-scenario transfer capabilities.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"ActivityNetQA [155] Spatio-Temporal Reasoning 50.5 % 56.6 % 62.3 % 57.0 % - EgoSchema [98] Egocentric Video Understanding 26.8 % 60.1 % 62.0 % - - PerceptionTest [115] Perception and Reasoning 49.2 % 57.1 % 66.9 % - - SeedBench [66] (video) Multi-discip; Video 44.2 % 56.9 % 62.1 % 60.5 % - LongVideoBench [138] (val) Long Video 45.8 % 56.3 % 63.2 % 60.7 % 66.7 % MLVU [170] Long Video Understanding 50.3 % 64.7 % 68.0 % 49.2 % 64.6 % MVBench [71] Multi-discip 45.5 % 56.7 % 59.4 % 43.5 % - VideoChatGPT [97] Video Conversation 3.12 3.49 3.62 4.06 - VideoMME [29] Multi-discip 44.0 % 58.2 % 66.2 % 59.9 % 71.9 % Table 2: Performance comparison to state-of-the-art commercial models with our LLaV A-OneVision models (0.5B to 72B parameters) across diverse evaluation benchmarks spanning multiple modalities.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"ActivityNetQA [155] Spatio-Temporal Reasoning 50.5 % 56.6 % 62.3 % 57.0 % - EgoSchema [98] Egocentric Video Understanding 26.8 % 60.1 % 62.0 % - - PerceptionTest [115] Perception and Reasoning 49.2 % 57.1 % 66.9 % - - SeedBench [66] (video) Multi-discip; Video 44.2 % 56.9 % 62.1 % 60.5 % - LongVideoBench [138] (val) Long Video 45.8 % 56.3 % 63.2 % 60.7 % 66.7 % MLVU [170] Long Video Understanding 50.3 % 64.7 % 68.0 % 49.2 % 64.6 % MVBench [71] Multi-discip 45.5 % 56.7 % 59.4 % 43.5 % - VideoChatGPT [97] Video Conversation 3.12 3.49 3.62 4.06 - VideoMME [29] Multi-discip 44.0 % 58.2 % 66.2 % 59.9 % 71.9 % Table 2: Performance comparison to state-of-the-art commercial models with our LLaV A-OneVision models (0.5B to 72B parameters) across diverse evaluation benchmarks spanning multiple modalities."},{"citing_arxiv_id":"2406.07476","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs","primary_cat":"cs.CV","submitted_at":"2024-06-11T17:22:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"VideoLLaMA 2 improves video LLMs via a new STC connector for spatial-temporal dynamics and joint audio training, reaching competitive results on video QA and captioning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}