{"total":33,"items":[{"citing_arxiv_id":"2605.13375","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GRIP-VLM: Group-Relative Importance Pruning for Efficient Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-13T11:32:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GRIP-VLM applies group-relative policy optimization via reinforcement learning to prune visual tokens in VLMs, yielding up to 15% inference speedup at matched accuracy over prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10622","ref_index":70,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Vocabulary Hijacking in LVLMs: Unveiling Critical Attention Heads by Excluding Inert Tokens to Mitigate Hallucination","primary_cat":"cs.MM","submitted_at":"2026-05-11T14:16:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LVLMs show vocabulary hijacking by inert tokens that decode to hijacking anchors; HABI locates them, NHAR finds resilient heads, and HAVAE boosts those heads to cut hallucinations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08816","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Mirror, Mirror on the Wall: Can VLM Agents Tell Who They Are at All?","primary_cat":"cs.AI","submitted_at":"2026-05-09T09:10:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Stronger VLM agents use mirror reflections for self-identification in controlled 3D tests, while weaker ones inspect but fail to extract or correctly attribute self-relevant information.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07817","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GazeVLM: Active Vision via Internal Attention Control for Multimodal Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-08T14:49:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GazeVLM introduces internal gaze tokens that allow VLMs to dynamically suppress irrelevant visual features and simulate foveal attention for improved high-resolution multimodal reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05899","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VisMMOE: Exploiting Visual-Expert Affinity for Efficient Visual-Language MoE Offloading","primary_cat":"cs.LG","submitted_at":"2026-05-07T09:11:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VisMMoE exploits visual-expert affinity via token pruning to achieve up to 2.68x faster VL-MoE inference on memory-constrained hardware while keeping accuracy competitive.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05810","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CXR-ContraBench: Benchmarking Negated-Option Attraction in Medical 
VLMs","primary_cat":"cs.CV","submitted_at":"2026-05-07T07:46:17+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Medical VLMs frequently select negated options that contradict visible chest X-ray findings, achieving only ~30% accuracy on direct presence probes, but a post-hoc consistency verifier raises accuracy above 95%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25072","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond Accuracy: Benchmarking Cross-Task Consistency in Unified Multimodal Models","primary_cat":"cs.CV","submitted_at":"2026-04-27T23:57:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"XTC-Bench reveals that strong performance on generation or understanding tasks in unified multimodal models does not guarantee cross-task semantic consistency, which instead depends on how tightly coupled the learning objectives are across modalities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23813","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ShredBench: Evaluating the Semantic Reasoning Capabilities of Multimodal LLMs in Document Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-04-26T17:26:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ShredBench shows state-of-the-art MLLMs perform well on intact documents but suffer sharp drops in restoration accuracy as fragmentation increases to 8-16 pieces, indicating insufficient cross-modal semantic reasoning for VRDU.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21523","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Seeing Isn't Believing: Uncovering Blind Spots in Evaluator Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-23T10:36:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Evaluator VLMs frequently fail to detect quality-degrading perturbations in I2T and T2I outputs, with failure rates exceeding 50% in some cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18347","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Multilingual Training and Evaluation Resources for Vision-Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-20T14:42:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Releases regenerated multilingual training data and translated benchmarks for VLMs in five languages and demonstrates consistent benefits from multilingual training over English-only baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05225","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MACS: Modality-Aware Capacity Scaling for Efficient Multimodal MoE 
Inference","primary_cat":"cs.LG","submitted_at":"2026-04-19T07:25:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MACS improves inference speed in multimodal MoE models by entropy-weighted balancing of visual tokens and real-time modality-adaptive expert capacity allocation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12213","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Modality-Native Routing in Agent-to-Agent Networks: A Multimodal A2A Protocol Extension","primary_cat":"cs.AI","submitted_at":"2026-04-14T02:44:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Modality-native routing in A2A networks raises task accuracy from 32% to 52% over text-bottleneck baselines on a 50-task benchmark, but only when paired with capable downstream reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09749","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"See Fair, Speak Truth: Equitable Attention Improves Grounding and Reduces Hallucination in Vision-Language Alignment","primary_cat":"cs.CV","submitted_at":"2026-04-10T11:01:48+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Equitable attention via Dominant Object Penalty and Outlier Boost Coefficient reduces object hallucinations in multimodal LLMs without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05265","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Semantic Reality: Interactive Context-Aware Visualization of Inter-Object Relationships in Augmented Reality","primary_cat":"cs.HC","submitted_at":"2026-04-06T23:57:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Semantic Reality maintains a persistent connectivity graph of objects in AR via multimodal reasoning and action recognition, then visualizes relationships to aid understanding and task guidance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04780","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CLEAR: Unlocking Generative Potential for Degraded Image Understanding in Unified Multimodal Models","primary_cat":"cs.CV","submitted_at":"2026-04-06T15:54:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CLEAR uses degradation-aware fine-tuning, a latent representation bridge, and interleaved reinforcement learning to connect generative and reasoning capabilities in multimodal models for better degraded image understanding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.27259","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Seeing the Scene Matters: Revealing Forgetting in Video Understanding Models with a Scene-Aware Long-Video 
Benchmark","primary_cat":"cs.CV","submitted_at":"2026-03-28T12:44:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SceneBench shows VLMs lose accuracy on scene-level questions in long videos due to forgetting, and Scene-RAG retrieval improves performance by 2.5%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.18265","ref_index":71,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","primary_cat":"cs.CV","submitted_at":"2025-08-25T17:58:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InternVL3.5 advances open-source multimodal models with Cascade RL for +16% reasoning gains and ViR for 4x inference speedup, with the 241B model reaching SOTA among open-source MLLMs on multimodal, reasoning, and agentic tasks.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"9 65.9 65.3 65.2 64.4 63.6 62.2 60.8 59.4 58.8 56.9 56.5 55.0 53.8 50.9 50.7 47.3 46.4 44.9 42.7 Figure 1: Comparison between InternVL3.5 and leading MLLMs in general capabilities. Hatched bars represent closed-source commercial models. We report average scores on a set of multimodal general, reasoning, text, and agentic benchmarks: MMBench v1.1 (en) [ 71], MMStar [11], BLINK [36], HallusionBench [41], AI2D [55], OCRBench [72], MMVet [168], MME-RealWorld (en) [178], MVBench [63], VideoMME [35], MMMU [170], MathVista [76], MathVision [134], MathVerse [175], DynaMath [189], WeMath [100], Log- icVista [153], MATH500 [45], AIME24 [84], AIME25 [85], GPQA [106], MMLU-Pro [146], GAOKAO [177], IFEval [185], SGP-Bench [102], VSI-Bench [161], ERQA [121], SpaCE-10 [38], and OmniSpatial [50]."},{"citing_arxiv_id":"2507.01006","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GLM-4.5V and GLM-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning","primary_cat":"cs.CV","submitted_at":"2025-07-01T17:55:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GLM-4.5V reaches state-of-the-art results on 42 multimodal benchmarks among open-source models of similar size by applying reinforcement learning with curriculum sampling to a strong vision foundation model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.09568","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"BLIP3-o: A Family of Fully Open Unified Multimodal Models-Architecture, Training and Dataset","primary_cat":"cs.CV","submitted_at":"2025-05-14T17:11:07+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BLIP3-o uses a diffusion transformer to generate CLIP image features and a sequential pretraining strategy to build open models that perform strongly on both image understanding and generation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.10479","ref_index":75,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"InternVL3: Exploring Advanced Training and Test-Time Recipes for 
Open-Source Multimodal Models","primary_cat":"cs.CV","submitted_at":"2025-04-14T17:59:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InternVL3-78B sets a new open-source SOTA of 72.2 on MMMU via native joint multimodal pre-training, V2PE, MPO, and test-time scaling while remaining competitive with proprietary models.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"83 77.6 89.0 56.4 InternVL2.5-78B [18] 2494.5 88.3 / 88.5 87.4 72.3 65.5 69.5 79.2 57.4 3.89 78.8 90.8 57.7 InternVL3-78B 2549.8 89.0 / 88.7 87.7 81.3 70.0 72.5 82.0 59.1 3.85 79.2 90.3 58.1 Table 5: Comparison of comprehensive multimodal understanding and hallucination performance. Com- prehensive multimodal benchmarks include MME [37], MMBench series [75], MMVet series [138, 139], and MMStar [13]. Hallucination benchmarks encompass HallusionBench [ 45], MMHal [111], CRPE [126], and POPE [67]. Part of the results are sourced from the benchmark papers and the OpenCompass leaderboard [26]. 3.6 Comprehensive Multimodal Evaluation The comprehensive multimodal evaluation is based on established benchmarks including MME [37], MMBench"},{"citing_arxiv_id":"2501.17811","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Janus-Pro: Unified Multimodal Understanding and Generation with Data and Model Scaling","primary_cat":"cs.AI","submitted_at":"2025-01-29T18:00:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Scaling data, model size, and training optimization on the Janus architecture yields better multimodal understanding and more stable, instruction-following text-to-image generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.05271","ref_index":156,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","primary_cat":"cs.CV","submitted_at":"2024-12-06T18:57:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InternVL 2.5 is the first open-source MLLM to surpass 70% on the MMMU benchmark via model, data, and test-time scaling, with a 3.7-point gain from chain-of-thought reasoning.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"MME[ 68]: MME is the first comprehensive evaluation benchmark designed for MLLMs. It assesses models' perception and cognitive abilities across 14 subtasks, including object presence, counting, position, color recognition, as well as commonsense reasoning, numerical computation, text translation, and code reasoning. We report the overall score across all tasks. MMBench[ 156]: MMBench evaluates the multimodal understanding of MLLMs through nearly 3,000 multiple- choice questions spanning 20 dimensions. It supports both English and Chinese versions, and we present the model's performance scores on the test set. 
MMBench v1.1[ 156]: Compared to MMBench, MMBench v1.1 features a refined dataset with a small number of noisy or low-quality questions removed, resulting in a subtle improvement in overall data quality."},{"citing_arxiv_id":"2409.18869","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Emu3: Next-Token Prediction is All You Need","primary_cat":"cs.CV","submitted_at":"2024-09-27T16:06:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Emu3 shows that next-token prediction on a unified discrete token space for text, images, and video lets a single transformer outperform task-specific models such as SDXL and LLaVA-1.6 in multimodal generation and perception.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2409.02813","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding Benchmark","primary_cat":"cs.CL","submitted_at":"2024-09-04T15:31:26+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"MMMU-Pro is a stricter multimodal benchmark that removes text-only solvable questions, augments options, and requires reading text from images, yielding substantially lower model scores of 16.8-26.9%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.01800","ref_index":63,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MiniCPM-V: A GPT-4V Level MLLM on Your Phone","primary_cat":"cs.CV","submitted_at":"2024-08-03T15:02:21+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MiniCPM-Llama3-V 2.5 delivers GPT-4V-level multimodal performance on phones through architecture, pretraining, and alignment optimizations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.04264","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MLVU: Benchmarking Multi-task Long Video Understanding","primary_cat":"cs.CV","submitted_at":"2024-06-06T17:09:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MLVU is a new benchmark for long video understanding that uses extended videos across diverse genres and multi-task evaluations, revealing that current MLLMs struggle significantly and degrade sharply with longer durations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.16821","ref_index":66,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites","primary_cat":"cs.CV","submitted_at":"2024-04-25T17:59:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"InternVL 1.5 narrows the performance gap to proprietary multimodal models via a stronger transferable vision encoder, dynamic high-resolution tiling, and curated English-Chinese training data.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"2 / 82.0 69.8 62.8 76.0 49.3 53.5 Table 2. 
Comparison with SoTA models on 16 multimodal benchmarks. OCR-related benchmarks include: DocVQA test [82], ChartQA test [81], InfographicVQA test [83], TextVQA val [100], and OCRBench [67]. General multimodal benchmarks encompass: MME [26], RealWorldQA [125], AI2D test [39], MMMU val [135], MMBench-EN/CN test [66], CCBench dev [66], MMVet [133], SEED Image [46], and HallusionBench [30]. Additionally, the math dataset includes MathVista testmini [75]. * denotes that Rosetta OCR tokens are used in the testing of TextVQA. The MME results we report are the sum of the perception and cognition scores. The results of OCRBench, MMBench, CCBench, and HallusionBench are collected from the OpenCompass leaderboard [21]."},{"citing_arxiv_id":"2403.20330","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Are We on the Right Way for Evaluating Large Vision-Language Models?","primary_cat":"cs.CV","submitted_at":"2024-03-29T17:59:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Current LVLM benchmarks overestimate capabilities because many questions can be answered without images due to design flaws or data leakage; MMStar is a human-curated set of 1,500 vision-indispensable samples across 6 capabilities and 18 axes with new metrics for leakage and true multi-modal gain.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2311.12793","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ShareGPT4V: Improving Large Multi-Modal Models with Better Captions","primary_cat":"cs.CV","submitted_at":"2023-11-21T18:58:11+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A new 1.2M-caption dataset generated via GPT-4V improves LMMs on MME and MMBench by 222.8/22.0/22.3 and 2.7/1.3/1.5 points respectively when used for supervised fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2311.10122","ref_index":93,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Video-LLaVA: Learning United Visual Representation by Alignment Before Projection","primary_cat":"cs.CV","submitted_at":"2023-11-16T10:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Video-LLaVA creates a unified visual representation for images and videos via pre-projection alignment, enabling mutual enhancement from joint training and strong results on image and video benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.03744","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Improved Baselines with Visual Instruction Tuning","primary_cat":"cs.CV","submitted_at":"2023-10-05T17:59:56+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Simple changes to LLaVA using CLIP-ViT-L-336px, an MLP connector, and academic VQA data yield state-of-the-art results on 11 benchmarks with only 1.2M public examples and one-day training on 8 A100 
GPUs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2307.16125","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension","primary_cat":"cs.CL","submitted_at":"2023-07-30T04:25:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SEED-Bench is a new benchmark of 19K multiple-choice questions for evaluating generative comprehension in multimodal LLMs across 12 image and video dimensions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2306.13394","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2023-06-23T09:22:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MME is a manually annotated benchmark evaluating MLLMs on perception and cognition across 14 subtasks to avoid data leakage and support fair model comparisons.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}