{"total":11,"items":[{"citing_arxiv_id":"2605.14291","ref_index":55,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"To See is Not to Learn: Protecting Multimodal Data from Unauthorized Fine-Tuning of Large Vision-Language Model","primary_cat":"cs.CR","submitted_at":"2026-05-14T02:49:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MMGuard generates unlearnable multimodal examples via perturbations that exploit LVLM optimization shortcuts and disrupt cross-modal bindings, providing robust protection against unauthorized fine-tuning across threat models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10622","ref_index":68,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Vocabulary Hijacking in LVLMs: Unveiling Critical Attention Heads by Excluding Inert Tokens to Mitigate Hallucination","primary_cat":"cs.MM","submitted_at":"2026-05-11T14:16:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LVLMs show vocabulary hijacking by inert tokens that decode to hijacking anchors; HABI locates them, NHAR finds resilient heads, and HAVAE boosts those heads to cut hallucinations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06339","ref_index":29,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"A Regime Theory of Controller Class Selection for LLM Action Decisions","primary_cat":"cs.AI","submitted_at":"2026-05-07T14:28:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A regime theory selects the optimal controller class for LLM action decisions from a nested lattice of four classes using three data-estimable bottlenecks, with a Bernstein-tight threshold and empirical matches on multiple benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05668","ref_index":92,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Large Vision-Language Models Get Lost in Attention","primary_cat":"cs.AI","submitted_at":"2026-05-07T04:45:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"In LVLMs, attention can be replaced by random Gaussian weights with little or no performance loss, indicating that current models get lost in attention rather than efficiently using visual context.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19503","ref_index":29,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"ReaLB: Real-Time Load Balancing for Multimodal MoE Inference","primary_cat":"cs.DC","submitted_at":"2026-04-21T14:22:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ReaLB balances multimodal MoE inference loads by switching vision-heavy experts to lower FP4 precision per device rank, hiding the change in the dispatch phase to deliver 1.10-1.32x speedup with <1% accuracy degradation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05225","ref_index":27,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"MACS: Modality-Aware Capacity Scaling for Efficient Multimodal MoE Inference","primary_cat":"cs.LG","submitted_at":"2026-04-19T07:25:39+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04075","ref_index":4,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"RetentiveKV: State-Space Memory for Uncertainty-Aware Multimodal KV Cache Eviction","primary_cat":"cs.LG","submitted_at":"2026-04-14T08:17:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RetentiveKV uses entropy to drive state-space model transitions that retain and reactivate low-attention visual tokens in a continuous memory instead of pruning them, delivering 5x KV cache compression and 1.5x faster decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.04840","ref_index":96,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal Large Language Models","primary_cat":"cs.CV","submitted_at":"2024-08-09T03:25:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"mPLUG-Owl3 introduces hyper attention blocks to integrate vision and language for long image-sequence understanding and reports SOTA results on single-image, multi-image, and video benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.09353","ref_index":37,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"DoRA: Weight-Decomposed Low-Rank Adaptation","primary_cat":"cs.CL","submitted_at":"2024-02-14T17:59:34+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DoRA improves LoRA by decomposing weights into magnitude and direction and updating only direction with low-rank matrices, closing much of the gap to full fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.03568","ref_index":27,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Agent AI: Surveying the Horizons of Multimodal Interaction","primary_cat":"cs.AI","submitted_at":"2024-01-07T19:11:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper defines Agent AI as interactive multimodal systems that perceive grounded data and generate embodied actions, arguing this approach can mitigate hallucinations in foundation models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2311.10122","ref_index":91,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Video-LLaVA: Learning United Visual Representation by Alignment Before Projection","primary_cat":"cs.CV","submitted_at":"2023-11-16T10:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Video-LLaVA creates a unified visual representation for images and videos via pre-projection alignment, enabling mutual enhancement from joint training and strong results on image and video benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}