{"total":13,"items":[{"citing_arxiv_id":"2605.19950","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AffectVerse: Emotional World Models for Multimodal Affective Computing","primary_cat":"cs.CV","submitted_at":"2026-05-19T15:05:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AffectVerse improves multimodal emotion recognition by at least 2.57% on nine benchmarks through an Emotion World Module that performs short-horizon latent affective prediction via cross-modal temporal imagination and belief aggregation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16403","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When Vision Speaks for Sound","primary_cat":"cs.CV","submitted_at":"2026-05-13T05:00:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Video MLLMs show an audio-visual Clever Hans effect relying on visual-acoustic correlations rather than audio verification; Thud interventions diagnose it and a 10K-sample preference alignment improves intervention performance by 28 points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09749","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"See Fair, Speak Truth: Equitable Attention Improves Grounding and Reduces Hallucination in Vision-Language Alignment","primary_cat":"cs.CV","submitted_at":"2026-04-10T11:01:48+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Equitable attention via Dominant Object Penalty and Outlier Boost Coefficient reduces object hallucinations in multimodal LLMs without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.05067","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLaVA-Octopus: Unlocking Instruction-Driven Adaptive Projector Fusion for Video Understanding","primary_cat":"cs.CV","submitted_at":"2025-01-09T08:43:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLaVA-Octopus introduces instruction-driven adaptive fusion of multiple visual projectors in a multimodal LLM to improve video understanding performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2411.02327","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PPLLaVA: Varied Video Sequence Understanding With Prompt Guidance","primary_cat":"cs.CV","submitted_at":"2024-11-04T17:50:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PPLLaVA uses CLIP-based alignment and prompt-guided convolution-style pooling to reduce visual tokens 18x in Video LLMs, achieving SOTA results on captioning, QA, and long-form reasoning benchmarks with higher throughput.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.17434","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding","primary_cat":"cs.CV","submitted_at":"2024-10-22T21:21:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LongVU adaptively compresses long video tokens using DINOv2-based frame deduplication, text-guided cross-modal selection, and temporal spatial reduction to improve video-language understanding in MLLMs with minimal detail loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.10188","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LongVILA: Scaling Long-Context Visual Language Models for Long Videos","primary_cat":"cs.CV","submitted_at":"2024-08-19T17:48:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LongVILA scales visual-language models from 8 to 2048 video frames with 99.8% needle-in-a-haystack accuracy using long-context extension, supervised fine-tuning, and multi-modal sequence parallelism on up to 256 GPUs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2407.03320","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"InternLM-XComposer-2.5: A Versatile Large Vision Language Model Supporting Long-Contextual Input and Output","primary_cat":"cs.CV","submitted_at":"2024-07-03T17:59:21+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"InternLM-XComposer-2.5 is a 7B vision-language model supporting up to 96K context that reaches GPT-4V-level performance on image, video, and multi-turn tasks and adds LoRA-driven text-image composition capabilities.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Table 1. Datasets used for Pre-Training. The data are collected from diverse sources for the three objectives. Task Dataset Caption ShareGPT4V [17], COCO [21], Nocaps [1] General QA VQAv2 [4], GQA [53], OK-VQA [105] VD [32], RD [16], VSR [81], ALLaV A-QA [15] Multi-Turn QA MMDU [92] Science QA AI2D [61], SQA [98], TQA [62], IconQA [97] Chart QA DVQA [58], ChartQA [106], ChartQA-AUG [106] Math QA MathQA [161], Geometry3K [96], TabMWP [99], CLEVR-MATH [80], Super [75] World Knowledge QA A-OKVQA [127], KVQA [128], ViQuAE [65] OCR QA TextVQA [133], OCR-VQA [109], ST-VQA [11] HD-OCR QA InfoVQA[108], DocVQA [107], TabFact [20], WTQ [117], DeepForm [139], Visual MRC [140] Video ShareGPT4Video [19], ActivityNet [37]"},{"citing_arxiv_id":"2406.16852","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Long Context Transfer from Language to Vision","primary_cat":"cs.CV","submitted_at":"2024-06-24T17:58:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Extending language model context length enables LMMs to process over 200K visual tokens from long videos without video training, achieving SOTA on Video-MME via dense frame sampling.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"CV] 1 Jul 2024 Model Tokens/Frames * Training Max Frames* LM Backbone LM Context Length MPLUG-Owl-video [82] 256 4 LLaMA 4K MovieChat [64] 32 8 Vicuna-v0 2K Video-LLaV A [87] 49 8 Vicuna-1.5 4K VideoChat [39] 32 /196 8 Vicuna-v0 2K LLaV A-NeXT-Video [89] 144 16 Vicuna-1.5 4K ST-LLM [49] 256 16 Vicuna-1.1 2K Video-LLaMA [15] 32 32 LLaMA-2 4K Chat-UniVi [30] 112 64 Vicuna-1.5 4K TimeChat [60] 4 96 LLaMA-2 4K Video-ChatGPT [50] 256 100 Vicuna-1.1 2K LLaMA-VID [41] 2 300 Vicuna-1.5 4K LongV A (Ours) 144 - Qwen2-Extended 224K+ Table 1: To enable longer video inputs, previous works train fewer visual tokens to increase the maximum frames during training. Our LongV A, on the other hand, enables long video capability by"},{"citing_arxiv_id":"2406.07476","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs","primary_cat":"cs.CV","submitted_at":"2024-06-11T17:22:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"VideoLLaMA 2 improves video LLMs via a new STC connector for spatial-temporal dynamics and joint audio training, reaching competitive results on video QA and captioning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.16994","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning","primary_cat":"cs.CV","submitted_at":"2024-04-25T19:29:55+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A temporal pooling layer added to LLaVA smooths video feature distributions and lifts performance on dense video captioning and QA to new SOTA levels without extra parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.00476","ref_index":91,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TempCompass: Do Video LLMs Really Understand Videos?","primary_cat":"cs.CV","submitted_at":"2024-03-01T12:02:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TempCompass benchmark reveals that state-of-the-art Video LLMs have poor ability to perceive temporal aspects such as speed, direction, and ordering in videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2311.10122","ref_index":109,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Video-LLaVA: Learning United Visual Representation by Alignment Before Projection","primary_cat":"cs.CV","submitted_at":"2023-11-16T10:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Video-LLaVA creates a unified visual representation for images and videos via pre-projection alignment, enabling mutual enhancement from joint training and strong results on image and video benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}