{"total":14,"items":[{"citing_arxiv_id":"2605.26104","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EVIDENT: Routing MLLM Adaptation through Entity-Grounded Visual Evidence for Cross-Domain Video Temporal Grounding","primary_cat":"cs.CV","submitted_at":"2026-05-25T17:58:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EVIDENT routes MLLM adaptation for video temporal grounding through entity-grounded visual evidence using an Entity Bottleneck Adapter, Entity-Binding Distillation, and Entity-to-eVidence gating to improve cross-domain robustness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21973","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Foresee-to-Ground: From Predictive Temporal Perception to Evidence-Driven Reasoning for Video Temporal Grounding","primary_cat":"cs.CV","submitted_at":"2026-05-21T04:03:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"F2G improves video temporal grounding accuracy by decoupling event identification from boundary measurement using predictive temporal perception to create citable evidence segments for LLM reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13803","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EvoGround: Self-Evolving Video Agents for Video Temporal Grounding","primary_cat":"cs.CV","submitted_at":"2026-05-13T17:25:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A proposer-solver agent pair achieves supervised-level video temporal grounding and fine-grained captioning from 2.5K unlabeled videos via self-reinforcing evolution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25886","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MarkIt: Training-Free Visual Markers for Precise Video Temporal Grounding","primary_cat":"cs.MM","submitted_at":"2026-04-28T17:29:19+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25276","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OmniVTG: A Large-Scale Dataset and Training Paradigm for Open-World Video Temporal Grounding","primary_cat":"cs.CV","submitted_at":"2026-04-28T06:34:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniVTG creates a new large-scale open-world VTG dataset using iterative concept-gap filling and timestamped captioning, paired with a three-stage self-correction CoT paradigm that yields SOTA zero-shot results on four existing benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12148","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ViLL-E: Video LLM Embeddings for Retrieval","primary_cat":"cs.CV","submitted_at":"2026-04-13T23:54:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ViLL-E introduces a dynamic embedding mechanism and joint contrastive-generative training for VideoLLMs, delivering up to 7% gains in temporal localization and 4% in video retrieval while enabling new zero-shot capabilities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11283","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Multimodal Large Language Model-Enabled Video Translation: A Role-Oriented Survey","primary_cat":"cs.CV","submitted_at":"2026-04-13T10:42:31+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"RED-VILLM [22], InternVideo2 [23], Otter [24], VLog [25], TimeBlindness [26], Time-R1 [27] Spatio-Temporal Modeling MA-LMM [28], MovieLLM [29] , MovieChat [30], LongVLM [31], VideoStreaming [32], VideoLLM [33], VideoLLM-online [34], Vriptor [35], LLoVi [36], TimeChat [37], Momentor [38], LITA [39], SeViLA [40], VTG-LLM [41], VTimeLLM [42], HawkEye [43], Chat-UniVi [44], VideoGPT+ [45], ST-LLM [46], Slot-VLM [47], LSTP [48], OmniViD [49],Vid2Seq [50], DrVideo [51], ViLAMP [52], AKS [53], MCiT [54] Training Paradigms Video-LLaMA 2 [55], LLaMA-Adapter [19], AudioVisual [56], AVicuna [57], , SEAMLESSM4T [58], Au-HuBERT [59], Artemis [60], PLLaVA [61], PG-Video-LLaVA [62], GroundingGPT [63], Vidi [64], REEF [65], Video-xl [66]"},{"citing_arxiv_id":"2604.08966","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"How Should Video LLMs Output Time? An Analysis of Efficient Temporal Grounding Paradigms","primary_cat":"cs.CV","submitted_at":"2026-04-10T05:10:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A controlled study on compact video LLMs finds that continuous temporal decoding delivers the strongest accuracy-efficiency trade-off for video temporal grounding across three benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08522","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"UniversalVTG: A Universal and Lightweight Foundation Model for Video Temporal Grounding","primary_cat":"cs.CV","submitted_at":"2026-04-09T17:57:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniversalVTG is a lightweight foundation model for video temporal grounding that achieves state-of-the-art results across five benchmarks while being over 100 times smaller than recent MLLM-based methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08014","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Bridging Time and Space: Decoupled Spatio-Temporal Alignment for Video Grounding","primary_cat":"cs.CV","submitted_at":"2026-04-09T09:14:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Bridge-STG decouples spatio-temporal alignment via semantic bridging and query-guided localization modules to achieve state-of-the-art m_vIoU of 34.3 on VidSTG among MLLM methods.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"57, 61] have shifted toward an integrated encoder-decoder archi- tecture, bypassing the dependency on external detection modules. Within this unified paradigm, the encoder is responsible for in- tegrating multimodal cues from videos and text, and the decoder directly regresses the target's spatio-temporal coordinates, leading to enhanced performance. CG-STVG [13] and TubeDETR [57] em- ploy zero-initialized object queries, which lack target-specific cues and thus struggle to learn discriminative target information from Bridging Time and Space: Decoupled Spatio-Temporal Alignment for Video Grounding Arxiv, April, 2026 Figure 2: Overall architecture of Bridge-STG. The model first predicts the event's temporal window with the ETA strategy."},{"citing_arxiv_id":"2604.02860","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Paradigm Shift: Fully End-to-End Training for Temporal Sentence Grounding in Videos","primary_cat":"cs.CV","submitted_at":"2026-04-03T08:26:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Fully end-to-end training with a sentence-conditioned adapter outperforms frozen-backbone baselines for localizing video segments that match sentence queries.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"These Table 1. Comparison of state-of-the-art methods on Charades-STA and ActivityNet Captions dataset. Backbone Methods Charades-STA ActivityNet-Captions Rank1@ IoU0.5 Rank1@ IoU0.7 Rank5@ IoU0.5 Rank5@ IoU0.7 mIoU Rank1@ IoU0.5 Rank1@ IoU0.7 Rank5@ IoU0.5 Rank5@ IoU0.7 mIoU LLM-based Momentor [48] 26.60 11.60 - - 28.50 23.00 12.40 - - 29.30 HawkEye [60] 31.40 14.50 - - 33.70 29.30 10.70 - - 32.70 VideoExpert [78] 40.30 20.90 - - 41.10 - - - TRACE [18] 40.30 19.40 - - - 37.70 24.00 - - 39.00 D2VLM [71] 50.30 26.00 - - - - - - - - C3D MS-2D-TAN [76] 41.10 23.25 81.53 48.55 - 46.16 29.21 78.80 60.85 - APGN [29] 48.20 29.37 89.05 58.49 - - - - - - CPN [80] 46.08 26.05 - - 43.90 45.10 28.10 - - 45.70"},{"citing_arxiv_id":"2512.06673","ref_index":63,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Detector-Empowered Video Large Language Model for Efficient Spatio-Temporal Grounding","primary_cat":"cs.CV","submitted_at":"2025-12-07T06:11:15+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DEViL offloads spatial grounding to a detector via a distilled reference-semantic token and temporal consistency regularization, reaching 43.1% m_vIoU at 14.33 FPS on HC-STVG.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.03963","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TempR1: Improving Temporal Understanding of MLLMs via Temporal-Aware Multi-Task Reinforcement Learning","primary_cat":"cs.CV","submitted_at":"2025-12-03T16:57:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TempR1 applies temporal-aware multi-task RL using GRPO and three types of localization rewards to achieve SOTA temporal understanding in MLLMs with synergistic gains from joint optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.12386","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"InternVideo2.5: Empowering Video MLLMs with Long and Rich Context Modeling","primary_cat":"cs.CV","submitted_at":"2025-01-21T18:59:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"InternVideo2.5 improves video MLLMs by incorporating dense vision task annotations via direct preference optimization and compact spatiotemporal representations via adaptive hierarchical token compression, yielding better benchmark performance, 6x longer video memory, and new capabilities likeobject","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}