{"total":17,"items":[{"citing_arxiv_id":"2605.23891","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Smart-Insertion-V: Photorealistic Video Insertion via a Closed-Loop Feedback Dual-Stream Framework","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:54:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Smart-Insertion-V is a dual-stream closed-loop framework with Dual-World-View RoPE and a Decoupled Guidance Module that inserts reference objects into videos while achieving stylistic harmony despite domain gaps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22818","ref_index":72,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MotiMotion: Motion-Controlled Video Generation with Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-21T17:59:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MotiMotion adds visual reasoning via a training-free VLM to refine primary trajectories and hallucinate secondary motions, plus a confidence-aware guidance scheme, yielding more plausible interactions on the new MotiBench benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22344","ref_index":76,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Bernini: Latent Semantic Planning for Video Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-21T11:30:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Bernini is a framework that uses an MLLM planner to output semantic representations for a DiT renderer to generate or edit videos, reporting SOTA benchmark performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20795","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"What Semantics Survive the Connector? Diagnosing VLM-to-DiT Alignment in Video Editing","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:42:15+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18748","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Aurora: Unified Video Editing with a Tool-Using Agent","primary_cat":"cs.CV","submitted_at":"2026-05-18T17:59:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Aurora introduces a VLM-based agent that converts raw user video edit requests into structured conditioning inputs for a unified diffusion transformer, improving performance on underspecified tasks via a new benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18678","ref_index":120,"ref_count":4,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Lance: Unified Multimodal Modeling by Multi-Task Synergy","primary_cat":"cs.CV","submitted_at":"2026-05-18T17:18:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Lance presents a dual-stream mixture-of-experts model with modality-aware positional encoding and staged multi-task training that outperforms prior open-source unified models on image and video generation while keeping strong understanding performance.","context_count":2,"top_context_role":"background","top_context_polarity":"background","context_text":"understanding and safety-aware generation. In Proceedings of the 32nd ACM International Conference on Multimedia, pages 3907-3916, 2024. 32 [119] Cong Wei, Quande Liu, Zixuan Ye, Qiulin Wang, Xintao Wang, Pengfei Wan, Kun Gai, and Wenhu Chen. Univideo: Unified understanding, generation, and editing for videos.arXiv preprint arXiv:2510.08377, 2025. [120] Bin Wu, Mengqi Huang, Shaojin Wu, Weinan Jia, Yuxin Wang, Zhendong Mao, and Yongdong Zhang. Stream-r1: Reliability-perplexity aware reward distillation for streaming video generation.arXiv preprint arXiv:2605.03849, 2026. [121] Bing Wu, Chang Zou, Changlin Li, Duojun Huang, Fang Yang, Hao Tan, Jack Peng, Jianbing Wu, Jiangfeng Xiong, Jie Jiang, et al."},{"citing_arxiv_id":"2605.06535","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Sparkle: Realizing Lively Instruction-Guided Video Background Replacement via Decoupled Guidance","primary_cat":"cs.CV","submitted_at":"2026-05-07T16:35:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Sparkle supplies a large-scale dataset and benchmark for instruction-driven video background replacement, enabling models that generate more natural and temporally consistent new scenes than earlier approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02641","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mamoda2.5: Enhancing Unified Multimodal Model with DiT-MoE","primary_cat":"cs.CV","submitted_at":"2026-05-04T14:26:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Mamoda2.5 is a 25B-parameter DiT-MoE unified AR-Diffusion model that reaches top video generation and editing benchmarks with 4-step inference up to 95.9x faster than baselines.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"26 1.48 1.49 1.55 2.26 1.65 InsViE [52] 1.48 1.36 1.17 2.18 2.20 1.06 2.02 1.64 Lucy-Edit [53] 3.20 1.75 2.30 1.61 2.27 1.57 2.86 2.22 ICVE [54] 2.57 2.51 1.97 2.09 2.22 1.62 2.41 2.20 Ditto [35] 2.03 1.53 1.41 2.81 4.01 1.68 1.23 2.10 OpenVE-Edit [37] 2.98 1.85 2.15 2.91 3.16 2.36 2.31 2.53 Kiwi-Edit [56] 3.83 2.63 2.36 - 3.64 2.64 - 3.03* UniVideo [57] 3.86 2.48 2.87 3.13 3.47 2.47 - 3.05* OmniWeaving [58] 3.67 2.89 2.90 2.99 3.55 2.42 - 3.15* VInO [17] 3.73 3.22 2.77 2.614.342.54 3.29 3.21 Mamoda2.5 4.57 4.02 3.24 3.99 4.05 3.31 3.87 3.86 methods generally outperform most open-source methods, suggesting that current open-source video-editing datasets and data pipelines remain limited in both scale and quality."},{"citing_arxiv_id":"2604.24763","ref_index":42,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Tuna-2: Pixel Embeddings Beat Vision Encoders for Multimodal Understanding and Generation","primary_cat":"cs.CV","submitted_at":"2026-04-27T17:59:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Tuna-2 shows that direct pixel embeddings can replace vision encoders in unified multimodal models, achieving competitive generation and stronger understanding at scale.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19193","ref_index":73,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"How Far Are Video Models from True Multimodal Reasoning?","primary_cat":"cs.CV","submitted_at":"2026-04-21T08:04:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Current video models succeed on basic understanding but achieve under 25% success on logically grounded generation and near 0% on interactive generation, exposing gaps in multimodal reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16272","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VEFX-Bench: A Holistic Benchmark for Generic Video Editing and Visual Effects","primary_cat":"cs.CV","submitted_at":"2026-04-17T17:28:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VEFX-Bench releases a large human-labeled video editing dataset, a multi-dimensional reward model, and a standardized benchmark that better matches human judgments than generic evaluators.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"preserving unrelated content [11, 12, 20-23]. Early methods extended image editing pipelines to the temporal domain, typically by introducing temporal attention or consistency modules on top of text-to-image diffusion models [24, 25]. More recent approaches adopt video-native diffusion or flow-matching architectures. Represen- tative research models include VACE [11], UniVideo [12], and the broader Wan family [1, 26]. Alongside them, commercial systems such as Kling Omni, Grok Imagine, Luma Ray2, and the commercial Wan 2.6 service variant have reached practical quality levels [9, 10, 26, 27]. The resulting ecosystem is highly heterogeneous, with different systems excelling on different editing types, which makes standardized evaluation increasingly"},{"citing_arxiv_id":"2604.14556","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Controllable Video Object Insertion via Multiview Priors","primary_cat":"cs.CV","submitted_at":"2026-04-16T02:39:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A multi-view prior-based framework for video object insertion that uses dual-path conditioning and an integration-aware consistency module to improve appearance stability and occlusion handling.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"To further assess the practical utility of our pipeline, we eval- uate a variant, Ours (t2i), which represents a full object insertion workflow. Specifically, we first employ Qwen-VL-Plus [2] to gener- ate a detailed textual description of the target object by analyzing its appearance in the initial frame. This prompt is then used by the Qwen-Image-2.0-Pro model [44] to synthesize a high-quality reference image, which is subsequently lifted into 3D via Hun- yuan3D 2.0 [36]. Despite the potential domain shift introduced by the multi-stage generative process, Ours (t2i) remains highly com- petitive, achieving a Box_IoU of 0.7405 and a PSNR of 21.27. This performance exceeds several baselines that rely on ground-truth"},{"citing_arxiv_id":"2604.08646","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"InsEdit: Towards Instruction-based Visual Editing via Data-Efficient Video Diffusion Models Adaptation","primary_cat":"cs.CV","submitted_at":"2026-04-09T17:59:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InsEdit adapts a video diffusion backbone for text-instruction video editing via Mutual Context Attention, achieving SOTA open-source results with O(100K) data while also supporting image editing.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Zhang, Zhengyi Wang, An Yang, Bowen Yu, Chen Cheng, Dayiheng Liu, Deqing Li, Hang Zhang, Hao Meng, Hu Wei, Jingyuan Ni, Kai Chen, Kuan Cao, Liang Peng, Lin Qu, Minggang Wu, Peng Wang, Shuting Yu, Tingkun Wen, Wensen Feng, Xiaoxiao Xu, Yi Wang, Yichang Zhang, Yongqiang Zhu, Yujia Wu, Yuxuan Cai, and Zenan Liu. 2025. Qwen-Image Technical Report. arXiv:2508.02324 [cs.CV] https://arxiv.org/abs/2508.02324 [37] Jay Zhangjie Wu, Yixiao Ge, Xintao Wang, Stan Weixian Lei, Yuchao Gu, Yufei Shi, Wynne Hsu, Ying Shan, Xiaohu Qie, and Mike Zheng Shou. 2023. Tune-a- video: One-shot tuning of image diffusion models for text-to-video generation. InProceedings of the IEEE/CVF International Conference on Computer Vision. 7623- 7633. [38] Xiaoshi Wu, Yixuan Jiao, Wen Wang, Zhiyu Tan, Xialei Lyu, Hanyu Li, Shijie"},{"citing_arxiv_id":"2604.07958","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ImVideoEdit: Image-learning Video Editing via 2D Spatial Difference Attention Blocks","primary_cat":"cs.CV","submitted_at":"2026-04-09T08:22:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ImVideoEdit learns video editing from 13K image pairs by decoupling spatial modifications from frozen temporal dynamics in pretrained models, matching larger video-trained systems in fidelity and consistency.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[35] Carl V ondrick, Hamed Pirsiavash, and Antonio Torralba. Generating videos with scene dynamics.Advances in neu- ral information processing systems, 29, 2016. 3 [36] Team Wan, Ang Wang, Baole Ai, Bin Wen, Chaojie Mao, Chen-Wei Xie, Di Chen, Feiwu Yu, Haiming Zhao, Jianx- iao Yang, et al. Wan: Open and advanced large-scale video generative models.arXiv preprint arXiv:2503.20314, 2025. 3 [37] Cong Wei, Quande Liu, Zixuan Ye, Qiulin Wang, Xintao Wang, Pengfei Wan, Kun Gai, and Wenhu Chen. Univideo: Unified understanding, generation, and editing for videos. arXiv preprint arXiv:2510.08377, 2025. 3 [38] Chenfei Wu, Jiahao Li, Jingren Zhou, Junyang Lin, Kaiyuan Gao, Kun Yan, Sheng ming Yin, Shuai Bai, Xiao Xu, Yilei Chen, Yuxiang Chen, Zecheng Tang, Zekai Zhang, Zhengyi"},{"citing_arxiv_id":"2604.05898","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Physics-Aware Video Instance Removal Benchmark","primary_cat":"cs.CV","submitted_at":"2026-04-07T14:02:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The PVIR benchmark tests video object removal on physical consistency using 95 annotated videos and shows that existing methods struggle with complex interactions like lingering shadows.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.12370","ref_index":72,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLaMo: Scaling Pretrained Language Models for Unified Motion Understanding and Generation with Continuous Autoregressive Tokens","primary_cat":"cs.CV","submitted_at":"2026-02-12T20:02:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLaMo scales pretrained LLMs for unified motion-language tasks by encoding motion into continuous causal latents and adding a flow-matching head for real-time autoregressive generation and captioning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.07469","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VideoCoF: Unified Video Editing with Temporal Reasoner","primary_cat":"cs.CV","submitted_at":"2025-12-08T11:50:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VideoCoF adds an explicit reasoning step using edit-region latents in video diffusion models to enable precise mask-free editing and motion alignment with only 50k training pairs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}