{"total":15,"items":[{"citing_arxiv_id":"2606.01362","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AlbedoEdit: Unified Instance-Level Video Editing with Albedo Guidance","primary_cat":"cs.GR","submitted_at":"2026-05-31T17:33:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AlbedoEdit fine-tunes video foundation models to translate RGB videos into edited versions conditioned on user-edited first-frame albedo maps, trained on a new synthetic paired dataset for insertion, removal, and texture tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30045","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GenEraser: Generalizable Video Object Removal via Balanced Text-Mask Guidance and Decoupled Locator-Preserver","primary_cat":"cs.CV","submitted_at":"2026-05-28T14:58:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GenEraser proposes MC-MoE with bipartite text guidance, LD-CFG fusion, and a decoupled locator-preserver architecture for generalizable video object and effect removal, claiming 2.16 dB and 1.44 dB gains on ROSE and VOR-Eval benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22344","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Bernini: Latent Semantic Planning for Video Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-21T11:30:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Bernini is a framework that uses an MLLM planner to output semantic representations for a DiT renderer to generate or edit videos, reporting SOTA benchmark performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15843","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"WorldAct: Activating Monolithic 3D Worlds into Interactive-Ready Object-Centric Scenes","primary_cat":"cs.CV","submitted_at":"2026-05-15T10:56:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"WorldAct activates monolithic 3D worlds into interactive scenes via multimodal agent-guided decomposition, geometrically aligned mesh reconstruction, and 3D inpainting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14534","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PROVE: A Perceptual RemOVal cohErence Benchmark for Visual Media","primary_cat":"cs.CV","submitted_at":"2026-05-14T08:16:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PROVE proposes RC metrics for perceptual removal coherence and releases PROVE-Bench to better align automatic scores with human judgments on object removal tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09897","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Tube-Structured Incremental Semantic HARQ for Generative Video Receivers","primary_cat":"eess.IV","submitted_at":"2026-05-11T02:38:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Tube-structured incremental semantic HARQ reduces time-weighted recovery cost and enables earlier stabilization in generative video reconstruction compared to block-based methods under matched budgets and channel conditions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06535","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Sparkle: Realizing Lively Instruction-Guided Video Background Replacement via Decoupled Guidance","primary_cat":"cs.CV","submitted_at":"2026-05-07T16:35:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Sparkle supplies a large-scale dataset and benchmark for instruction-driven video background replacement, enabling models that generate more natural and temporally consistent new scenes than earlier approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27322","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"YOSE: You Only Select Essential Tokens for Efficient DiT-based Video Object Removal","primary_cat":"cs.CV","submitted_at":"2026-04-30T02:08:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"YOSE accelerates DiT video object removal up to 2.5x by using BVI for adaptive token selection and DiffSim to simulate unmasked token effects, while preserving visual quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08546","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When Numbers Speak: Aligning Textual Numerals and Visual Instances in Text-to-Video Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-04-09T17:59:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NUMINA improves counting accuracy in text-to-video diffusion models by up to 7.4% via a training-free identify-then-guide framework on the new CountBench dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05898","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Physics-Aware Video Instance Removal Benchmark","primary_cat":"cs.CV","submitted_at":"2026-04-07T14:02:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The PVIR benchmark tests video object removal on physical consistency using 95 annotated videos and shows that existing methods struggle with complex interactions like lingering shadows.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04331","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GA-GS: Generation-Assisted Gaussian Splatting for Static Scene Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-04-06T00:47:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GA-GS uses motion segmentation, diffusion-based inpainting for pseudo-ground-truth, and per-Gaussian authenticity scalars to achieve SOTA static scene reconstruction from videos with dynamic occlusions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.21901","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CLEAR: Context-Aware Learning with End-to-End Mask-Free Inference for Adaptive Video Subtitle Removal","primary_cat":"cs.CV","submitted_at":"2026-03-23T12:23:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CLEAR achieves end-to-end mask-free video subtitle removal via dual-encoder self-supervised orthogonality and LoRA-based generation feedback, delivering +6.77 dB PSNR gains and strong zero-shot multilingual performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.09283","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Ideal to Real: Stable Video Object Removal under Imperfect Conditions","primary_cat":"cs.CV","submitted_at":"2026-03-10T07:07:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SVOR achieves stable, shadow-free video object removal under real-world imperfections via MUSE mask handling, DA-Seg localization, and curriculum training on real and synthetic data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.07469","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VideoCoF: Unified Video Editing with Temporal Reasoner","primary_cat":"cs.CV","submitted_at":"2025-12-08T11:50:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VideoCoF adds an explicit reasoning step using edit-region latents in video diffusion models to enable precise mask-free editing and motion alignment with only 50k training pairs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.20360","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EditVerse: Unifying Image and Video Editing and Generation with In-Context Learning","primary_cat":"cs.CV","submitted_at":"2025-09-24T17:59:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EditVerse unifies image and video editing and generation in one transformer model via unified token sequences and in-context learning, trained jointly on curated video editing data plus image/video corpora and evaluated on a new instruction-based benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}