{"total":30,"items":[{"citing_arxiv_id":"2606.31734","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MemLearner: Learning to Query Context memory for Video World Models","primary_cat":"cs.CV","submitted_at":"2026-06-30T14:31:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MemLearner introduces a learning-based adaptive context query method using query tokens in video world models to improve long-term scene consistency over rule-based retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18610","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SC3-Eval: Evaluating Robot Foundation Models via Self-Consistent Video Generation","primary_cat":"cs.RO","submitted_at":"2026-06-17T02:15:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SC3-Eval enforces three consistencies on a video model to produce policy rollouts that correlate 0.929 with real-world performance across seven vision-language-action policies and reproduce observed failure modes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09828","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Latent Spatial Memory for Video World Models","primary_cat":"cs.CV","submitted_at":"2026-06-08T17:59:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mirage stores and queries 3D scene information in diffusion latent space via depth-guided lifting and warping, yielding 10.57× faster generation and 55× smaller memory than explicit RGB point-cloud baselines while reaching SOTA on WorldScore.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09056","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MilliVid: Hierarchical Latents for Long-Range Consistency in Video Generation","primary_cat":"cs.CV","submitted_at":"2026-06-08T05:46:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MilliVid compresses video frames into multi-scale token hierarchies and uses coarse-to-fine rollout in a diffusion model to maintain long-range geometric and object consistency on Minecraft videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02753","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MetaWorld: Scaling Multi-Agent Video World Model from Single-view Video Data","primary_cat":"cs.CV","submitted_at":"2026-06-01T18:20:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MetaWorld scales multi-agent video world models from single-view videos using monocular decomposition into ego-motion and trajectories, subject-aware generation, and cross-attention alignment for consistency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02575","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Zero to Hero: Training-Free Custom Concept Spawning in World Models","primary_cat":"cs.CV","submitted_at":"2026-06-01T17:59:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SPAWN enables training-free insertion of custom visual concepts into autoregressive world models by swapping the pinned context-memory anchor over a short injection window.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02553","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LongLive-RAG: A General Retrieval-Augmented Framework for Long Video Generation","primary_cat":"cs.CV","submitted_at":"2026-06-01T17:50:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LongLive-RAG formulates long video generation as retrieval-augmented generation by treating self-generated latents as a dynamic searchable history and adding a Window Temporal Delta Loss for better retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02436","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Geometry-Aware Implicit Memory for Video World Models","primary_cat":"cs.CV","submitted_at":"2026-06-01T16:08:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GIM-World adds a camera-queryable geometry distillation head and pruning rule to implicit memory in video world models, claiming better long-horizon geometric consistency on the MIND benchmark than explicit and implicit baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00793","ref_index":83,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MBench: A Comprehensive Benchmark on Memory Capability for Video World Models","primary_cat":"cs.CV","submitted_at":"2026-05-30T16:17:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MBench is a new benchmark that quantifies long-term memory in video world models via three hierarchical consistency dimensions evaluated on curated real videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31336","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DecMem: Towards Minute-Long Consistent World Generation with Decoupled Memory","primary_cat":"cs.CV","submitted_at":"2026-05-29T14:17:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DecMem proposes a decoupled memory system using sparse global and anchored local components to enable consistent minute-long controllable video generation in world models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31158","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Light Interaction: Training-Free Inference Acceleration for Interactive Video World Models","primary_cat":"cs.CV","submitted_at":"2026-05-29T11:06:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Light Interaction accelerates interactive video world models up to 2.59x via adaptive context management, denoising cache acceleration, and 3D block sparse attention without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30855","ref_index":64,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Robust Dreamer: Deviation-Aware Latent Gaussian Memory for Action-Controlled AR Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-29T05:21:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Robust Dreamer uses Latent Gaussian Memory anchored to diffusion latents and Deviation Learning with a Dynamic Deviation Archive to reduce drift in long-horizon action-controlled image-to-video generation, reporting SOTA results on ScanNet, DL3DV, and OmniWorldGame.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28816","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Gamma-World: Generative Multi-Agent World Modeling Beyond Two Players","primary_cat":"cs.CV","submitted_at":"2026-05-27T17:59:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A multi-agent video world model using simplex rotary agent encoding and sparse hub attention achieves better fidelity, controllability, and consistency than baselines while generalizing from 2 to 4 players.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18365","ref_index":79,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GeoFlow: Enforcing Implicit Geometric Consistency in Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T13:17:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GeoFlow adds a geometry-consistency reward based on rigid camera flow and object appearance preservation, integrated via reinforcement fine-tuning to improve geometric coherence in video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15042","ref_index":55,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EverAnimate: Minute-Scale Human Animation via Latent Flow Restoration","primary_cat":"cs.CV","submitted_at":"2026-05-14T16:36:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EverAnimate restores drifted latent flow trajectories in chunked video generation via persistent latent propagation and restorative flow matching, achieving measurable gains in PSNR, SSIM, LPIPS, and FID over prior long-animation methods with only LoRA tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14487","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Head Forcing: Long Autoregressive Video Generation via Head Heterogeneity","primary_cat":"cs.CV","submitted_at":"2026-05-14T07:27:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Head Forcing assigns tailored KV cache strategies to local, anchor, and memory attention heads plus head-wise RoPE re-encoding to extend autoregressive video generation from seconds to minutes without training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12496","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CausalCine: Real-Time Autoregressive Generation for Multi-Shot Video Narratives","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:59:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CausalCine enables real-time causal autoregressive multi-shot video generation via multi-shot training, content-aware memory routing for coherence, and distillation to few-step inference.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Streaming AR models typically retain recent frames together with fixed anchors or sink tokens from the sequence beginning [47, 51], while other methods compress history into compact representations or maintain multi-scale short- and long-term memory [57, 10, 14, 15]. More recent work explores adaptive memory, retrieving history based on camera pose, field-of-view overlap, 3D scene structure, or content relevance [49, 56, 24, 3, 21, 11]. Inspired by these directions, we integrate content-aware memory retrieval directly into the visual KV cache, and show that such adaptive memory is effective for the more challenging setting of few-step causal multi-shot generation. 3 Method We organize our framework around the design rationale thatcausality and multi-shot structure should"},{"citing_arxiv_id":"2605.18803","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PROWL: Prioritized Regret-Driven Optimization for World Model Learning","primary_cat":"cs.LG","submitted_at":"2026-05-11T14:24:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PROWL introduces a KL-constrained adversarial curriculum and prioritized adversarial trajectory buffer to actively discover and correct rare failure modes in action-conditioned video world models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01896","ref_index":48,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Divide and Conquer: Decoupled Representation Alignment for Multimodal World Models","primary_cat":"cs.CV","submitted_at":"2026-05-03T14:22:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"M²-REPA decouples modality-specific features from diffusion intermediates and aligns them to complementary expert foundation models via a multi-modal alignment loss and modality-specific decoupling regularization for improved multimodal video generation.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"geometricpriors[2,20,27,31,48,54,56,58].Recentworkshowsthatleveragingpre- trained foundation model features substantially improves diffusion-based genera- tion quality and consistency [19,46,48,58,60]. The REPA family [19,58,60] aligns clean foundation model features with early-stage diffusion features via regular- ization, while Pixel-Perfect Depth [48] directly injects features into pixel-space diffusion blocks. However, prior works primarily focus on unimodal generation with a single foundation model. By contrast, our method synergistically exploits multiple foundation models to enhance generation quality across modalities. 3 Preliminaries 3.1 Problem Formulation Given a single initial RGB imageX(0)"},{"citing_arxiv_id":"2605.01694","ref_index":66,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Latent State Design for World Models under Sufficiency Constraints","primary_cat":"cs.AI","submitted_at":"2026-05-03T03:19:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"World models succeed when their latent states are built to meet task-specific sufficiency constraints rather than preserving the maximum amount of information.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"2022-2025 IRIS [42], GAIA-1 [30], GAIA-2 [50] Representation prediction Embedding-space prediction 2023-2026 I-JEPA [2], V-JEPA 2 [3], V-JEPA 2.1 [44], LeWorldModel [40] Reward / value-shaped Reward and policy-relevant supervision 2019-2021 TPC [46], value-aligned latent planning [28] Value-equivalent Bellman-relevant statistics only 2020-2023 MuZero [52], EfficientZero [66], TD-MPC2 [26] Causal / counterfactual Intervention-sensitive structural variables 2026 Causal-JEPA [45], CausalV AE-WM [14] Table 1 maps design targets along this spectrum. Physical-reasoning probes [39, 67] reinforce the axis by separating visual fidelity from physical and causal correctness. 2.2 Relationships among sufficiency constraints The six roles are descriptive."},{"citing_arxiv_id":"2604.19741","ref_index":65,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CityRAG: Stepping Into a City via Spatially-Grounded Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-21T17:59:03+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Most popular formulations include text-to-video (T2V) [50,67] and image- to-video (I2V) [3,5,6] generation due to their scalability, and they can then be finetunedbasedontherequirementsofdownstreamapplications.Ourapplication requires long-term consistency, pose control, and integration of external context. Long-termconsistency.Worksinlong-contextorautoregressivegeneration[7, 8,27,44,52,65,70] maintain consistency by balancing computational efficiency and storing past samples. Another line of work creates an explicit memory like point clouds [22,46,64,68]. However, these works rarely show the capacity to generate minutes-long videos without significant degradation, and have an or- thogonal focus to our work. CityRAG retrieves external context, rather than"},{"citing_arxiv_id":"2604.18564","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MultiWorld: Scalable Multi-Agent Multi-View Video World Models","primary_cat":"cs.CV","submitted_at":"2026-04-20T17:52:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MultiWorld is a scalable framework for multi-agent multi-view video world models that improves controllability and consistency over single-agent baselines in game and robot tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"that responds to interactive control signals has evolved rapidly. Existing mod- els incorporate various signals like camera controls [13,36,52,65] and action controls [7,10,12,35] to simulate future states. Recent studies have explored sev- eral essential properties [16] of interactive video world models, such as physical consistency [37,47,49,72], and long-horizon coherence [53,56,62], alongside effi- cient real-time generation [17,61,66,75] to enable practical deployment. With these properties, world models can serve as powerful simulators for downstream tasks like game generation [39,55], embodied AI [6,27], and autonomous driv- ing [33,58]. Game video world models [44,64] control the environment and simu- late player observations based on provided actions."},{"citing_arxiv_id":"2604.18215","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Memorize When Needed: Decoupled Memory Control for Spatially Consistent Long-Horizon Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-20T13:00:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A decoupled memory branch with hybrid cues, cross-attention, and gating improves spatial consistency and data efficiency in long-horizon camera-trajectory video generation.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"leading to inconsistent content and visual discontinuities. Toovercomecontextlimitations,manyrecentmethodshaveintroducedmem- ory retrieval mechanisms [30,43,52,55]. By leveraging camera trajectories as geometric cues, these approaches dynamically fetch relevant historical frames to guide the synthesis, aiming to maintain visual consistency during revisiting. For example, WorldMem [52] and Context-as-Memory [55] concurrently propose FOV-overlap scoring for memory selection, a practical strategy adopted by Hun- yuanworld1.5 [43]. Despite the potential of retrieval-augmented memory, such an architecture entanglement introduces a compromise between memory adherence and generative quality, inevitably leading to suboptimal long-term consistency"},{"citing_arxiv_id":"2604.13036","ref_index":118,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Lyra 2.0: Explorable Generative 3D Worlds","primary_cat":"cs.CV","submitted_at":"2026-04-14T17:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Lyra 2.0 produces persistent 3D-consistent video sequences for large explorable worlds by using per-frame geometry for information routing and self-augmented training to correct temporal drift.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"To address this limitation, recent works augment generative models with explicit memory mechanisms. A first family of approaches [24, 51, 118, 128] relies on retrieval-based memory. These methods treat past frames as an external memory bank and dynamically select relevant observations to guide the next generation. For example, Context-as-Memory [128] and WorldMem [118] retrieve earlier frames based on field-of-view (FOV) overlap, while VMem [51] performs geometry-aware retrieval using indexed 3D surface elements instead of purely view-based similarity. A second line of work [48, 62, 116, 139, 141] enforces spatial persistence through explicit 3D representations accumulated over time. Rather than retrieving individual frames, these"},{"citing_arxiv_id":"2604.08995","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Matrix-Game 3.0: Real-Time and Streaming Interactive World Model with Long-Horizon Memory","primary_cat":"cs.CV","submitted_at":"2026-04-10T06:00:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Matrix-Game 3.0 delivers 720p real-time video generation at 40 FPS with minute-scale memory consistency by combining residual self-correction training, camera-aware memory injection, and DMD-based autoregressive distillation on a 5B model.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"based on camera awareness and inject them via cross-attention mechanism. Compared to MoC-style routing, this improves retrieval stability. However, the additional memory branch with misaligned features, together with layer-wise repeated feature injection, results in slow convergence. Even with geometry-aware cues inspired by prior memory-based world models [45], the performance gains remain limited. Figure 5: Frame-level self-attention visualiza- tion for the memory-enhanced DiT. Based on these observations, we adopt a unified DiT framework that jointly models long-term memory, temporally consistent history, and the current predic- tion target. Our first key design is a joint self-attention mech- anism. Instead of treating memory as an external"},{"citing_arxiv_id":"2604.06339","ref_index":282,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Evolution of Video Generative Foundations","primary_cat":"cs.CV","submitted_at":"2026-04-07T18:17:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"This survey traces video generation technology from GANs to diffusion models and then to autoregressive and multimodal approaches while analyzing principles, strengths, and future trends.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"infinite scalability, Stable Video Infinity [279] utilizes error- recycling training, and Mixture of Contexts [280] adopts learnable sparse routing to maintain consistency over effectively infinite durations. Beyond training strategies, ensuring object permanence requires specific memory ar- chitectures. Retrieval-based systems, such as Context-as- Memory [281], WORLDMEM [282], and WorldPlay [283], treat historical frames as a searchable database to condition current generation. Alternatively, implicit state updates, like TTT-layers [284], utilize RNN-style hidden states to dynamically internalize environmental features. Despite improving temporal coherence, these latent-based methods fundamentally lack explicit 3D spatial constraints, making"},{"citing_arxiv_id":"2602.02958","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Quant VideoGen: Auto-Regressive Long Video Generation via 2-Bit KV-Cache Quantization","primary_cat":"cs.LG","submitted_at":"2026-02-03T00:54:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Quant VideoGen reduces KV cache memory by up to 7 times in autoregressive video diffusion models via semantic aware smoothing and progressive residual quantization, achieving better quality than baselines with under 4% latency overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.14614","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"WorldPlay: Towards Long-Term Geometric Consistency for Real-Time Interactive World Modeling","primary_cat":"cs.CV","submitted_at":"2025-12-16T17:22:46+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.07982","ref_index":81,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Geometry Forcing: Marrying Video Diffusion and 3D Representation for Consistent World Modeling","primary_cat":"cs.CV","submitted_at":"2025-07-10T17:55:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Geometry Forcing aligns video diffusion representations with geometric foundation model features via angular cosine and scale regression objectives to improve 3D consistency in generated videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.21996","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VRAG: Learning World Models for Interactive Video Generation","primary_cat":"cs.CV","submitted_at":"2025-05-28T05:55:44+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}