{"total":28,"items":[{"citing_arxiv_id":"2607.01060","ref_index":61,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RoboWorld: Fast and Reliable Neural Simulators for Generalist Robot Policy Evaluation","primary_cat":"cs.RO","submitted_at":"2026-07-01T15:22:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RoboWorld introduces an automated pipeline using autoregressive video world models and task-progress VLM scoring, plus Step Forcing for long-horizon stability, to achieve high correlation with real robot policy evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30292","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DreamForge-World 0.1 Preview: A Low-Compute Real-Time Controllable World Model","primary_cat":"cs.LG","submitted_at":"2026-06-29T13:35:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A preview system demonstrates real-time controllable world modeling at 14-15 FPS on RTX 4090 by adapting open video backbones with action pathways for keyboard/mouse control and multimodal features.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18180","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EgoCS-400K: An Egocentric Gameplay Dataset for World Models","primary_cat":"cs.CV","submitted_at":"2026-06-16T17:13:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EgoCS-400K is a new 400K-video egocentric CS dataset with action-state-event alignment from public match demos for world model training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17730","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ActWorld: From Explorable to Interactive World Model via Action-Aware Memory","primary_cat":"cs.CV","submitted_at":"2026-06-16T09:47:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ActWorld extends navigation-centric world models to support mid-rollout object interactions via chunk-autoregressive generation, action-aware memory routing, and a persistent memory bank, backed by a 100K annotated interaction dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11129","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"WorldOlympiad: Can Your World Model Survive a Triathlon?","primary_cat":"cs.CV","submitted_at":"2026-06-09T17:24:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"WorldOlympiad is a new benchmark decomposing world-model evaluation into physical, geometry, and interaction tracks using segmentation, MLLM judges, Gaussian splatting, and action prompts on diverse scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07967","ref_index":55,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DisCo: World Models with Discrete Camera Motion Control","primary_cat":"cs.CV","submitted_at":"2026-06-06T03:50:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DisCo uses discrete action primitives for camera control in video world models to achieve more reliable action following than continuous trajectories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07508","ref_index":85,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Streaming Video Generation with Streaming Force Control","primary_cat":"cs.CV","submitted_at":"2026-06-05T17:57:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StreamForce presents a unified causal model for force-controllable streaming video generation using a new force representation and distillation pipeline, claiming SOTA force adherence and 16.6 FPS performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02575","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Zero to Hero: Training-Free Custom Concept Spawning in World Models","primary_cat":"cs.CV","submitted_at":"2026-06-01T17:59:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SPAWN enables training-free insertion of custom visual concepts into autoregressive world models by swapping the pinned context-memory anchor over a short injection window.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01164","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Towards Interactive Video World Modeling: Frontiers, Challenges, Benchmarks, and Future Trends","primary_cat":"cs.CV","submitted_at":"2026-05-31T11:12:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"This survey reviews trends, challenges, benchmarks, and future directions in action-conditioned interactive world modeling for video and 3D generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31336","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DecMem: Towards Minute-Long Consistent World Generation with Decoupled Memory","primary_cat":"cs.CV","submitted_at":"2026-05-29T14:17:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DecMem proposes a decoupled memory system using sparse global and anchored local components to enable consistent minute-long controllable video generation in world models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30855","ref_index":77,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Robust Dreamer: Deviation-Aware Latent Gaussian Memory for Action-Controlled AR Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-29T05:21:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Robust Dreamer uses Latent Gaussian Memory anchored to diffusion latents and Deviation Learning with a Dynamic Deviation Archive to reduce drift in long-horizon action-controlled image-to-video generation, reporting SOTA results on ScanNet, DL3DV, and OmniWorldGame.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23458","ref_index":62,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"One-Forcing: Towards Stable One-Step Autoregressive Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-22T10:16:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"One-Forcing augments DMD with a GAN loss to enable stable one-step causal autoregressive video generation, reporting a VBench score of 83.76 as SOTA among one-step methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18601","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Incantation: Natural Language as the Action Interface for Multi-Entity Video World Models","primary_cat":"cs.CV","submitted_at":"2026-05-18T16:12:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Incantation is the first video world model to use per-frame natural language conditioning for simultaneous multi-entity control and concept-level cross-entity transfer in interactive video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15824","ref_index":45,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FashionChameleon: Towards Real-Time and Interactive Human-Garment Video Customization","primary_cat":"cs.CV","submitted_at":"2026-05-15T10:25:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FashionChameleon achieves interactive multi-garment video customization at 23.8 FPS via in-context teacher models, streaming distillation, and training-free KV cache rescheduling while using only single-garment data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15178","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SANA-WM: Efficient Minute-Scale World Modeling with Hybrid Linear Diffusion Transformer","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:58:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SANA-WM is a 2.6B-parameter efficient world model that synthesizes minute-scale 720p videos with 6-DoF camera control, trained on 213K public clips in 15 days on 64 H100s and runnable on single GPUs at 36x higher throughput than prior open baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09965","ref_index":218,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Towards Generalist Game Players: An Investigation of Foundation Models in the Game Multiverse","primary_cat":"cs.CV","submitted_at":"2026-05-11T04:16:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper organizes research on generalist game AI into Dataset, Model, Harness, and Benchmark pillars and charts a five-level progression from single-game mastery to agents that create and live inside game multiverses.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"models almost operate as black boxes, heavily obscuring the underlying mechanisms that drive their decision- making processes [50, 112]. Ultimately, systems confined to this paradigm fall fundamentally short of the true AGI that AI community pursues. Recently, the advent of Large Foundation Models (LFMs), including Large Language Models (LLMs) [6, 15, 119, 190], Vision-Language Models (VLMs) [7, 29, 118, 141], Vision-Language-Action Models (VLAs) [25, 91, 175, 218], and World Models (WMs) [11, 33, 58, 65, 208], has sparked a transformative paradigm shift. Rather than mastering a single game through billions of trial-and-error episodes from scratch, LFMs were born with vast open-world knowledge and emergent reasoning capabilities. By treating games not as isolated optimization problems, but as diverse instances of interactive environments, these models show the"},{"citing_arxiv_id":"2604.21686","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"WorldMark: A Unified Benchmark Suite for Interactive Video World Models","primary_cat":"cs.CV","submitted_at":"2026-04-23T13:50:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"WorldMark is the first public benchmark that standardizes scenes, trajectories, and control interfaces across heterogeneous interactive image-to-video world models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13036","ref_index":138,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Lyra 2.0: Explorable Generative 3D Worlds","primary_cat":"cs.CV","submitted_at":"2026-04-14T17:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Lyra 2.0 produces persistent 3D-consistent video sequences for large explorable worlds by using per-frame geometry for information routing and self-augmented training to correct temporal drift.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Xu, and C. Zou. Spatialcrafter: Unleashing the imagination of video diffusion models for scene reconstruction from limited observations.arXiv preprint arXiv:2505.11992, 2025. 18 [137] T. Zhang, S. Bi, Y. Hong, K. Zhang, F. Luan, S. Yang, K. Sunkavalli, W. T. Freeman, and H. Tan. Test-time training done right.arXiv preprint arXiv:2505.23884, 2025. 4 [138] Y. Zhang, C. Peng, B. Wang, P. Wang, Q. Zhu, F. Kang, B. Jiang, Z. Gao, E. Li, Y. Liu, et al. Matrix-game: Interactive world foundation model.arXiv preprint arXiv:2506.18701, 2025. 3 [139] J. Zhao, F. Wei, Z. Liu, H. Zhang, C. Xu, and Y. Lu. Spatia: Video generation with updatable spatial memory.arXiv preprint arXiv:2512.15716, 2025. 2, 3, 4 [140] W."},{"citing_arxiv_id":"2604.08995","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Matrix-Game 3.0: Real-Time and Streaming Interactive World Model with Long-Horizon Memory","primary_cat":"cs.CV","submitted_at":"2026-04-10T06:00:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Matrix-Game 3.0 delivers 720p real-time video generation at 40 FPS with minute-scale memory consistency by combining residual self-correction training, camera-aware memory injection, and DMD-based autoregressive distillation on a 5B model.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Wang, Vishal M Patel, Paul Pu Liang, et al. World-in-world: World models in a closed-loop world.arXiv preprint arXiv:2510.18135, 2025. [55] Songchun Zhang, Zeyue Xue, Siming Fu, Jie Huang, Xianghao Kong, Y Ma, Haoyang Huang, Nan Duan, and Anyi Rao. Astrolabe: Steering forward-process reinforcement learning for distilled autoregressive video models.arXiv preprint arXiv:2603.17051, 2026. [56] Yifan Zhang, Chunli Peng, Boyang Wang, Puyi Wang, Qingcheng Zhu, Fei Kang, Biao Jiang, Zedong Gao, Eric Li, Yang Liu, and Yahui Zhou. Matrix-game: Interactive world foundation model.arXiv preprint arXiv:2506.18701, 2025. [57] Guangcong Zheng, Teng Li, Xianpan Zhou, and Xi Li. Realcam-vid: High-resolution video dataset with dynamic scenes and metric-scale camera movements, 2025."},{"citing_arxiv_id":"2604.04707","ref_index":157,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OpenWorldLib: A Unified Codebase and Definition of Advanced World Models","primary_cat":"cs.CV","submitted_at":"2026-04-06T14:19:48+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"with historical prediction and planning. InProceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pages 6854-6863, June 2025. [156] Junyi Zhang, Charles Herrmann, Junhwa Hur, Chen Sun, Ming-Hsuan Yang, Forrester Cole, Trevor Darrell, and Deqing Sun. Loger: Long-context geometric reconstruction with hybrid memory.arXiv preprintarXiv:2603.03269, 2026. [157] Yifan Zhang, Chunli Peng, Boyang Wang, Puyi Wang, Qingcheng Zhu, Fei Kang, Biao Jiang, Zedong Gao, Eric Li, Yang Liu, et al. Matrix-game: Interactive world foundation model.arXiv preprint arXiv:2506.18701, 2025. [158] Yumeng Zhang, Shi Gong, Kaixin Xiong, Xiaoqing Ye, Xiaofan Li, Xiao Tan, Fan Wang, Jizhou Huang, Hua Wu, and Haifeng Wang. Bevworld: A multimodal world simulator for autonomous driving via scene-level bev"},{"citing_arxiv_id":"2604.02799","ref_index":81,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"UNICA: A Unified Neural Framework for Controllable 3D Avatars","primary_cat":"cs.CV","submitted_at":"2026-04-03T07:09:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UNICA unifies motion planning, rigging, physical simulation, and rendering into a single skeleton-free neural framework that produces next-frame 3D avatar geometry from action inputs and renders it with Gaussian splatting.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"However, these methods operate at the skeletal level and must be combined with separate rigging and physical simulation. World Models.World models are systems designed to understand and predict the evolution of an environment given historical observations [12]. Game simula- tion is an ongoing research direction for world models, predicting future scenes conditioned on action inputs [7,21,29,81]. Closely related to our idea, GameN- Gen [64] fits an entire first-person video game with a multi-frame diffusion model conditioned on historical frames and player actions. State-of-the-art world mod- els [4,17,39,43,60] have demonstrated the ability to synthesize videos across diverse domains, with some approaches including third-person human motion."},{"citing_arxiv_id":"2603.28980","ref_index":74,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Stepper: Stepwise Immersive Scene Generation with Multiview Panoramas","primary_cat":"cs.CV","submitted_at":"2026-03-30T20:26:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Stepper uses stepwise panoramic expansion with a multi-view 360-degree diffusion model and geometry reconstruction to produce high-fidelity, structurally consistent immersive 3D scenes from text.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.28489","ref_index":229,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Video Generation Models as World Models: Efficient Paradigms, Architectures and Algorithms","primary_cat":"eess.IV","submitted_at":"2026-03-30T14:23:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Video generation models can function as world simulators if efficiency gaps in spatiotemporal modeling are bridged via organized paradigms, architectures, and algorithms.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"DreamDojo [211] GR-1 [212], VILP [213], UV A [214], RoboEnvision [215], GEVRM [216], EnerVerse [217], LingBot-V A [218], Cosmos Policy [219],Fast-W AM [220],LeWorld- Model [221],DreamZero [222] Game & Interactive World Simulation GameGen-X [223], GameFactory [224], MineWorld [225], Matrix-Game [42], [226], GenieRedux-G [227], Hunyuan-GameCraft [228], [229], PlayGen [230], WorldPlay [231], Yume1.5 [129], LingBot-World [232], Cosmos-Predict2.5 [43], Dreamer 4 [233], Genie 3 [21] but also long-horizon and interactive generation. In practice, this means that parallelism, caching, pruning, and quantization should work together rather than be applied separately. Future methods should therefore improve both efficiency and stability,"},{"citing_arxiv_id":"2603.11911","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"InSpatio-WorldFM: An Open-Source Real-Time Generative Frame Model","primary_cat":"cs.CV","submitted_at":"2026-03-12T13:28:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"InSpatio-WorldFM is a frame-independent generative model that uses explicit 3D anchors and spatial memory to deliver real-time multi-view consistent spatial intelligence via a three-stage training pipeline from pretrained diffusion models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.20540","ref_index":89,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Advancing Open-source World Models","primary_cat":"cs.CV","submitted_at":"2026-01-28T12:37:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LingBot-World is presented as an open-source world model that delivers high-fidelity simulation, minute-level contextual consistency, and real-time interactivity under one second latency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.22940","ref_index":63,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"One-to-All Animation: Alignment-Free Character Animation and Image Pose Transfer","primary_cat":"cs.CV","submitted_at":"2025-11-28T07:30:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"One-to-All Animation enables alignment-free character animation and image pose transfer via self-supervised outpainting reformulation, reference extraction, hybrid fusion attention, identity-robust pose control, and token replacement for long videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.22622","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LongLive: Real-time Interactive Long Video Generation","primary_cat":"cs.CV","submitted_at":"2025-09-26T17:48:24+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LongLive is a causal autoregressive video generator that produces up to 240-second interactive videos at 20.7 FPS on one H100 GPU after 32 GPU-days of fine-tuning from a 1.3B short-clip model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.13009","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Matrix-game 2.0: An open-source real-time and streaming interactive world model","primary_cat":"cs.CV","submitted_at":"2025-08-18T15:28:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Matrix-Game 2.0 introduces a scalable data pipeline, action-injection module, and few-step distillation to enable real-time streaming video generation at 25 FPS from game-engine interactions, with open-sourced weights and code.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}