{"total":25,"items":[{"citing_arxiv_id":"2605.23699","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CRONOS: Benchmarking Counterfactual Physical Consistency in Video Models","primary_cat":"cs.CV","submitted_at":"2026-05-22T14:51:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CRONOS benchmark shows recent open-source video generators fail to preserve physical consistency under controlled changes to viewpoint, scene, object category, and appearance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20299","ref_index":99,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mechanisms of Misgeneralization in Physical Sequence Modeling","primary_cat":"cs.LG","submitted_at":"2026-05-19T12:34:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Generative sequence models for physical tasks exhibit physical misgeneralization where local prediction errors propagate through physical measurements to distort aggregate distributions over quantities like distance or energy; a data deviation kernel explains and predicts the shifts and supports a内核","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19728","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Aero-World: Action-Conditioned Aerial Video Generation from Inertial Controls","primary_cat":"cs.CV","submitted_at":"2026-05-19T12:02:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Aero-World adapts a pretrained latent diffusion transformer for action-conditioned aerial video generation by injecting inertial action tokens and using a frozen latent-space Physics Probe for inertial consistency supervision during LoRA finetuning, with a new AeroBench benchmark showing improved AA","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18396","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"NEWTON: Agentic Planning for Physically Grounded Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T13:42:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NEWTON improves physical accuracy in video generation by deploying a trainable planner that coordinates physics-aware tools and a verifier, raising joint accuracy on VideoPhy-2 without altering the base generators.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18233","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Enhancing Train-Free Infinite-Frame Generation for Consistent Long Videos","primary_cat":"cs.CV","submitted_at":"2026-05-18T11:28:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MIGA introduces two-stage alignment to close train-inference gaps and dual consistency enhancement via self-reflection and long-range guidance to achieve SOTA temporal consistency in infinite-frame video generation on VBench and NarrLV.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15964","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WorldVLN: Autoregressive World Action Model for Aerial Vision-Language Navigation","primary_cat":"cs.RO","submitted_at":"2026-05-15T13:55:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"WorldVLN proposes the first autoregressive world action model for aerial vision-language navigation that predicts short-horizon latent world states, decodes them to waypoints in closed loop, and uses two-stage training with Action-aware GRPO to achieve over 12% success-rate gains on benchmarks plus零","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15458","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Video Models Can Reason with Verifiable Rewards","primary_cat":"cs.CV","submitted_at":"2026-05-14T22:40:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VideoRLVR uses SDE-GRPO optimization, dense decomposed rewards, and Early-Step Focus to train video diffusion models on verifiable reasoning tasks, outperforming supervised fine-tuning and other video generators on Maze, FlowFree, and Sokoban.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14136","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TeDiO: Temporal Diagonal Optimization for Training-Free Coherent Video Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-13T21:39:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TeDiO regularizes temporal diagonals in diffusion transformer attention maps to produce smoother video motion while keeping per-frame quality intact.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10434","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"WorldReasonBench: Human-Aligned Stress Testing of Video Generators as Future World-State Predictors","primary_cat":"cs.CV","submitted_at":"2026-05-11T12:06:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper presents WorldReasonBench, a benchmark that tests video generators on maintaining physical, social, logical, and informational consistency when predicting future states from initial conditions and actions.","context_count":1,"top_context_role":"background","top_context_polarity":"support","context_text":"and Veo improve in long-horizon coherence, controllability, and realism [3, 26], with recent studies even suggesting zero-shot learning and reasoning-like behavior in selected settings [26]. Capability demos alone do not establish robust world understanding, however: physical-law analyses show that even strong models fail on gravity, object permanence, and causal consistency [8]. We therefore aim to test these claims systematically rather than infer them from isolated examples. Benchmarks and automatic evaluation for video generation.Existing video benchmarks mostly target perceptual quality or prompt alignment via reference metrics (FID [6], FVD [22], LPIPS [29]) and aesthetics/compositionality suites [7, 30, 12, 13, 19], none of which provide structured reasoning"},{"citing_arxiv_id":"2605.08567","ref_index":13,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ACWM-Phys: Investigating Generalized Physical Interaction in Action-Conditioned Video World Models","primary_cat":"cs.CV","submitted_at":"2026-05-09T00:00:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ACWM-Phys is a controllable simulator benchmark with in- and out-of-distribution protocols for evaluating action-conditioned world models across rigid, kinematic, deformable, and particle dynamics.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"limitation of these approaches is their limited investigation of complex physical interactions, as most mainly focus on simple navigation, or rigid-body dynamics such as picking, pushing, and grasping. Physics in Video Diﬀusion Models Recent work has begun to investigate how well video diﬀusion models capture physical principles and whether they can serve as implicit world models [ 13, 33, 22, 37], and further align current video diﬀusion to certain physics scenes [ 32, 38, 36, 17]. These studies examine aspects such as physical law consistency [ 36, 17], intuitive physics [ 32, 18], and physical reasoning ability [ 38, 39] in generated videos, providing useful evidence on the current strengths and limitations of video generation"},{"citing_arxiv_id":"2605.06298","ref_index":12,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Render, Don't Decode: Weight-Space World Models with Latent Structural Disentanglement","primary_cat":"cs.CV","submitted_at":"2026-05-07T14:02:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NOVA represents world states as INR weights for decoder-free rendering, compactness, and unsupervised disentanglement of background, foreground, and motion in video world models.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"9Note that the output layer is included in the layer count; meaning that 6 layers corresponds to a depth of 5, following Equinox's convention [Kidger and Garcia, 2021]. 17 (e) Forward Dynamics Model.Two independent 4-layer MLPs: A of width 2dz taking zt as input, andBof width2d z takingu t as input. Their outputs are summed to producez t+1. (x,y)∈R2 Fourier EncodeFreqs=6 Linear(12)+ ReLU Linear(12)+ ReLU Linear(12)+ ReLU Linear(12)+ ReLU Linear(12)+ ReLU Linear(C) ox,yt ∈RC weights & biases=¯z+zt ot (H×W×C) Conv2D (s=2)C= 64+ ReLU Conv2D (s=2)C= 128+ ReLU Conv2D (s=2)C= 256+ ReLU Conv2D (s=2)C= 512+ ReLU Flatten &Linear(dz) zt∈Rdz zt zt+1 Concat Linear(dz)+ ReLU Linear(dz)+ ReLU Linear(dz)+ ReLU Linear(du) ut∈Rdu z1:t u1:t−1"},{"citing_arxiv_id":"2605.03413","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Learning to Theorize the World from Observation","primary_cat":"cs.LG","submitted_at":"2026-05-05T06:39:12+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"This baseline represents a program as a single continuous latent vector using a conditional β-V AE architecture (Higgins et al., 2016). Given an observation pair(x, y), the encoder produces a Gaussian posterior qϕ(z|x, y) =N(µ ϕ(x, y), σ2 ϕ(x, y)), where z∈R d serves as the program representation. The decoder reconstructs yfromxandz. Training maximizes the ELBO: L=E qϕ(z|x,y) [logp θ(y|x, z)]−βKL(q ϕ(z|x, y)∥p(z)),(8) where p(z) =N(0, I) is a standard Gaussian prior andβ controls the strength of disentanglement pressure. Inference is fully amortized: a single forward pass through the encoder produces z without iterative refinement. This architecture corresponds to world models such as AdaWorld (Gao et al., 2025), which learn continuous latent dynamics from observations."},{"citing_arxiv_id":"2605.01950","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TRAP: Tail-aware Ranking Attack for World-Model Planning","primary_cat":"cs.LG","submitted_at":"2026-05-03T16:19:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TRAP is a tail-aware ranking attack that plants a backdoor in world models so that a trigger causes the model to reorder a few critical imagined trajectories and redirect planning while preserving normal behavior on clean inputs.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[25] Bingyi Kang, Yang Yue, Rui Lu, Zhijie Lin, Yang Zhao, Kaixin Wang, Gao Huang, and Jiashi Feng. 2024. How far is video generation from world model: A physical law perspective.arXiv preprint arXiv:2411.02385(2024). [26] Danny Karmon, Daniel Zoran, and Yoav Goldberg. 2018. Lavan: Localized and visible adversarial noise. InInternational conference on machine learning. PMLR, 2507-2515. [27] Panagiota Kiourti, Kacper Wardega, Susmit Jha, and Wenchao Li. 2020. Trojdrl: evaluation of backdoor attacks on deep reinforcement learning. In2020 57th ACM/IEEE Design Automation Conference (DAC). IEEE, 1-6. [28] Jiajian Li, Qi Wang, Yunbo Wang, Xin Jin, Yang Li, Wenjun Zeng, and Xiaokang Yang. 2024. Open-world reinforcement learning over long short-term imagina-"},{"citing_arxiv_id":"2605.08115","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Alice v1: Distillation-Enhanced Video Generation Surpassing Closed-Source Models","primary_cat":"cs.GR","submitted_at":"2026-04-27T23:37:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Alice v1 is an open video model that surpasses its teacher and closed-source systems like Veo3 and Sora2 in quality while running 7x faster through specialized distillation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07990","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SceneScribe-1M: A Large-Scale Video Dataset with Comprehensive Geometric and Semantic Annotations","primary_cat":"cs.CV","submitted_at":"2026-04-09T08:59:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SceneScribe-1M is a new dataset of 1 million videos with semantic text, camera parameters, dense depth, and consistent 3D point tracks to support monocular depth estimation, scene reconstruction, point tracking, and text-to-video synthesis.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"V oyager: Long-range and world-consistent video diffu- sion for explorable 3d scene generation.arXiv preprint arXiv:2506.04225, 2025. 2 [23] Aaron Hurst, Adam Lerer, Adam P Goucher, Adam Perel- man, Aditya Ramesh, Aidan Clark, AJ Ostrow, Akila Weli- hinda, Alan Hayes, Alec Radford, et al. Gpt-4o system card. arXiv preprint arXiv:2410.21276, 2024. 5 [24] Bingyi Kang, Yang Yue, Rui Lu, Zhijie Lin, Yang Zhao, Kaixin Wang, Gao Huang, and Jiashi Feng. How far is video generation from world model: A physical law perspective. arXiv preprint arXiv:2411.02385, 2024. 3 [25] Nikita Karaev, Ignacio Rocco, Benjamin Graham, Natalia Neverova, Andrea Vedaldi, and Christian Rupprecht. Dy- namicstereo: Consistent dynamic depth from stereo videos."},{"citing_arxiv_id":"2602.13294","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VisPhyWorld: Probing Physical Reasoning via Code-Driven Video Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-02-09T05:46:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VisPhyWorld evaluates MLLMs' physical reasoning via executable code generation for video reconstruction, with VisPhyBench showing strong semantics but weak parameter inference and dynamics simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.13609","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Do-Undo Bench: Reversibility for Action Understanding in Image Generation","primary_cat":"cs.CV","submitted_at":"2025-12-15T18:03:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Do-Undo Bench is a new evaluation task and dataset that forces models to simulate forward action effects and then undo them to measure genuine action understanding in image generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.05564","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ProPhy: Progressive Physical Alignment for Dynamic World Simulation","primary_cat":"cs.CV","submitted_at":"2025-12-05T09:39:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ProPhy adds explicit physics-aware conditioning via semantic and refinement experts plus VLM knowledge transfer to produce more physically coherent dynamic videos than prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.18373","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MASS: Motion-Aware Spatial-Temporal Grounding for Physics Reasoning and Comprehension in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2025-11-23T09:43:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MASS adds spatiotemporal motion signals and 3D grounding to VLMs and releases MASS-Bench, yielding physics-reasoning performance within 2% of Gemini-2.5-Flash after reinforcement fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.26782","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Cloning Deterministic Worlds: The Critical Role of Latent Geometry in Long-Horizon World Models","primary_cat":"cs.LG","submitted_at":"2025-10-30T17:56:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GRWM uses temporal contrastive learning to geometrically regularize latent spaces in world models for high-fidelity cloning of deterministic 3D worlds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.20328","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Video models are zero-shot learners and reasoners","primary_cat":"cs.LG","submitted_at":"2025-09-24T17:17:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Generative video models exhibit emergent zero-shot capabilities across perception, manipulation, and basic reasoning tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[44] Fanqing Meng, Jiaqi Liao, Xinyu Tan, Wenqi Shao, Quanfeng Lu, Kaipeng Zhang, Yu Cheng, Dianqi Li, Yu Qiao, and Ping Luo. Towards world simulator: Crafting physical commonsense- based benchmark for video generation, 2024. [45] Bingyi Kang, Yang Yue, Rui Lu, Zhijie Lin, Yang Zhao, Kaixin Wang, Gao Huang, and Jiashi Feng. How far is video generation from world model: A physical law perspective.arXiv preprint arXiv:2411.02385, 2024. [46] Saman Motamed, Laura Culp, Kevin Swersky, Priyank Jaini, and Robert Geirhos. Do generative video models understand physical principles?arXiv preprint arXiv:2501.09038, 2025. [47] Daochang Liu, Junyu Zhang, Anh-Dung Dinh, Eunbyung Park, Shichao Zhang, and Chang Xu. Generative physical AI in vision: A survey.arXiv preprint arXiv:2501.10928, 2025. [48] Luca M Schulze Buschoff, Elif Akata, Matthias Bethge, and Eric Schulz."},{"citing_arxiv_id":"2506.09981","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ReSim: Reliable World Simulation for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2025-06-11T17:55:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReSim is a controllable video world model trained on heterogeneous real and simulated driving data that achieves higher fidelity and controllability for both expert and non-expert actions, plus a Video2Reward module for estimating action quality from simulated futures.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"(c) The high-fidelity prediction, accurate action-following, and reward estimation abilities of ReSim facilitate driving applications related to both policy deployment and simulation. This imbalance leads to severe hallucinations when the world model is exposed to unseen non-expert actions in certain states, undermining its robustness and reliability [27, 28]. To address the problem, we present ReSim, a reliable driving world model that can be steered by various actions, including out-of-distribution ones, while achieving high-fidelity simulation results. Our approach first enriches real-world human driving logs with non-expert data gathered from a driving simulator [ 29], where agents can execute a broader spectrum of actions without safety"},{"citing_arxiv_id":"2505.12705","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DreamGen: Unlocking Generalization in Robot Learning through Video World Models","primary_cat":"cs.RO","submitted_at":"2025-05-19T04:55:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DreamGen trains robot policies on synthetic trajectories from adapted video world models, enabling a humanoid robot to perform 22 new behaviors in seen and unseen environments from a single pick-and-place teleoperation dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.09038","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Do generative video models understand physical principles?","primary_cat":"cs.CV","submitted_at":"2025-01-14T20:59:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Physics-IQ benchmark reveals that generative video models exhibit limited physical understanding unrelated to their visual quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.03575","ref_index":88,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Cosmos World Foundation Model Platform for Physical AI","primary_cat":"cs.CV","submitted_at":"2025-01-07T06:55:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The Cosmos platform supplies open-source pre-trained world models and supporting tools for building fine-tunable digital world simulations to train Physical AI.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}