{"total":18,"items":[{"citing_arxiv_id":"2605.13591","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Real2Sim: A Physics-driven and Editable Gaussian Splatting Framework for Autonomous Driving Scenes","primary_cat":"cs.CV","submitted_at":"2026-05-13T14:26:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Real2Sim reconstructs editable dynamic driving scenes as temporally continuous Gaussians integrated with a differentiable MPM physics solver for high-fidelity simulation of interactions and collisions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11596","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HorizonDrive: Self-Corrective Autoregressive World Model for Long-horizon Driving Simulation","primary_cat":"cs.CV","submitted_at":"2026-05-12T06:22:16+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HorizonDrive enables stable long-horizon autoregressive driving simulation via anti-drifting teacher training with scheduled rollout recovery and teacher rollout distillation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10858","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Is Your Driving World Model an All-Around Player?","primary_cat":"cs.CV","submitted_at":"2026-05-11T17:05:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"WorldLens benchmark reveals no driving world model dominates across visual, geometric, behavioral, and perceptual fidelity, with contributions of a 26K human-annotated dataset and a distilled vision-language evaluator.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01896","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Divide and Conquer: Decoupled Representation Alignment for Multimodal World Models","primary_cat":"cs.CV","submitted_at":"2026-05-03T14:22:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"M²-REPA decouples modality-specific features inside a diffusion model and aligns each to its matching expert foundation model via an alignment loss plus a decoupling regularizer, yielding better visual quality and long-term consistency in multi-modal video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01694","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Latent State Design for World Models under Sufficiency Constraints","primary_cat":"cs.AI","submitted_at":"2026-05-03T03:19:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"World models succeed when their latent states are built to meet task-specific sufficiency constraints rather than preserving the maximum amount of information.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27448","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LA-Pose: Latent Action Pretraining Meets Pose Estimation","primary_cat":"cs.CV","submitted_at":"2026-04-30T05:43:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LA-Pose achieves over 10% higher pose accuracy than recent feed-forward methods on Waymo and PandaSet benchmarks by repurposing latent actions from self-supervised inverse-dynamics pretraining while using orders of magnitude less labeled 3D data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21914","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VistaBot: View-Robust Robot Manipulation via Spatiotemporal-Aware View Synthesis","primary_cat":"cs.RO","submitted_at":"2026-04-23T17:57:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VistaBot integrates 4D geometry estimation and spatiotemporal view synthesis into action policies to improve cross-view generalization by 2.6-2.8x on a new VGS metric in simulation and real tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18564","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MultiWorld: Scalable Multi-Agent Multi-View Video World Models","primary_cat":"cs.CV","submitted_at":"2026-04-20T17:52:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MultiWorld is a scalable framework for multi-agent multi-view video world models that improves controllability and consistency over single-agent baselines in game and robot tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18468","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Asset Harvester: Extracting 3D Assets from Autonomous Driving Logs for Simulation","primary_cat":"cs.CV","submitted_at":"2026-04-20T16:20:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Asset Harvester converts sparse in-the-wild object observations from AV driving logs into complete simulation-ready 3D assets via data curation, geometry-aware preprocessing, and a SparseViewDiT model that couples sparse-view multiview generation with 3D Gaussian lifting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17147","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ScenarioControl: Vision-Language Controllable Vectorized Latent Scenario Generation","primary_cat":"cs.CV","submitted_at":"2026-04-18T21:00:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ScenarioControl introduces the first vision-language controllable generator for realistic vectorized 3D driving scenarios with temporal consistency across actor views.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16592","ref_index":143,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Human Cognition in Machines: A Unified Perspective of World Models","primary_cat":"cs.RO","submitted_at":"2026-04-17T17:51:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper introduces a unified framework for world models that fully incorporates all cognitive functions from Cognitive Architecture Theory, highlights under-researched areas in motivation and meta-cognition, and proposes Epistemic World Models as a new category for scientific discovery agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12857","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Artificial Intelligence for Modeling and Simulation of Mixed Automated and Human Traffic","primary_cat":"cs.AI","submitted_at":"2026-04-14T15:09:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"This survey synthesizes AI techniques for mixed autonomy traffic simulation and introduces a taxonomy spanning agent-level behavior models, environment-level methods, and cognitive/physics-informed approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11707","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Representations Before Pixels: Semantics-Guided Hierarchical Video Prediction","primary_cat":"cs.CV","submitted_at":"2026-04-13T16:42:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Re2Pix decomposes video prediction into semantic feature forecasting followed by representation-conditioned diffusion synthesis, with nested dropout and mixed supervision to handle prediction errors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10959","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Ozone: A Unified Platform for Transportation Research","primary_cat":"cs.DB","submitted_at":"2026-04-13T03:55:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Ozone unifies four trajectory datasets into a canonical format with standardized schemas and provides CARLA-based benchmarking, claiming 85% faster experiment setup and 91% cross-city transfer efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08719","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LMGenDrive: Bridging Multimodal Understanding and Generative World Modeling for End-to-End Driving","primary_cat":"cs.CV","submitted_at":"2026-04-09T19:13:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LMGenDrive unifies LLM-based multimodal understanding with generative world models to output both future driving videos and control signals for end-to-end closed-loop autonomous driving.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04707","ref_index":104,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OpenWorldLib: A Unified Codebase and Definition of Advanced World Models","primary_cat":"cs.CV","submitted_at":"2026-04-06T14:19:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"OpenWorldLib offers a standardized codebase and definition for world models that combine perception, interaction, and memory to understand and predict the world.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.28489","ref_index":172,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Video Generation Models as World Models: Efficient Paradigms, Architectures and Algorithms","primary_cat":"eess.IV","submitted_at":"2026-03-30T14:23:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Video generation models can function as world simulators if efficiency gaps in spatiotemporal modeling are bridged via organized paradigms, architectures, and algorithms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.09985","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"V-JEPA 2: Self-Supervised Video Models Enable Understanding, Prediction and Planning","primary_cat":"cs.AI","submitted_at":"2025-06-11T17:57:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"V-JEPA 2 pre-trained on massive unlabeled video achieves strong results on motion understanding and action anticipation, SOTA video QA at 8B scale, and enables zero-shot robotic planning on Franka arms using only 62 hours of unlabeled robot video.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}