{"total":19,"items":[{"citing_arxiv_id":"2605.13013","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"JEDI: Joint Embedding Diffusion World Model for Online Model-Based Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-13T05:07:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"JEDI is the first online end-to-end latent diffusion world model that trains latents from denoising loss rather than reconstruction, achieving competitive Atari100k results with 43% less VRAM and over 3x faster sampling than pixel diffusion baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09693","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Do multimodal models imagine electric sheep?","primary_cat":"cs.CV","submitted_at":"2026-05-10T18:25:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fine-tuning VLMs to output action sequences for puzzles causes emergent internal visual representations that improve performance when integrated into reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07554","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ProteinJEPA: Latent prediction complements protein language models","primary_cat":"cs.LG","submitted_at":"2026-05-08T10:30:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Masked-position MLM plus JEPA latent prediction outperforms MLM-only pretraining on 10-11 of 16 downstream tasks for 35M-150M protein models while JEPA alone fails.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07514","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Is the Future Compatible? Diagnosing Dynamic Consistency in World Action Models","primary_cat":"cs.RO","submitted_at":"2026-05-08T09:44:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Action-state consistency in World Action Models distinguishes successful from failed imagined futures and supports value-free selection of better rollouts via consensus among predictions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07390","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ST-Gen4D: Embedding 4D Spatiotemporal Cognition into World Model for 4D Generation","primary_cat":"cs.CV","submitted_at":"2026-05-08T07:44:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ST-Gen4D uses a world model that fuses global appearance and local dynamic graphs into a 4D cognition representation to guide consistent 4D Gaussian generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07278","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Predictive but Not Plannable: RC-aux for Latent World Models","primary_cat":"cs.LG","submitted_at":"2026-05-08T05:43:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RC-aux corrects spatiotemporal mismatch in reconstruction-free latent world models by adding multi-horizon prediction and reachability supervision, improving planning performance on goal-conditioned pixel-control tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06841","ref_index":64,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AGWM: Affordance-Grounded World Models for Environments with Compositional Prerequisites","primary_cat":"cs.AI","submitted_at":"2026-05-07T18:46:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AGWM improves world model accuracy in compositional environments by learning an explicit DAG of action affordance prerequisites to handle dynamic executability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06298","ref_index":16,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Render, Don't Decode: Weight-Space World Models with Latent Structural Disentanglement","primary_cat":"cs.CV","submitted_at":"2026-05-07T14:02:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NOVA represents world states as INR weights for decoder-free rendering, compactness, and unsupervised disentanglement of background, foreground, and motion in video world models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05586","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AeroJEPA: Learning Semantic Latent Representations for Scalable 3D Aerodynamic Field Modeling","primary_cat":"cs.LG","submitted_at":"2026-05-07T02:11:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AeroJEPA applies joint-embedding predictive learning to produce scalable, semantically organized latent representations for 3D aerodynamic fields that support both field reconstruction and downstream design tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03413","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning to Theorize the World from Observation","primary_cat":"cs.LG","submitted_at":"2026-05-05T06:39:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NEO induces compositional latent programs as world theories from observations and executes them to enable explanation-driven generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01694","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Latent State Design for World Models under Sufficiency Constraints","primary_cat":"cs.AI","submitted_at":"2026-05-03T03:19:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"World models succeed when their latent states are built to meet task-specific sufficiency constraints rather than preserving the maximum amount of information.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00080","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"World Model for Robot Learning: A Comprehensive Survey","primary_cat":"cs.RO","submitted_at":"2026-04-30T14:35:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A comprehensive survey that organizes the literature on world models in robot learning, their roles in policy learning, planning, simulation, and video-based generation, with connections to navigation, driving, datasets, and benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27411","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Detecting is Easy, Adapting is Hard: Local Expert Growth for Visual Model-Based Reinforcement Learning under Distribution Shift","primary_cat":"cs.LG","submitted_at":"2026-04-30T04:28:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"JEPA-Indexed Local Expert Growth adds local action corrections for detected shift clusters and yields statistically significant OOD gains on four shift conditions while keeping in-distribution performance intact.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24662","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Information bottleneck for learning the phase space of dynamics from high-dimensional experimental data","primary_cat":"physics.data-an","submitted_at":"2026-04-27T16:24:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DySIB recovers a two-dimensional representation matching the phase space of a physical pendulum from high-dimensional video data by maximizing predictive mutual information in latent space.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18058","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Sonata: A Hybrid World Model for Inertial Kinematics under Clinical Data Scarcity","primary_cat":"cs.LG","submitted_at":"2026-04-20T10:26:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sonata is a small hybrid world model pre-trained to predict future IMU states that outperforms autoregressive baselines on clinical discrimination, fall-risk prediction, and cross-cohort transfer while fitting on-device wearables.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11302","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"3D-Anchored Lookahead Planning for Persistent Robotic Scene Memory via World-Model-Based MCTS","primary_cat":"cs.RO","submitted_at":"2026-04-13T11:01:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"3D-ALP achieves 0.65 success on memory-dependent 5-step robotic reach tasks versus near-zero for reactive baselines by anchoring MCTS planning to a persistent 3D camera-to-world frame.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05157","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"IntentScore: Intent-Conditioned Action Evaluation for Computer-Use Agents","primary_cat":"cs.AI","submitted_at":"2026-04-06T20:39:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IntentScore learns intent-conditioned action scores from offline GUI trajectories and raises task success by 6.9 points on an unseen agent and environment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.29496","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Metriplector: From Field Theory to Neural Architecture","primary_cat":"cs.AI","submitted_at":"2026-03-31T09:40:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Metriplector treats neural computation as coupled metriplectic field dynamics whose stress-energy tensor readout achieves competitive results on vision, control, Sudoku, language modeling, and pathfinding with small parameter counts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.28489","ref_index":224,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Video Generation Models as World Models: Efficient Paradigms, Architectures and Algorithms","primary_cat":"eess.IV","submitted_at":"2026-03-30T14:23:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Video generation models can function as world simulators if efficiency gaps in spatiotemporal modeling are bridged via organized paradigms, architectures, and algorithms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}