{"total":12,"items":[{"citing_arxiv_id":"2607.02503","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VT-WAM: Visual-Tactile World Action Model for Contact-Rich Manipulation","primary_cat":"cs.RO","submitted_at":"2026-07-02T17:58:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VT-WAM jointly predicts visual futures, tactile deformation, and actions via flow matching with Asymmetric MoT attention and contact-gated AVTAG, reporting 71.67% success on six real-world contact-rich tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02501","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Embodied.cpp: A Portable Inference Runtime of Embodied AI Models on Heterogeneous Robots","primary_cat":"cs.RO","submitted_at":"2026-07-02T17:58:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Embodied.cpp introduces a portable C++ inference runtime with modular layers for deploying VLA and WAM models on heterogeneous robots, reporting 100% and 91% task success on two models plus memory reduction on a WAM benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02195","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bridge-WA: Predicting Where and How the World Changes for Robotic Action","primary_cat":"cs.RO","submitted_at":"2026-07-02T14:03:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Bridge-WA introduces a lightweight distillation-based world-action model that uses future-change priors to improve robotic task success and robustness without deployment-time dense rollouts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13674","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RepWAM: World Action Modeling with Representation Visual-Action Tokenizers","primary_cat":"cs.CV","submitted_at":"2026-06-11T17:59:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RepWAM introduces representation visual-action tokenizers to pretrain world action models that jointly model future visual states and latent actions under instructions for improved robot manipulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12028","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VICX: Generalizable Robot Manipulation via Video Generation and In-Context Operator Network","primary_cat":"cs.RO","submitted_at":"2026-06-10T12:51:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VICX decouples frozen video-based visual planning from in-context visual-to-trajectory mapping via V2T-ICON to achieve cross-task and cross-embodiment generalization in robot manipulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10040","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Efficient-WAM: A 1B-Parameter World-Action Model with Low-Cost Future Imagination","primary_cat":"cs.RO","submitted_at":"2026-06-08T18:14:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Efficient-WAM delivers 30x lower latency than prior WAMs at 100 ms per chunk while keeping competitive manipulation performance by treating coarse future video as guidance rather than high-fidelity output.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09457","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"$\\omega$-EVA: Envision, Verify, and Act with Latent Interactive World Models","primary_cat":"cs.RO","submitted_at":"2026-06-08T13:12:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ω-EVA is a three-stage latent world model framework that trains action-conditioned dynamics, a language-conditioned flow policy, and a tri-branch refiner to improve embodied action generation in simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07089","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dreaming when Necessary: Advancing World Action Models with Adaptive Multi-Modal Reasoning","primary_cat":"cs.RO","submitted_at":"2026-06-05T09:35:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AdaWAM introduces an adaptive router that triggers textual or visual reasoning as needed in world action models, claiming better efficiency and performance than prior embodied policies on simulated and real tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03371","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"See, Infer, Intervene: Proactive World Modeling for Goal-Oriented Social Intelligence","primary_cat":"cs.CL","submitted_at":"2026-06-02T09:18:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces SII framework and PIWM using AIDA and BDI models to predict intent transitions and select from five intervention classes, reporting 0.641 macro F1 with ground-truth state on a new benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01205","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ImagineUAV: Aerial Vision-Language Navigation via World-Action Modeling and Kinodynamic Planning","primary_cat":"cs.RO","submitted_at":"2026-05-31T12:39:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ImagineUAV is a 1.3B-parameter cascaded world-action framework that generates instruction-conditioned future observations via latent video diffusion, infers motions, and applies kinodynamic planning to outperform VLN/VLA baselines in aerial navigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27947","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SANTS: A State-Adaptive Scheduler for World Action Models","primary_cat":"cs.RO","submitted_at":"2026-05-27T04:40:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SANTS adaptively chooses denoising depth in video-based robot action diffusion policies using a state-dependent stopping hazard and noise ratio, trained via downstream action reward to reduce latency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28865","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Emergent Semantic Representations in World Models through Physical Interaction without Linguistic Supervision","primary_cat":"cs.LG","submitted_at":"2026-05-22T03:31:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VAE world model trained on embodied exploration develops latent representations aligned with physical geometry, with metrics improving together and collapsing together under high KL regularization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}