{"total":15,"items":[{"citing_arxiv_id":"2606.16776","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"JoyAI-Sim: A Simulation-Enabled Interconversion Toolchain for the Embodied Data Pyramid","primary_cat":"cs.RO","submitted_at":"2026-06-15T14:21:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"JoyAI-Sim provides bidirectional Robot-Simulation-Human pathways for aligned model evaluation and data generation in robotics using the JoySim simulator as an evaluation layer and physical consistency filter.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07107","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Coarse-to-Control: Action-Token Planning for Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-06-05T10:01:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Coarse-to-Control adds planning via coarse action tokens in the same vocabulary as control actions, improving VLA performance on long-horizon manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29360","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MiraBench: Evaluating Action-Conditioned Reliability in Robotic World Models","primary_cat":"cs.AI","submitted_at":"2026-05-28T04:58:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MiraBench defines action-conditioned reliability via three levels (physics adherence, action-following fidelity, optimism bias detection) and applies it to 12 model configurations using a 16,000-judgment human corpus, finding visual fidelity a poor proxy for action fidelity, no reliable scale benefi","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00113","ref_index":137,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"World Models for Robotic Manipulation: A Survey","primary_cat":"cs.RO","submitted_at":"2026-05-27T05:32:17+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Survey organizing world models for robotic manipulation into representation families, a functional taxonomy, and infrastructure roles across pretraining, post-training, and inference, while reviewing 34 datasets and evaluation protocols.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27284","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FineVLA: Fine-Grained Instruction Alignment for Steerable Vision-Language-Action Policies","primary_cat":"cs.RO","submitted_at":"2026-05-26T17:01:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FineVLA unifies robot datasets into 47k fine-grained trajectories, adds a VLM annotator and benchmark, and shows that mixing fine-grained and goal-level instructions improves steerable control without hurting task success.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23847","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Instrumentation for Imitation Learning: Enhancing Training Datasets for Clothes Hanger Insertion","primary_cat":"cs.RO","submitted_at":"2026-05-22T16:59:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Instrumented objects boost diffusion policy success in robotic hanger insertion by 14-25 percentage points over vision-only baselines, and augmenting datasets with instrumented expert rollouts lets a vision-only student match the instrumented expert.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17070","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EPIC-Bench: A Perception-Centric Benchmark for Fine-Grained Embodied Visual Grounding in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-16T16:38:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"EPIC-Bench is a new fine-grained benchmark that shows leading VLMs struggle with multi-target counting, part-whole relations, and affordance detection in real-world embodied visual grounding tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12369","ref_index":24,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GuidedVLA: Specifying Task-Relevant Factors via Plug-and-Play Action Attention Specialization","primary_cat":"cs.RO","submitted_at":"2026-05-12T16:38:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GuidedVLA improves VLA generalization by supervising individual attention heads with manually defined auxiliary signals for three task-relevant factors.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"language-action models with stereo vision.arXiv preprint arXiv:2512.21970, 2025. [23] Danny Driess, Fei Xia, Mehdi SM Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, et al. Palm-e: An embodied multimodal language model. In International Conference on Machine Learning, pages 8469-8488. PMLR, 2023. [24] Frederik Ebert et al. Bridgedata v2: A dataset for robot learning at scale.arXiv preprint arXiv:2308.12952, 2023. [25] Cunxin Fan, Xiaosong Jia, Yihang Sun, Yixiao Wang, Jianglan Wei, Ziyang Gong, Xiangyu Zhao, Masayoshi Tomizuka, Xue Yang, Junchi Yan, et al. Interleave-vla: Enhancing robot manipulation with interleaved image- text instructions. InICLR, 2026."},{"citing_arxiv_id":"2605.12090","ref_index":128,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"World Action Models: The Next Frontier in Embodied AI","primary_cat":"cs.RO","submitted_at":"2026-05-12T13:10:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper introduces World Action Models as a new paradigm unifying predictive world modeling with action generation in embodied foundation models and provides a taxonomy of existing approaches.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Motus [19], Act2Goal [108], PhysGen [22], GigaWorld-Policy [109], UD-VLA [110], X-W AM [111] Training data Robot-centric Teleoperation QT-Opt [112], MIME [ 113], RoboNet [114], Robo T urk-Real [115], BridgeData [116], MT-Opt [117] BC-Z [118], RT-1 [119], Language-Table [120], BridgeData v2 [ 121], Jaco Play [ 122] Cable Routing Dataset [ 123], RH20T [124], OXE [125], DROID [126], RH20T-P [127], RoboMIND [128] ARIO [129], RoboData [130], DexCap [131], FuSe [132], AgiBot World [133], REASSEMBLE [ 134] OmniAction [135], UnifoLM-WBT [136] UMI-style Human Demonstration UMI [137], FastUMI [138], FastUMI-100K [139], RealOmin [140], Hoi! [ 141], RDT2 [142] ActiveUMI [143], exUMI [ 144], Tactile-Conditioned Diffusion Policy [145], DexUMI [ 146] UMI on Legs [ 147], HoMMI [ 148], MV-UMI [149]"},{"citing_arxiv_id":"2605.11479","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Offline Policy Evaluation for Manipulation Policies via Discounted Liveness Formulation","primary_cat":"cs.RO","submitted_at":"2026-05-12T03:54:30+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A liveness-based Bellman operator enables conservative offline policy evaluation for manipulation tasks by encoding task progression and reducing truncation bias from finite horizons.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"B[ ˜Vπ](s)−B[ ˜V ′ π](s) ≤κ|| ˜Vπ − ˜V ′ π||∞. Takes∈ S, and let us denote the immediate next state ofswiths ′. Also take ˜Vπ and ˜V ′ π. In Eq. 34, we make use of the identity|min{a, b} −min{a, c}| ≤ |b−c|. The full derivation is as follows. B[ ˜Vπ](s)−B[ ˜V ′ π](s) = \u0018\u0018\u0018\u0018(1−γ) +γmin{l(s), ˜Vπ(s′)} − \u0010 \u0018\u0018\u0018\u0018(1−γ) +γmin{l(s), ˜V ′ π(s′)} \u0011 (33) =γ min{l(s), ˜Vπ(s′)} −min{l(s), ˜V ′ π(s′)} (34) ≤γ ˜Vπ(s′)− ˜V ′ π(s′) (35) Since B[ ˜Vπ](s)−B[ ˜V ′ π](s) ≤γ ˜Vπ(s′)− ˜V ′ π(s′) holds∀s∈ S, we have B[ ˜Vπ](s)−B[ ˜V ′ π](s) ≤γ|| ˜Vπ − ˜V ′ π||∞ (36) Since the discount factorγis chosen in the open interval of(0,1), it serves as the contraction constantκwe look for in this"},{"citing_arxiv_id":"2605.06481","ref_index":75,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OA-WAM: Object-Addressable World Action Model for Robust Robot Manipulation","primary_cat":"cs.RO","submitted_at":"2026-05-07T16:06:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OA-WAM uses persistent address vectors and dynamic content vectors in object slots to enable addressable world-action prediction, improving robustness on manipulation benchmarks under scene changes.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"language-action model with latent world model.arXiv preprint arXiv:2602.10098, 2026. 13 [74] Khoa V o, Taisei Hanyu, Yuki Ikebe, Trong Thang Pham, Nhat Chung, Minh Nhat Vu, Duy Ho Minh Nguyen, Anh Nguyen, Anthony Gunderman, Chase Rainwater, and Ngan Le. Clutter- robust Vision-Language-Action models through object-centric and geometry grounding.arXiv preprint arXiv:2512.22519, 2025. [75] Homer Walke, Kevin Black, Abraham Lee, Moo Jin Kim, Max Du, Chongyi Zheng, Tony Zhao, Philippe Hansen-Estruch, Quan Vuong, Andre He, Vivek Myers, Kuan Fang, Chelsea Finn, and Sergey Levine. BridgeData V2: A dataset for robot learning at scale. InConference on Robot Learning (CoRL), 2023. arXiv:2308.12952. [76] Guodong Wang, Chenkai Zhang, Qingjie Liu, Jinjin Zhang, Jiancheng Cai, Junjie Liu, and Xinmin Liu."},{"citing_arxiv_id":"2605.06311","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Toward Visually Realistic Simulation: A Benchmark for Evaluating Robot Manipulation in Simulation","primary_cat":"cs.RO","submitted_at":"2026-05-07T14:13:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VISER is a new visually realistic simulation benchmark for robot manipulation tasks that uses PBR materials and MLLM-assisted asset generation, achieving 0.92 Pearson correlation with real-world policy performance.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"to real-world evaluation. A wide range of simulation benchmarks [ 19, 21, 46, 24, 48] have been proposed to facilitate evaluation. Most simulated benchmarks are built on MuJoCo [ 36], which provides plausible physical simulation and fast rasterization, but they lack visual fidelity. Similarly, current robotics datasets, such as RoboTwin [ 26, 5] and ManiTwin [ 39], usually utilize assets generated by 3D generation models, which, despite their scalability, lack the high-fidelity textures necessary for real-world representation. This visual discrepancy makes these evaluation frameworks unreliable; for instance, the policy may fail in the real-world environment with complex materials and illumination, even when it behaves perfectly in simulation environments."},{"citing_arxiv_id":"2512.15692","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"mimic-video: Video-Action Models for Generalizable Robot Control Beyond VLAs","primary_cat":"cs.RO","submitted_at":"2025-12-17T18:47:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"mimic-video combines internet video pretraining with a flow-matching decoder to achieve state-of-the-art robotic manipulation performance with 10x better sample efficiency than vision-language-action models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.03233","ref_index":83,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GraspVLA: a Grasping Foundation Model Pre-trained on Billion-scale Synthetic Action Data","primary_cat":"cs.RO","submitted_at":"2025-05-06T06:59:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GraspVLA shows that pretraining a grasping model on a billion synthetic action frames enables zero-shot open-vocabulary performance and sim-to-real transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.06114","ref_index":111,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning Interactive Real-World Simulators","primary_cat":"cs.AI","submitted_at":"2023-10-09T19:42:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UniSim learns a universal real-world simulator from orchestrated diverse datasets, enabling zero-shot deployment of policies trained purely in simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}