{"total":51,"items":[{"citing_arxiv_id":"2605.13119","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Long-horizon Embodied Agents with Tool-Aligned Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-05-13T07:40:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLAs-as-Tools pairs a VLM planner with specialized VLA executors via a new interface and Tool-Aligned Post-Training to raise long-horizon robot success rates on LIBERO-Long and RoboTwin benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12416","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Aligning Flow Map Policies with Optimal Q-Guidance","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:12:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Flow map policies enable fast one-step inference for flow-based RL policies, and FMQ provides an optimal closed-form Q-guided target for offline-to-online adaptation under trust-region constraints, achieving SOTA performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12334","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforcing VLAs in Task-Agnostic World Models","primary_cat":"cs.AI","submitted_at":"2026-05-12T16:16:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RAW-Dream lets VLAs learn new tasks in zero-shot imagination by using a world model pre-trained only on task-free behaviors and an unmodified VLM to supply rewards, with dual-noise verification to limit hallucinations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12090","ref_index":132,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"World Action Models: The Next Frontier in Embodied AI","primary_cat":"cs.RO","submitted_at":"2026-05-12T13:10:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper introduces World Action Models as a new paradigm unifying predictive world modeling with action generation in embodied foundation models and provides a taxonomy of existing approaches.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Robot-centric Teleoperation QT-Opt [112], MIME [ 113], RoboNet [114], Robo T urk-Real [115], BridgeData [116], MT-Opt [117] BC-Z [118], RT-1 [119], Language-Table [120], BridgeData v2 [ 121], Jaco Play [ 122] Cable Routing Dataset [ 123], RH20T [124], OXE [125], DROID [126], RH20T-P [127], RoboMIND [128] ARIO [129], RoboData [130], DexCap [131], FuSe [132], AgiBot World [133], REASSEMBLE [ 134] OmniAction [135], UnifoLM-WBT [136] UMI-style Human Demonstration UMI [137], FastUMI [138], FastUMI-100K [139], RealOmin [140], Hoi! 
[ 141], RDT2 [142] ActiveUMI [143], exUMI [ 144], Tactile-Conditioned Diffusion Policy [145], DexUMI [ 146] UMI on Legs [ 147], HoMMI [ 148], MV-UMI [149] Simulation Data MimicGen [150], ManiSkill2 [ 151], RoboCasa [152], Robo T win [153], DexMimicGen [ 154]"},{"citing_arxiv_id":"2605.11567","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Execution Commitment of Vision-Language-Action Models","primary_cat":"cs.CV","submitted_at":"2026-05-12T05:52:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A3 determines the execution horizon in VLA models as the longest prefix of actions that passes consensus-based verification and sequential consistency checks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09613","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SABER: A Scalable Action-Based Embodied Dataset for Real-World VLA Adaptation","primary_cat":"cs.RO","submitted_at":"2026-05-10T15:51:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SABER provides 44.8K multi-representation action samples from unscripted retail environments that raise a VLA model's mean success rate on ten manipulation tasks from 13.4% to 29.3%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08774","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ProcVLM: Learning Procedure-Grounded Progress Rewards for Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2026-05-09T08:00:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ProcVLM learns procedure-grounded dense progress rewards for robotic manipulation via a reasoning-before-estimation VLM trained on a 60M-frame synthesized corpus from 30 embodied datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06481","ref_index":58,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OA-WAM: Object-Addressable World Action Model for Robust Robot Manipulation","primary_cat":"cs.RO","submitted_at":"2026-05-07T16:06:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OA-WAM uses persistent address vectors and dynamic content vectors in object slots to enable addressable world-action prediction, improving robustness on manipulation benchmarks under scene changes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06747","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HumanNet: Scaling Human-centric Video Learning to One Million Hours","primary_cat":"cs.CV","submitted_at":"2026-05-07T15:21:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HumanNet is a 1M-hour human-centric video dataset with interaction annotations that enables better vision-language-action model performance than equivalent robot data in a controlled 
test.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06311","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Toward Visually Realistic Simulation: A Benchmark for Evaluating Robot Manipulation in Simulation","primary_cat":"cs.RO","submitted_at":"2026-05-07T14:13:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VISER is a new visually realistic simulation benchmark for robot manipulation tasks that uses PBR materials and MLLM-assisted asset generation, achieving 0.92 Pearson correlation with real-world policy performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04647","ref_index":55,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReflectDrive-2: Reinforcement-Learning-Aligned Self-Editing for Discrete Diffusion Driving","primary_cat":"cs.RO","submitted_at":"2026-05-06T08:52:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReflectDrive-2 combines masked discrete diffusion with RL-aligned self-editing to generate and refine driving trajectories, reaching 91.0 PDMS on NAVSIM camera-only and 94.8 in best-of-6.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01544","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"An Efficient Metric for Data Quality Measurement in Imitation Learning","primary_cat":"cs.RO","submitted_at":"2026-05-02T17:16:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Power spectral density of trajectories ranks demonstration quality for imitation learning, enabling rollout-free curation that improves fine-tuned policy success.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01477","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Action Agent: Agentic Video Generation Meets Flow-Constrained Diffusion","primary_cat":"cs.RO","submitted_at":"2026-05-02T14:52:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Action Agent pairs LLM-driven video generation with a flow-constrained diffusion transformer to produce velocity commands, raising video success to 86% and delivering 64.7% real-world navigation on a Unitree G1 humanoid.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00397","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MiniVLA-Nav v1: A Multi-Scene Simulation Dataset for Language-Conditioned Robot Navigation","primary_cat":"cs.RO","submitted_at":"2026-05-01T04:36:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MiniVLA-Nav v1 provides 1,174 episodes of language-instructed robot navigation in photorealistic simulations with RGB, depth, segmentation, and expert action 
data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00244","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Lucid-XR: An Extended-Reality Data Engine for Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2026-04-30T21:25:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Lucid-XR uses XR-headset physics simulation and physics-guided video generation to create synthetic data that trains robot policies transferring zero-shot to unseen real-world manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.28197","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OmniRobotHome: A Multi-Camera Platform for Real-Time Multiadic Human-Robot Interaction","primary_cat":"cs.RO","submitted_at":"2026-04-30T17:59:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A 48-camera residential platform delivers real-time occlusion-robust 3D perception and coordinated actuation for multi-human multi-robot interaction in a shared home workspace.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27472","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PRTS: A Primitive Reasoning and Tasking System via Contrastive Representations","primary_cat":"cs.AI","submitted_at":"2026-04-30T06:14:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PRTS pretrains VLA models with contrastive goal-conditioned RL to embed goal-reachability probabilities from offline data, yielding SOTA results on robotic benchmarks especially for long-horizon and novel instructions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26689","ref_index":5,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Atomic-Probe Governance for Skill Updates in Compositional Robot Policies","primary_cat":"cs.RO","submitted_at":"2026-04-29T13:56:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A cross-version swap protocol reveals dominant skills that swing composition success by up to 50 percentage points, and an atomic probe with selective revalidation governs updates at lower cost than always re-testing full compositions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24182","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"$M^2$-VLA: Boosting Vision-Language Models for Generalizable Manipulation via Layer Mixture and Meta-Skills","primary_cat":"cs.RO","submitted_at":"2026-04-27T08:44:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"M²-VLA shows that generalized VLMs can serve as direct backbones for robotic manipulation by selectively extracting task-critical features via Mixture of Layers and adding Meta Skill Modules for efficient trajectory 
learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23001","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Vision-Language-Action in Robotics: A Survey of Datasets, Benchmarks, and Data Engines","primary_cat":"cs.RO","submitted_at":"2026-04-24T20:41:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A survey of VLA robotics research identifies data infrastructure as the primary bottleneck and distills four open challenges in representation alignment, multimodal supervision, reasoning assessment, and scalable data generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22551","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"QDTraj: Exploration of Diverse Trajectory Primitives for Articulated Objects Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2026-04-24T13:45:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"QDTraj uses Quality-Diversity algorithms with sparse rewards to produce at least five times more diverse high-performing trajectories for articulated object manipulation than compared methods, validated across 30 objects with hundreds of trajectories per task.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22227","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Co-Evolutionary Theory of Human-AI Coexistence: Mutualism, Governance, and Dynamics in Complex Societies","primary_cat":"cs.CY","submitted_at":"2026-04-24T05:02:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Human-AI coexistence is best modeled as conditional mutualism under governance, formalized as a multiplex dynamical system whose simulations show stable high-coexistence equilibria only under balanced institutional oversight.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20100","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"JoyAI-RA 0.1: A Foundation Model for Robotic Autonomy","primary_cat":"cs.RO","submitted_at":"2026-04-22T01:51:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"JoyAI-RA is a multi-source pretrained VLA model that claims to bridge human-to-robot embodiment gaps via data unification and outperforms prior methods on generalization-heavy robotic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19728","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VLA Foundry: A Unified Framework for Training Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-04-21T17:51:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VLA Foundry provides a single training stack for VLA models and releases open models that match prior closed-source performance or outperform baselines on multi-task manipulation in 
simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17800","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ReFineVLA: Multimodal Reasoning-Aware Generalist Robotic Policies via Teacher-Guided Fine-Tuning","primary_cat":"cs.RO","submitted_at":"2026-04-20T04:46:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ReFineVLA adds teacher-generated reasoning steps to VLA training and reports state-of-the-art success rates on SimplerEnv WidowX and Google Robot benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15483","ref_index":79,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"${\\pi}_{0.7}$: a Steerable Generalist Robotic Foundation Model with Emergent Capabilities","primary_cat":"cs.LG","submitted_at":"2026-04-16T19:18:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"π₀.₇ is a steerable generalist robotic model that uses rich multimodal prompts including language, subgoal images, and performance metadata to achieve out-of-the-box generalization across tasks and robot bodies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13733","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Jump-Start Reinforcement Learning with Vision-Language-Action Regularization","primary_cat":"cs.LG","submitted_at":"2026-04-15T11:17:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VLAJS augments PPO with sparse annealed VLA guidance through directional regularization to cut required interactions by over 50% on manipulation tasks and enable zero-shot sim-to-real transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13001","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"XRZero-G0: Pushing the Frontier of Dexterous Robotic Manipulation with Interfaces, Quality and Ratios","primary_cat":"cs.RO","submitted_at":"2026-04-14T17:34:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"XRZero-G0 enables 2000-hour robot-free datasets that, when mixed 10:1 with real-robot data, match full real-robot performance at 1/20th the cost and support zero-shot transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11174","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"EmbodiedGovBench: A Benchmark for Governance, Recovery, and Upgrade Safety in Embodied Agent Systems","primary_cat":"cs.RO","submitted_at":"2026-04-13T08:34:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EmbodiedGovBench is a new benchmark framework that measures embodied agent systems on seven governance dimensions including policy adherence, recovery success, and upgrade 
safety.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10809","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"WARPED: Wrist-Aligned Rendering for Robot Policy Learning from Egocentric Human Demonstrations","primary_cat":"cs.RO","submitted_at":"2026-04-12T20:40:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WARPED synthesizes realistic wrist-view observations from monocular egocentric human videos via foundation models, hand-object tracking, retargeting, and Gaussian Splatting to train visuomotor policies that match teleoperation success rates on five tabletop tasks with 5-8x less collection effort.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10333","ref_index":112,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Zero-shot World Models Are Developmentally Efficient Learners","primary_cat":"cs.AI","submitted_at":"2026-04-11T19:32:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A zero-shot visual world model trained on one child's experience achieves broad competence on physical understanding benchmarks while matching developmental behavioral patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08544","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SIM1: Physics-Aligned Simulator as Zero-Shot Data Scaler in Deformable Worlds","primary_cat":"cs.RO","submitted_at":"2026-04-09T17:59:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SIM1 converts sparse real demonstrations into high-fidelity synthetic data through physics-aligned simulation, yielding policies that match real-data performance at a 1:15 ratio with 90% zero-shot success on deformable manipulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08059","ref_index":19,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Governed Capability Evolution: Lifecycle-Time Compatibility Checking and Rollback for AI-Component-Based Systems, with Embodied Agents as Case Study","primary_cat":"cs.RO","submitted_at":"2026-04-09T10:18:51+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A governed capability evolution framework with interface, policy, behavioral, and recovery checks reduces unsafe activations to zero in embodied agent upgrades while preserving task success rates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03781","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OpenRC: An Open-Source Robotic Colonoscopy Framework for Multimodal Data Acquisition and Autonomy Research","primary_cat":"cs.RO","submitted_at":"2026-04-04T16:07:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OpenRC is an open-source robotic colonoscopy platform with hardware retrofit and a multimodal dataset of nearly 1,900 episodes for autonomy and VLA 
research.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.13778","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"InternVLA-M1: A Spatially Guided Vision-Language-Action Framework for Generalist Robot Policy","primary_cat":"cs.RO","submitted_at":"2025-10-15T17:30:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InternVLA-M1 uses spatially guided pre-training on 2.3M examples followed by action post-training to deliver up to 17% gains on robot manipulation benchmarks and 20.6% on unseen objects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.02792","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unified World Models: Coupling Video and Action Diffusion for Pretraining on Large Robotic Datasets","primary_cat":"cs.RO","submitted_at":"2025-04-03T17:38:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Unified World Models couple video and action diffusion inside one transformer with independent timesteps, enabling pretraining on heterogeneous robot datasets that include action-free video and producing more generalizable policies than imitation learning alone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.19645","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Fine-Tuning Vision-Language-Action Models: Optimizing Speed and Success","primary_cat":"cs.RO","submitted_at":"2025-02-27T00:30:29+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OpenVLA-OFT fine-tuning boosts LIBERO success rate from 76.5% to 97.1%, speeds action generation 26x, and outperforms baselines on real bimanual dexterous tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.05855","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DexVLA: Vision-Language Model with Plug-In Diffusion Expert for General Robot Control","primary_cat":"cs.RO","submitted_at":"2025-02-09T11:25:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DexVLA combines a scaled diffusion action expert with embodiment curriculum learning to achieve better generalization and performance than prior VLA models on diverse robot hardware and long-horizon tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.09747","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"FAST: Efficient Action Tokenization for Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2025-01-16T18:57:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FAST applies discrete cosine transform to robot action sequences for efficient tokenization, enabling autoregressive VLAs to succeed on high-frequency dexterous tasks and scale to 10k hours of data while matching diffusion VLA performance with up to 5x faster 
training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.14803","ref_index":116,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Video Prediction Policy: A Generalist Robot Policy with Predictive Visual Representations","primary_cat":"cs.CV","submitted_at":"2024-12-19T12:48:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Video Prediction Policy conditions robot action learning on future-frame predictions inside fine-tuned video diffusion models, yielding 18.6% relative gains on Calvin ABC-D and 31.6% higher real-world success rates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2411.19650","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CogACT: A Foundational Vision-Language-Action Model for Synergizing Cognition and Action in Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2024-11-29T12:06:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CogACT is a new VLA model that uses a conditioned diffusion action transformer to achieve over 35% higher average success rates than OpenVLA in simulation and 55% in real-robot experiments while generalizing to new robots and objects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.24164","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","primary_cat":"cs.LG","submitted_at":"2024-10-31T17:22:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"π₀ is a vision-language-action flow model trained on diverse multi-platform robot data that supports zero-shot task performance, language instruction following, and efficient fine-tuning for dexterous tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.06158","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation","primary_cat":"cs.RO","submitted_at":"2024-10-08T16:00:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GR-2 pre-trains on web-scale videos then fine-tunes on robot data to reach 97.7% average success across over 100 manipulation tasks with strong generalization to new scenes and objects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.09246","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OpenVLA: An Open-Source Vision-Language-Action Model","primary_cat":"cs.RO","submitted_at":"2024-06-13T15:46:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OpenVLA achieves 16.5% higher task success than the 55B RT-2-X model across 29 tasks with 7x fewer parameters while enabling effective fine-tuning and quantization without performance 
loss.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Closed-Loop Robot Control Policy User: Wipe the table. OpenVLA: [x, , Grip] = …ΔΔθΔ Multi-Robot Control & Eﬃcient Fine-Tuning Large-Scale Robot Training Data Fully Data Weights Code Open-Source Figure 1: We present OpenVLA, a 7B-parameter open-source vision-language-action model (VLA), trained on 970k robot episodes from the Open X-Embodiment dataset [ 1]. OpenVLA sets a new state of the art for generalist robot manipulation policies. It supports controlling multiple robots out of the box and can be quickly adapted to new robot domains via parameter-efficient fine-tuning. The OpenVLA checkpoints and PyTorch training pipeline are fully open-source and models can be downloaded and fine-tuned from HuggingFace."},{"citing_arxiv_id":"2406.02523","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots","primary_cat":"cs.RO","submitted_at":"2024-06-04T17:41:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RoboCasa supplies a large-scale kitchen simulator, generative assets, 100 tasks, and automated data pipelines that produce a clear scaling trend in imitation learning for generalist robots.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2405.12213","ref_index":68,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Octo: An Open-Source Generalist Robot Policy","primary_cat":"cs.RO","submitted_at":"2024-05-20T17:57:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Octo is an open-source transformer-based generalist robot policy pretrained on 800k trajectories that serves as an effective initialization for finetuning across diverse robotic platforms.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"ghosh, homer_walke, pertsch, kvablack, oier.mees}@berkeley.edu arXiv:2405.12213v2 [cs.RO] 26 May 2024 experience from other robots and tasks offers a possible solution, exposing models to a diverse set of robotic control problems that may improve generalization and performance on downstream tasks. However, even as general-purpose models become ubiquitous in natural language [ 68, 88]) and computer vision [76, 44], it has proven challenging to build the analogous \"general-purpose robot model\" that can control many robots for many tasks. 
Training a unified control policy in robotics presents unique challenges, requiring handling different robot embodiments, sensor setups, action spaces, task specifications, environments, and compute budgets."},{"citing_arxiv_id":"2405.05941","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Evaluating Real-World Robot Manipulation Policies in Simulation","primary_cat":"cs.RO","submitted_at":"2024-05-09T17:30:16+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SIMPLER simulated environments yield policy performance that correlates strongly with real-world robot manipulation results and captures similar sensitivity to distribution shifts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.12945","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DROID: A Large-Scale In-The-Wild Robot Manipulation Dataset","primary_cat":"cs.RO","submitted_at":"2024-03-19T17:48:38+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DROID is a new 76k-trajectory in-the-wild robot manipulation dataset spanning 564 scenes and 84 tasks that improves policy performance and generalization when used for training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.09631","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"3D-VLA: A 3D Vision-Language-Action Generative World Model","primary_cat":"cs.CV","submitted_at":"2024-03-14T17:58:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"3D-VLA is a new embodied foundation model that uses a 3D LLM plus aligned diffusion models to generate future images and point clouds for improved reasoning and action planning in 3D environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.12289","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DriveVLM: The Convergence of Autonomous Driving and Large Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2024-02-19T17:04:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DriveVLM adds vision-language models with scene description, analysis, and hierarchical planning modules to autonomous driving, paired with a hybrid DriveVLM-Dual system tested on nuScenes and SUP-AD datasets and deployed on a production vehicle.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}
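The block above is one page of a paginated citation-report payload (total 51 items, limit 50, offset 0). As a minimal sketch of how such a page could be consumed, the Python snippet below loads the JSON and summarizes it. The filename citations.json is hypothetical, the payload is assumed to be saved as well-formed JSON (i.e. with the wrapped lines above rejoined), and the meanings of verdict, novelty_score, context_count, and the total/limit/offset pagination fields are inferred from the dump itself rather than from any documented API.

```python
# Minimal sketch for consuming the citation-report page shown above.
# Assumptions (not part of the original dump): the payload is saved locally as
# "citations.json"; field semantics are inferred from the dump itself.
import json
from collections import Counter
from statistics import mean

with open("citations.json", encoding="utf-8") as fh:
    payload = json.load(fh)

items = payload["items"]

# Group citing papers by review verdict (e.g. UNVERDICTED / CONDITIONAL / ACCEPT).
verdicts = Counter(item["verdict"] for item in items)

# Average novelty score, skipping items where the field is missing or null.
scores = [item["novelty_score"] for item in items
          if item.get("novelty_score") is not None]

# Items that carry at least one extracted citation context.
with_context = [item for item in items if item["context_count"] > 0]

print(f"{len(items)} items on this page, verdict breakdown: {dict(verdicts)}")
if scores:
    print(f"mean novelty score: {mean(scores):.2f}")
print(f"items with extracted context: {len(with_context)}")

# Pagination: total=51 with limit=50 and offset=0 implies one further page,
# assuming the usual offset semantics (a follow-up request with offset=50).
remaining = payload["total"] - (payload["offset"] + len(items))
if remaining > 0:
    print(f"{remaining} item(s) not included in this page")
```

The pagination check at the end is only a reading of the total/limit/offset fields present in the dump; how a second page would actually be requested depends on the service that produced this payload and is not specified here.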