{"total":25,"items":[{"citing_arxiv_id":"2605.13775","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RoboEvolve: Co-Evolving Planner-Simulator for Robotic Manipulation with Limited Data","primary_cat":"cs.RO","submitted_at":"2026-05-13T16:54:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A co-evolutionary VLM-VGM loop on 500 unlabeled images raises planner success by 30 points and simulator success by 48 percent while beating fully supervised baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12624","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MindVLA-U1: VLA Beats VA with Unified Streaming Architecture for Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-05-12T18:09:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MindVLA-U1 introduces a unified streaming VLA with shared backbone, framewise memory, and language-guided action diffusion that surpasses human drivers on WOD-E2E planning metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11809","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond World-Frame Action Heads: Motion-Centric Action Frames for Vision-Language-Action Models","primary_cat":"cs.AI","submitted_at":"2026-05-12T09:03:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MCF-Proto adds a motion-centric local action frame and prototype parameterization to VLA models, inducing emergent geometric structure and improved robustness from standard demonstrations alone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10942","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HarmoWAM: Harmonizing Generalizable and Precise Manipulation via Adaptive World Action Models","primary_cat":"cs.RO","submitted_at":"2026-05-11T17:59:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HarmoWAM unifies predictive and reactive control in world action models via an adaptive gating mechanism to deliver improved zero-shot generalization and precision in robotic manipulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10819","ref_index":50,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ALAM: Algebraically Consistent Latent Action Model for Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-05-11T16:37:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ALAM introduces algebraic consistency regularization on latent action transitions from videos, raising VLA success rates from 47.9% to 85.0% on MetaWorld MT50 and 94.1% to 98.1% on LIBERO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07794","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"NoiseGate: Learning Per-Latent Timestep Schedules as 
Information Gating in World Action Models","primary_cat":"cs.RO","submitted_at":"2026-05-08T14:31:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NoiseGate learns per-latent timestep schedules as an information-gating policy in diffusion-based world action models, yielding consistent gains on RoboTwin manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00078","ref_index":89,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Being-H0.7: A Latent World-Action Model from Egocentric Videos","primary_cat":"cs.RO","submitted_at":"2026-04-30T14:16:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Being-H0.7 adds future-aware latent reasoning to direct VLA policies via dual-branch alignment on latent queries, matching world-model benefits at VLA efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22615","ref_index":66,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GazeVLA: Learning Human Intention for Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2026-04-24T14:46:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GazeVLA pretrains on large human egocentric datasets to capture gaze-based intention, then finetunes on limited robot data with chain-of-thought reasoning to achieve better robotic manipulation performance than baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21241","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CorridorVLA: Explicit Spatial Constraints for Generative Action Heads via Sparse Anchors","primary_cat":"cs.RO","submitted_at":"2026-04-23T03:17:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CorridorVLA improves VLA models by using predicted sparse anchors to impose explicit spatial corridors on action trajectories, yielding 3.4-12.4% success rate gains on LIBERO-Plus with GR00T-Corr reaching 83.21%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17887","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"StableIDM: Stabilizing Inverse Dynamics Model against Manipulator Truncation via Spatio-Temporal Refinement","primary_cat":"cs.RO","submitted_at":"2026-04-20T06:57:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StableIDM stabilizes inverse dynamics models under manipulator truncation by combining robot-centric masking, directional spatial feature aggregation, and temporal dynamics refinement, yielding 12.1% higher strict action accuracy on AgiBot and 9.7-17.6% gains in real-robot tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17862","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"M100: An Orchestrated Dataflow Architecture Powering General AI 
Computing","primary_cat":"cs.LG","submitted_at":"2026-04-20T06:19:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"M100 is a tensor-based dataflow architecture that eliminates heavy caching through compiler-managed data streams, claiming higher utilization and better performance than GPGPUs for AD and LLM inference tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16592","ref_index":185,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Human Cognition in Machines: A Unified Perspective of World Models","primary_cat":"cs.RO","submitted_at":"2026-04-17T17:51:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper introduces a unified framework for world models that fully incorporates all cognitive functions from Cognitive Architecture Theory, highlights under-researched areas in motivation and meta-cognition, and proposes Epistemic World Models as a new category for scientific discovery agents.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"✗ ✓ ✗ ✓ ✓ ✗ ✗Physics-informed World Model jointly learning video, depth, and keypoint dynamics EnerVerse-AC [68] 2025 Robot.✓ ✓ ✗ ✗ ✓ ✗ ✗Chunk-wise autoregressive video diffusion with sparse memory and 4DGS for action-conditioned prediction UWM [232] 2025 Robot.✓ ✗ ✗ ✗ ✓ ✗ ✗Couples video and action dif- fusion in one transformer; pretrained on video-only and video+action data GR-1 [185] 2024 Robot.✓ ✓ ✓ ✗ ✓ ✗ ✗GPT transformer pretrained on 800K Ego4D clips jointly pre- dicting actions and future frames GR-2 [28] 2024 Robot.✓ ✓ ✓ ✗ ✓ ✗ ✗Scaled video-language-action model (719M) achieving 97.7% success across 100+ real tasks UniPi [44] 2023 Robot.✗ ✗ ✓ ✗ ✓ ✗ ✗Text-conditioned video diffusion as policy; extracts actions via in- verse dynamics"},{"citing_arxiv_id":"2604.15281","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"R3D: Revisiting 3D Policy Learning","primary_cat":"cs.CV","submitted_at":"2026-04-16T17:50:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A transformer 3D encoder plus diffusion decoder architecture, with 3D-specific augmentations, outperforms prior 3D policy methods on manipulation benchmarks by improving training stability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14125","ref_index":39,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HiVLA: A Visual-Grounded-Centric Hierarchical Embodied Manipulation System","primary_cat":"cs.CV","submitted_at":"2026-04-15T17:50:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HiVLA decouples VLM-based semantic planning with visual grounding from a cascaded cross-attention DiT action expert, outperforming end-to-end VLAs on long-horizon and fine-grained manipulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11386","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ComSim: Building Scalable Real-World Robot Data Generation via Compositional 
Simulation","primary_cat":"cs.RO","submitted_at":"2026-04-13T12:25:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Compositional Simulation generates scalable real-world robot training data by combining classical simulation with neural simulation in a closed-loop real-sim-real augmentation pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09330","ref_index":72,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VAG: Dual-Stream Video-Action Generation for Embodied Data Synthesis","primary_cat":"cs.RO","submitted_at":"2026-04-10T13:59:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VAG is a synchronized dual-stream flow-matching framework that generates aligned video-action pairs for synthetic embodied data synthesis and policy pretraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08168","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ViVa: A Video-Generative Value Model for Robot Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2026-04-09T12:28:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ViVa turns a video generator into a value model for robot RL that jointly forecasts future states and task value, yielding better performance on real-world box assembly when integrated with RECAP.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03181","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-View Video Diffusion Policy: A 3D Spatio-Temporal-Aware Video Action Model","primary_cat":"cs.RO","submitted_at":"2026-04-03T16:57:06+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MV-VDP jointly predicts multi-view RGB and heatmap videos via diffusion to achieve data-efficient, robust robotic manipulation policies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.21539","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"WorldVLA: Towards Autoregressive Action World Model","primary_cat":"cs.RO","submitted_at":"2025-06-26T17:55:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"WorldVLA unifies VLA and world models in one autoregressive system, shows they boost each other, and adds an attention mask to stop error buildup when generating action chunks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.09985","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"V-JEPA 2: Self-Supervised Video Models Enable Understanding, Prediction and Planning","primary_cat":"cs.AI","submitted_at":"2025-06-11T17:57:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"V-JEPA 2 pre-trained on massive unlabeled video achieves strong results on motion understanding and action anticipation, SOTA video QA at 8B scale, and 
enables zero-shot robotic planning on Franka arms using only 62 hours of unlabeled robot video.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.02792","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Unified World Models: Coupling Video and Action Diffusion for Pretraining on Large Robotic Datasets","primary_cat":"cs.RO","submitted_at":"2025-04-03T17:38:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Unified World Models couple video and action diffusion inside one transformer with independent timesteps, enabling pretraining on heterogeneous robot datasets that include action-free video and producing more generalizable policies than imitation learning alone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.14734","ref_index":92,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots","primary_cat":"cs.RO","submitted_at":"2025-03-18T21:06:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GR00T N1 is a new open VLA foundation model for humanoid robots that outperforms imitation learning baselines in simulation and shows strong performance on real-world bimanual manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.14803","ref_index":126,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Video Prediction Policy: A Generalist Robot Policy with Predictive Visual Representations","primary_cat":"cs.CV","submitted_at":"2024-12-19T12:48:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Video Prediction Policy conditions robot action learning on future-frame predictions inside fine-tuned video diffusion models, yielding 18.6% relative gains on Calvin ABC-D and 31.6% higher real-world success rates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2411.19650","ref_index":68,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CogACT: A Foundational Vision-Language-Action Model for Synergizing Cognition and Action in Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2024-11-29T12:06:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CogACT is a new VLA model that uses a conditioned diffusion action transformer to achieve over 35% higher average success rates than OpenVLA in simulation and 55% in real-robot experiments while generalizing to new robots and objects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.06158","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation","primary_cat":"cs.RO","submitted_at":"2024-10-08T16:00:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GR-2 pre-trains on web-scale videos then fine-tunes on robot data to reach 97.7% average success 
across over 100 manipulation tasks with strong generalization to new scenes and objects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}
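For working with this listing programmatically, the sketch below shows one way to load and query it. It is a minimal example assuming the payload above is saved locally as citations.json (a hypothetical filename); the field names (total, items, limit, offset, citing_arxiv_id, ref_index, confidence, novelty_score, paper_title) are taken verbatim from the response, and the 0.95 confidence threshold is an arbitrary illustration.

```python
import json

# Minimal sketch: load the citation-listing payload shown above.
# "citations.json" is a hypothetical local filename; all field names
# below are copied verbatim from the response.
with open("citations.json") as f:
    payload = json.load(f)

items = payload["items"]
# The response is paginated: "items" holds at most "limit" records
# starting at "offset", out of "total" matching records overall.
assert len(items) <= payload["limit"]

# Example query: high-confidence matches (confidence >= 0.95, an
# illustrative cutoff), sorted by novelty_score in descending order.
high_conf = sorted(
    (it for it in items if it["confidence"] >= 0.95),
    key=lambda it: it["novelty_score"],
    reverse=True,
)
for it in high_conf:
    print(f"{it['citing_arxiv_id']}  ref#{it['ref_index']:>3}  "
          f"novelty={it['novelty_score']:.1f}  {it['paper_title'][:60]}")
```

Run against the payload above, this would surface the four records with confidence 0.98: 2605.13775, 2605.12624, 2605.10819, and 2604.03181.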