{"total":338,"items":[{"citing_arxiv_id":"2607.02092","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Guided Action Flow: Q-Guided Inference for Flow-Matching Vision-Language-Action Policies","primary_cat":"cs.RO","submitted_at":"2026-07-02T12:30:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Guided Action Flow applies a rollout-trained critic to steer frozen flow-matching VLA policies at inference time via action gradients, reporting success rate gains on LIBERO manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01586","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VLAFlow: A Unified Training Framework for Vision-Language-Action Models via Co-training and Future Latent Alignment","primary_cat":"cs.CV","submitted_at":"2026-07-02T01:38:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLAFlow shows that combining language-supervised co-training with future latent alignment produces the most stable transfer performance for vision-language-action models trained on mixed robot data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01060","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RoboWorld: Fast and Reliable Neural Simulators for Generalist Robot Policy Evaluation","primary_cat":"cs.RO","submitted_at":"2026-07-01T15:22:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RoboWorld introduces an automated pipeline using autoregressive video world models and task-progress VLM scoring, plus Step Forcing for long-horizon stability, to achieve high correlation with real robot policy evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00678","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ABot-M0.5: Unified Mobility-and-Manipulation World Action Model","primary_cat":"cs.CV","submitted_at":"2026-07-01T09:21:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ABot-M0.5 proposes a unified mobility-and-manipulation world action model using three alignment strategies that achieves state-of-the-art performance on mobile and fine-grained manipulation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00351","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unleashing More Actions via Action Compositional Training for VLA Models","primary_cat":"cs.RO","submitted_at":"2026-07-01T02:48:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ACT-VLA synthesizes novel demonstrations from existing VLA tasks via latent representations to reduce overfitting and improve generalization on manipulation tasks in simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00168","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Data Sharing and Competition in Learning-by-Deploying Industries: Insights from Robotics and Beyond","primary_cat":"cs.GT","submitted_at":"2026-06-30T20:44:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"In a two-period game-theoretic model of learning-by-deploying, data pooling raises welfare with fixed prices but can turn privately unprofitable under Cournot competition, with a sustainability threshold set by demand elasticity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.32009","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Human-as-Humanoid: Enabling Zero-Shot Humanoid Learning from Ego-Exo Human Videos with Human-Aligned Embodiments","primary_cat":"cs.RO","submitted_at":"2026-06-30T17:44:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Human-as-Humanoid converts ego-exo human videos into executable 60-DoF humanoid actions through embodiment alignment and retargeting, enabling zero-shot real-robot policy deployment without target-task teleoperation data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31958","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Adapting Generalist Robot Policies with Semantic Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2026-06-30T17:00:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SARL optimizes language prompt inputs to generalist vision-language-action policies through online RL to solve complex long-horizon tasks by composing existing skills.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31682","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HABIT: Human-Aware Behavior and Interaction Training Dataset for Robot Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-30T13:58:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"HABIT is a large-scale robot demonstration dataset for human-present environments that elicits spatiotemporal synchronization, yielding, and gesture grounding behaviors absent from robot-only training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30456","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Vision-Language-Action Models: Experimental Insights from a Real-World UR5 Platform","primary_cat":"cs.RO","submitted_at":"2026-06-29T15:23:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Real-robot trials with OpenVLA on a UR5e arm show consistent offline-to-closed-loop gaps driven by action semantics, coordinate conventions, temporal alignment, image preprocessing, and dataset quality rather than model capacity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30113","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SA-VLA: State-aware tokenizer for improving Vision-Language-Action Models' performance","primary_cat":"cs.RO","submitted_at":"2026-06-29T10:45:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SA-VLA adds state conditioning to VQ-based action tokenization in VLA policies, expanding each discrete token's effective support to state-dependent actions and raising average success rates from 0.29 to 0.56 on 12 sim tasks and 0.15 to 0.33 on 3 real tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30111","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Automating the Design of Embodied AgentArchitectures","primary_cat":"cs.RO","submitted_at":"2026-06-29T10:45:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Automated architecture search for embodied agents produces directional success-rate gains on vision-language and manipulation tasks while exposing limits from simulation noise and incomplete credit assignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29892","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Trust Your Instincts: Confidence-Driven Test-Time RL for Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-06-29T07:31:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"T^2VLA is a test-time reinforcement learning framework for VLAs that uses internal confidence to define intrinsic rewards via similarity to high-confidence expert demonstrations and a dual-expert bootstrapping mechanism.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29774","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Analytic Concept-Centric Memory for Agentic Embodied Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-29T04:33:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes a structured concept-centric memory system for embodied agents that connects object, scene, transition, and skill memories to support coarse-to-fine retrieval and improve task performance over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29699","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Early Warning Signals for OpenVLA Failure under Visual Distribution Shift","primary_cat":"cs.CV","submitted_at":"2026-06-29T02:07:17+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"OpenVLA layer-16 activations allow a logistic probe to predict failure within 15 steps under occlusion (AUROC 0.972) better than baselines, with some transfer to camera jitter.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29517","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CORE: Common Outcome Regularities from Action-Free Visual Demonstrations for Robot Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-28T17:27:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CORE extracts visual goal prototypes from terminal embeddings in action-free demonstrations to condition robot policies, reporting success rate gains of up to 17 percentage points on manipulation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29384","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Event-VLA: Action-Conditioned Event Fusion for Robust Vision-Language-Action Model","primary_cat":"cs.CV","submitted_at":"2026-06-28T13:19:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Event-VLA integrates event streams into VLA models through action-conditioned gated cross-attention to maintain performance in normal light while improving success rates under low-light and near-dark conditions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29201","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Behavior Uncloning: Distilling Mode Redirection into Policy Weights without Inference-Time Steering","primary_cat":"cs.RO","submitted_at":"2026-06-28T05:01:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MoRE improves robot policy success rates by 44 percentage points by distilling mode redirection into weights, matching filtered retraining performance without inference overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29148","ref_index":171,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GPC: Large-Scale Generative Pretraining for Transferable Motor Control","primary_cat":"cs.CV","submitted_at":"2026-06-28T02:05:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GPC learns a motion vocabulary via Finite Scalar Quantization and end-to-end RL, then trains an autoregressive transformer for next-token control generation, achieving 99.98% motion reproduction success with emergent robustness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29089","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TAP-VLA: Tactile Annotation Prompting for Vision Language Action Models","primary_cat":"cs.RO","submitted_at":"2026-06-27T21:06:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TAP-VLA improves VLA performance in contact-rich manipulation by visually annotating tactile shear fields onto input images, reaching 78% success versus under 50% for vision-only and other tactile methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28529","ref_index":5,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Speedup Paradox: Rethinking Inference Speed-Quality Trade-off in Embodied Tasks","primary_cat":"cs.RO","submitted_at":"2026-06-26T18:28:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TISED decomposes inference optimization effects on embodied tasks and identifies paradoxical outcomes where faster per-step inference can increase task completion time on static tasks or raise success rates on dynamic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28276","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SimFoundry: Modular and Automated Scene Generation for Policy Learning and Evaluation","primary_cat":"cs.RO","submitted_at":"2026-06-26T17:18:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SimFoundry automates zero-shot real-to-sim scene generation from video, producing digital twins and cousins that enable policy training with 0.911 mean Pearson correlation to real-world results and 17-40% success gains from variations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28192","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PA-BiCoop: A Primary-Auxiliary Cooperative Framework for General Bimanual Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-26T15:38:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"PA-BiCoop introduces a single-model bimanual framework with primary-auxiliary arm differentiation, specialized decoders, and dynamic role assignment that reports 48% average gains on RLBench2 and over 50% in real-world tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28152","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Regularized Reward-Punishment Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-06-26T14:50:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces KCPR and its deep form klDMP that couples reward and punishment policies via learned priors, yielding improved safety and stability in grid-world and Gazebo navigation tasks over DQN, SQL and softDMP.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27872","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"S$^2$-VLA: State-Space Guided Vision-Language-Action Models for Long-Horizon Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-26T09:13:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"S²-VLA uses a state-space model to maintain a belief state that produces dynamic gating weights for fusing visual, language, and action features, claiming better long-horizon manipulation than 7B models with only 2B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27295","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LA4VLA: Learning to Act without Seeing via Language-Action Pretraining","primary_cat":"cs.RO","submitted_at":"2026-06-25T17:13:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LA4VLA creates a 33K language-action dataset from existing demos and shows that pretraining on language-action pairs before or alongside vision-language-action training boosts success rates in sim and real robot tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27079","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ForesightSafety-VLA: A Unified Diagnostic Safety Benchmark for Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-06-25T14:19:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ForesightSafety-VLA creates a diagnostic benchmark for VLA safety with taxonomy across physical, language, and visual risks, showing perception and structure variations cause more safety degradation than language changes in tested models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26800","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SSI-Policy: Learning Structured Scene Interfaces for Vision-Language Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-25T09:38:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SSI-Policy uses an RGB-only Structured Scene Interface to improve LIBERO benchmark performance by nearly 15% with only 10 demonstrations per task compared to prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23686","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LIBERO-Safety: A Comprehensive Benchmark for Physical and Semantic Safety in Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-06-22T17:59:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LIBERO-Safety supplies a scalable benchmark, data-generation pipeline, and 19,664-demonstration dataset that exposes a generalization-safety tension in current VLA models where diverse training improves collision avoidance but task success stays limited by trajectory quality and semantic understandi","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23531","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BiliVLA: Scene-Aware Vision-Language-Action Model with Reinforcement Learning for Autonomous Biliary Endoscopic Navigation","primary_cat":"cs.RO","submitted_at":"2026-06-22T16:11:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BiliVLA applies scene-aware VLA with grounding-enhanced SFT and GRPO to achieve 91.96% action precision and 84.85% success rate across three ERCP subtasks in phantom experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22540","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PolicyTrim: Boosting Intrinsic Policy Efficiency of Vision-Language-Action Models","primary_cat":"cs.CV","submitted_at":"2026-06-21T14:54:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PolicyTrim is an RL post-training framework that boosts VLA policy efficiency by 3x chunk utilization and 51.4% fewer steps, yielding up to 5.83x speedup.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22449","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Evolving Cognitive Framework via Causal World Modeling for Embodied Scientific Intelligence","primary_cat":"cs.AI","submitted_at":"2026-06-21T11:46:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Proposes a self-evolving cognitive framework integrating causal world modeling, intervention-driven reasoning, and continual refinement for embodied scientific intelligence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22113","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"KITE: Decoupling Kinematics and Interaction for Zero-Shot Cross-Embodiment Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-20T15:51:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"KITE decouples task reasoning from embodiment-specific control via learned latent interaction intents to enable zero-shot transfer across structurally different robots.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21088","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MV-WAM: Manifold-Aware World Action Model with Value Augmentation","primary_cat":"cs.RO","submitted_at":"2026-06-19T04:35:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MV-WAM reports 55.7% simulation and 77.5% real-world success rates by aligning heterogeneous visual and action manifolds through causal masking and value-guided rollback.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20871","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Geometric Entropy: When Trajectory Diversity Helps and Hurts in Imitation Learning","primary_cat":"cs.RO","submitted_at":"2026-06-18T19:02:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Geometric diversity of demonstration trajectories exhibits an inverted-U effect on imitation learning success, with the peak shifting lower as mastery increases via more data, easier tasks, or stronger priors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20867","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FOCA: Future-Oriented Conditioning for Data-Efficient Vision-Language-Action Adaptation","primary_cat":"cs.CV","submitted_at":"2026-06-18T18:54:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FOCA improves few-shot VLA adaptation by explicitly predicting future interaction embeddings and implicitly aligning to goal observations, yielding up to 26% gains on real robots with only 20 demonstrations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20521","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HumanScale: Egocentric Human Video Can Outperform Real-Robot Data for Embodied Pretraining","primary_cat":"cs.CV","submitted_at":"2026-06-18T17:37:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Processed egocentric human video outperforms teleoperated real-robot trajectories as pretraining data for embodied foundation models, delivering 24% lower validation loss and 52.5-90% higher task success rates under matched post-training protocols.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20458","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Slow Brain, Fast Planner: Latency-Resilient VLM-Augmented Urban Navigation","primary_cat":"cs.RO","submitted_at":"2026-06-18T16:40:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A training-free fusion layer enables stale VLM selections to improve a real-time planner's trajectory scoring for urban sidewalk navigation, yielding 30% ADE reduction in challenging scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20135","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Frequency-Aware Flow Matching for Continuous and Consistent Robotic Action Generation","primary_cat":"cs.RO","submitted_at":"2026-06-18T11:58:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FAFM performs flow matching in the frequency domain using DCT on action sequences to produce continuous temporally consistent robotic actions with a Sobolev-style smoothness regularizer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19998","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tri-Info: Generalizable, Interpretable Failure Prediction for VLA Models via Information Theory","primary_cat":"cs.RO","submitted_at":"2026-06-18T09:34:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Tri-Info uses three information theory signals on action diversity, temporal consistency, and state coupling to predict VLA model failures with cross-domain generalization to 83% real-world accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19980","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ENPIRE: Agentic Robot Policy Self-Improvement in the Real World","primary_cat":"cs.AI","submitted_at":"2026-06-18T09:21:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ENPIRE supplies four modules (Environment, Policy Improvement, Rollout, Evolution) that turn real-world robot training into an autonomous optimization loop driven by coding agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19784","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EquiVLA: A General Framework for Rotationally Equivariant Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-06-18T04:36:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EquiVLA is the first general framework for end-to-end SO(2)-equivariant VLA models using EquiPerceptor and EquiActor modules, reporting improved success rates on LIBERO, CALVIN, and real-robot benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19340","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ZeroDex: Zero-Shot Long-Horizon Dexterous Manipulation via Multi-View 3D-Grounded VLM Reasoning","primary_cat":"cs.RO","submitted_at":"2026-06-17T17:59:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ZeroDex grounds VLM outputs into 3D keypoints via multi-view triangulation and ray voting to enable zero-shot long-horizon dexterous manipulation with closed-loop replanning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18953","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Object-Centric Residual RL for Zero-Shot Sim-to-Real VLA Enhancement","primary_cat":"cs.RO","submitted_at":"2026-06-17T11:36:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Object-centric residual RL trained in simulation with pose noise and dropout raises real Franka robot VLA success from 42% to 76% zero-shot across five tasks, with improved data reusable for base model retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18315","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Ghost Attractor Networks: Basin-Structured Dynamical Decoders for Closed-Loop Sequential Generation","primary_cat":"cs.LG","submitted_at":"2026-06-16T11:23:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Ghost Attractor Networks are theoretically derived dynamical decoders that impose basin-attractor geometry on latent space via potential-drift dynamics, enabling efficient multi-modal sequential generation and closed-loop control.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17511","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MagicSim: A Unified Infrastructure for Executable Embodied Interaction","primary_cat":"cs.RO","submitted_at":"2026-06-16T04:42:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MagicSim is a unified embodied interaction infrastructure built on a deterministic batched runtime and shared MDP that supports diverse world construction, execution, task evaluation, automatic rollout generation, and interactive agent interfaces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17256","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Contrastive Action-Image Pre-training for Visuomotor Control","primary_cat":"cs.RO","submitted_at":"2026-06-15T20:00:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAIP learns action-aligned visual representations via contrastive pre-training on human hand keypoints from egocentric video, outperforming DINOv2, SigLIP, MVP, and R3M with >30% gains on real dexterous manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.16776","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"JoyAI-Sim: A Simulation-Enabled Interconversion Toolchain for the Embodied Data Pyramid","primary_cat":"cs.RO","submitted_at":"2026-06-15T14:21:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"JoyAI-Sim provides bidirectional Robot-Simulation-Human pathways for aligned model evaluation and data generation in robotics using the JoySim simulator as an evaluation layer and physical consistency filter.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.15148","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MimicIK: Real-Time Generative Inverse Kinematics from Teleoperation with FK Consistency","primary_cat":"cs.RO","submitted_at":"2026-06-13T06:32:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MimicIK applies conditional flow matching with an FK consistency loss and MIP backbone to learn IK from 8848 teleoperation demos on a 6-DOF robot, reporting 4.65 mm mean position error, 92.01% 10 mm success rate, 7.99% spike rate, and 6.74 ms latency while remaining stable near singularities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.14551","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TRACE: Trajectory-Routed Causal Memory for Delayed-Evidence Visuomotor Imitation","primary_cat":"cs.RO","submitted_at":"2026-06-12T15:30:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TRACE attaches a trajectory-signature-indexed latent memory to existing visuomotor imitation policies to recover context for delayed-evidence branch points in long-horizon robot tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}