{"total":19,"items":[{"citing_arxiv_id":"2605.13276","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"D-VLA: A High-Concurrency Distributed Asynchronous Reinforcement Learning Framework for Vision-Language-Action Models","primary_cat":"cs.AI","submitted_at":"2026-05-13T09:54:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"D-VLA introduces plane decoupling and a swimlane asynchronous pipeline to achieve high-concurrency RL training and linear scalability for billion- to trillion-parameter vision-language-action models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13080","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning to See What You Need: Gaze Attention for Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-13T06:54:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Gaze Attention groups visual embeddings into selectable regions and dynamically restricts attention to task-relevant ones, matching dense baselines with up to 90% fewer visual KV entries via added context tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12334","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reinforcing VLAs in Task-Agnostic World Models","primary_cat":"cs.AI","submitted_at":"2026-05-12T16:16:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RAW-Dream lets VLAs learn new tasks in zero-shot imagination by using a world model pre-trained only on task-free behaviors and an unmodified VLM to supply rewards, with dual-noise verification to limit hallucinations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11151","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RankQ: Offline-to-Online Reinforcement Learning via Self-Supervised Action Ranking","primary_cat":"cs.AI","submitted_at":"2026-05-11T18:58:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RankQ adds a self-supervised ranking loss to Q-learning to learn structured action orderings, yielding competitive or better performance than prior methods on D4RL benchmarks and large gains in vision-based robot fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10821","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Unified Noise Steering for Efficient Human-Guided VLA Adaptation","primary_cat":"cs.RO","submitted_at":"2026-05-11T16:37:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniSteer unifies human corrective actions and noise-space RL for VLA adaptation by inverting actions to noise targets, raising success rates from 20% to 90% in 66 minutes across four real-world manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07794","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"NoiseGate: Learning Per-Latent Timestep Schedules as Information Gating in World Action Models","primary_cat":"cs.RO","submitted_at":"2026-05-08T14:31:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NoiseGate learns per-latent timestep schedules as an information-gating policy in diffusion-based world action models, yielding consistent gains on RoboTwin manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00416","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning while Deploying: Fleet-Scale Reinforcement Learning for Generalist Robot Policies","primary_cat":"cs.RO","submitted_at":"2026-05-01T05:20:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fleet-scale RL framework improves a single generalist VLA policy from deployment data to 95% average success on eight real-world manipulation tasks with 16 dual-arm robots.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.28192","ref_index":24,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LaST-R1: Reinforcing Robotic Manipulation via Adaptive Physical Latent Reasoning","primary_cat":"cs.RO","submitted_at":"2026-04-30T17:59:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LaST-R1 introduces a RL post-training method called LAPO that optimizes latent Chain-of-Thought reasoning in vision-language-action models, yielding 99.9% success on LIBERO and up to 22.5% real-world gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25459","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GS-Playground: A High-Throughput Photorealistic Simulator for Vision-Informed Robot Learning","primary_cat":"cs.RO","submitted_at":"2026-04-28T10:05:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GS-Playground delivers a high-throughput photorealistic simulator for vision-informed robot learning via parallel physics integrated with batch 3D Gaussian Splatting at 10^4 FPS and an automated Real2Sim workflow for consistent environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24086","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AsyncShield: A Plug-and-Play Edge Adapter for Asynchronous Cloud-based VLA Navigation","primary_cat":"cs.RO","submitted_at":"2026-04-27T06:20:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AsyncShield restores VLA geometric intent from latency via kinematic pose mapping and uses PPO-Lagrangian to balance tracking with LiDAR safety constraints in a plug-and-play module.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23121","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Breaking Lock-In: Preserving Steerability under Low-Data VLA Post-Training","primary_cat":"cs.RO","submitted_at":"2026-04-25T03:18:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeLock mitigates lock-in in low-data VLA post-training via visual grounding preservation and test-time contrastive prompt guidance, outperforming baselines across eight evaluations while matching data-heavy generalist policies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23073","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RL Token: Bootstrapping Online RL with Vision-Language-Action Models","primary_cat":"cs.LG","submitted_at":"2026-04-24T23:57:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RL Token enables sample-efficient online RL fine-tuning of large VLAs, delivering up to 3x speed gains and higher success rates on real-robot manipulation tasks within minutes to hours.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21241","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CorridorVLA: Explicit Spatial Constraints for Generative Action Heads via Sparse Anchors","primary_cat":"cs.RO","submitted_at":"2026-04-23T03:17:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CorridorVLA improves VLA models by using predicted sparse anchors to impose explicit spatial corridors on action trajectories, yielding 3.4-12.4% success rate gains on LIBERO-Plus with GR00T-Corr reaching 83.21%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18107","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Test-Time Perturbation Learning with Delayed Feedback for Vision-Language-Action Models","primary_cat":"cs.CV","submitted_at":"2026-04-20T11:25:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PDF improves VLA success rates on LIBERO and Atari by applying test-time perturbation learning with delayed feedback to correct trajectory overfitting and overconfidence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17880","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ST-$\\pi$: Structured SpatioTemporal VLA for Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2026-04-20T06:48:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ST-π structures VLA models by having a spatiotemporal VLM produce causally ordered chunk-level prompts that guide a dual-generator action expert to jointly handle spatial and temporal control in robotic manipulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00015","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TimeRFT: Stimulating Generalizable Time Series Forecasting for TSFMs via Reinforcement Finetuning","primary_cat":"eess.SP","submitted_at":"2026-04-18T06:22:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TimeRFT applies reinforcement learning with multi-faceted step-wise rewards and informative sample selection to improve generalization and accuracy in TSFM adaptation beyond supervised fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13733","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Jump-Start Reinforcement Learning with Vision-Language-Action Regularization","primary_cat":"cs.LG","submitted_at":"2026-04-15T11:17:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VLAJS augments PPO with sparse annealed VLA guidance through directional regularization to cut required interactions by over 50% on manipulation tasks and enable zero-shot sim-to-real transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.21998","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Causal World Modeling for Robot Control","primary_cat":"cs.CV","submitted_at":"2026-01-29T17:07:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LingBot-VA combines video world modeling with policy learning via Mixture-of-Transformers, closed-loop rollouts, and asynchronous inference to improve robot manipulation in simulation and real settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.14759","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"$\\pi^{*}_{0.6}$: a VLA That Learns From Experience","primary_cat":"cs.LG","submitted_at":"2025-11-18T18:58:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RECAP enables a generalist VLA to self-improve via advantage-conditioned RL on mixed real-world data, more than doubling throughput and halving failure rates on hard manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}