{"work":{"id":"13faca8d-e96d-4e6c-a441-9f2683d11934","openalex_id":null,"doi":null,"arxiv_id":"2510.10274","raw_key":null,"title":"X-VLA: Soft-Prompted Transformer as Scalable Cross-Embodiment Vision-Language-Action Model","authors":null,"authors_text":"Jinliang Zheng, Jianxiong Li, Zhihao Wang, Dongxiu Liu, Xirui Kang, Yuchun Feng","year":2025,"venue":"cs.RO","abstract":"Successful generalist Vision-Language-Action (VLA) models rely on effective training across diverse robotic platforms with large-scale, cross-embodiment, heterogeneous datasets. To facilitate and leverage the heterogeneity in rich, diverse robotic data sources, we propose a novel Soft Prompt approach with minimally added parameters, by infusing prompt learning concepts into cross-embodiment robot learning and introducing separate sets of learnable embeddings for each distinct data source. These embeddings serve as embodiment-specific prompts, which in unity empower VLA models with effective exploitation of varying cross-embodiment features. Our new X-VLA, a neat flow-matching-based VLA architecture, relies exclusively on soft-prompted standard Transformer encoders, enjoying both scalability and simplicity. Evaluated across 6 simulations as well as 3 real-world robots, our 0.9B instantiation-X-VLA-0.9B simultaneously achieves SOTA performance over a sweep of benchmarks, demonstrating superior results on a wide axes of capabilities, from flexible dexterity to quick adaptation across embodiments, environments, and tasks. Website: https://thu-air-dream.github.io/X-VLA/","external_url":"https://arxiv.org/abs/2510.10274","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-21T14:10:13.367217+00:00","pith_arxiv_id":"2510.10274","created_at":"2026-05-09T06:05:34.919328+00:00","updated_at":"2026-05-21T14:10:13.367217+00:00","title_quality_ok":true,"display_title":"X-VLA: Soft-Prompted Transformer as Scalable Cross-Embodiment Vision-Language-Action Model","render_title":"X-VLA: Soft-Prompted Transformer as Scalable Cross-Embodiment Vision-Language-Action Model"},"hub":{"state":{"work_id":"13faca8d-e96d-4e6c-a441-9f2683d11934","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":43,"external_cited_by_count":null,"distinct_field_count":4,"first_pith_cited_at":"2025-11-04T17:59:12+00:00","last_pith_cited_at":"2026-05-20T17:10:31+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-04T04:27:15.658240+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":13},{"context_role":"baseline","n":7}],"polarity_counts":[{"context_polarity":"background","n":12},{"context_polarity":"baseline","n":7},{"context_polarity":"unclear","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}