{"work":{"id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","openalex_id":null,"doi":null,"arxiv_id":"2212.06817","raw_key":null,"title":"RT-1: Robotics Transformer for Real-World Control at Scale","authors":null,"authors_text":"Anthony Brohan, Noah Brown, Justice Carbajal, Yevgen Chebotar, Joseph Dabis, Chelsea Finn","year":2022,"venue":"cs.RO","abstract":"By transferring knowledge from large, diverse, task-agnostic datasets, modern machine learning models can solve specific downstream tasks either zero-shot or with small task-specific datasets to a high level of performance. While this capability has been demonstrated in other fields such as computer vision, natural language processing or speech recognition, it remains to be shown in robotics, where the generalization capabilities of the models are particularly critical due to the difficulty of collecting real-world robotic data. We argue that one of the keys to the success of such general robotic models lies with open-ended task-agnostic training, combined with high-capacity architectures that can absorb all of the diverse, robotic data. In this paper, we present a model class, dubbed Robotics Transformer, that exhibits promising scalable model properties. We verify our conclusions in a study of different model classes and their ability to generalize as a function of the data size, model size, and data diversity based on a large-scale data collection on real robots performing real-world tasks. The project's website and videos can be found at robotics-transformer1.github.io","external_url":"https://arxiv.org/abs/2212.06817","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-27T22:01:20.702036+00:00","pith_arxiv_id":"2212.06817","created_at":"2026-05-09T06:35:38.629691+00:00","updated_at":"2026-06-27T22:01:20.702036+00:00","title_quality_ok":true,"display_title":"RT-1: Robotics Transformer for Real-World Control at Scale","render_title":"RT-1: Robotics Transformer for Real-World Control at Scale"},"hub":{"state":{"work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":213,"external_cited_by_count":null,"distinct_field_count":10,"first_pith_cited_at":"2023-02-22T18:47:51+00:00","last_pith_cited_at":"2026-06-08T13:50:31+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-28T07:46:58.980169+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":78},{"context_role":"baseline","n":5},{"context_role":"dataset","n":4},{"context_role":"method","n":2},{"context_role":"other","n":1}],"polarity_counts":[{"context_polarity":"background","n":76},{"context_polarity":"baseline","n":6},{"context_polarity":"unclear","n":3},{"context_polarity":"use_dataset","n":3},{"context_polarity":"use_method","n":2}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"RT-1: Robotics Transformer for Real-World Control at Scale","claims":[{"claim_text":"By transferring knowledge from large, diverse, task-agnostic datasets, modern machine learning models can solve specific downstream tasks either zero-shot or with small task-specific datasets to a high level of performance. While this capability has been demonstrated in other fields such as computer vision, natural language processing or speech recognition, it remains to be shown in robotics, where the generalization capabilities of the models are particularly critical due to the difficulty of collecting real-world robotic data. We argue that one of the keys to the success of such general robo","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks RT-1: Robotics Transformer for Real-World Control at Scale because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T02:35:27.495630+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"9e0bdc26-0bc7-4e92-93ac-9b2a7f12c819","orcid":null,"display_name":"Anthony Brohan"},{"id":"657aa3e6-af8f-48bd-aeb3-c63cc3bedc60","orcid":null,"display_name":"Noah Brown"},{"id":"322522a9-d4c8-4ec9-9e57-fea78173ea02","orcid":null,"display_name":"Justice Carbajal"},{"id":"dbfa9c9c-12ca-4878-9d7d-10db6dc427ed","orcid":null,"display_name":"Yevgen Chebotar"},{"id":"8f473aff-1ea8-4a10-86ec-122dde6225ad","orcid":null,"display_name":"Joseph Dabis"},{"id":"379e406e-0cbc-4ede-b9dd-9a76a16a6da8","orcid":null,"display_name":"Chelsea Finn"}]},"error":null,"updated_at":"2026-05-14T02:35:13.509476+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T02:35:17.185093+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","shared_citers":62},{"title":"OpenVLA: An Open-Source Vision-Language-Action Model","work_id":"3e7e65c5-5aed-4fe9-8414-2092bcb31cc7","shared_citers":56},{"title":"$\\pi_{0.5}$: a Vision-Language-Action Model with Open-World Generalization","work_id":"d1ad7304-d09a-49bc-809e-846439f6aff9","shared_citers":43},{"title":"Octo: An Open-Source Generalist Robot Policy","work_id":"f9ca0722-8855-48c3-a27a-0eefb7e19253","shared_citers":33},{"title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","work_id":"ff438a8a-8003-4fae-9131-acd418b3597b","shared_citers":33},{"title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots","work_id":"e2db69c7-ee8a-4cb7-a761-7b8de1dfcf97","shared_citers":30},{"title":"Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware","work_id":"6fe159e0-fa73-481a-88d4-4719c15140be","shared_citers":29},{"title":"Fine-Tuning Vision-Language-Action Models: Optimizing Speed and Success","work_id":"04f46bb3-4346-47e8-bf09-c75d91f96e87","shared_citers":28},{"title":"DROID: A Large-Scale In-The-Wild Robot Manipulation Dataset","work_id":"13253de2-3d89-415c-8c2f-3adb25d4c337","shared_citers":24},{"title":"FAST: Efficient Action Tokenization for Vision-Language-Action Models","work_id":"83a8f966-6cfa-4f21-81f3-87440aae238f","shared_citers":24},{"title":"Open X-Embodiment: Robotic Learning Datasets and RT-X Models","work_id":"62f0fb6c-e6ae-4dc4-95a4-d9dd64b240e8","shared_citers":22},{"title":"RDT-1B: a Diffusion Foundation Model for Bimanual Manipulation","work_id":"12319725-bc7d-4c32-a229-ad270a7460bc","shared_citers":21},{"title":"CogACT: A Foundational Vision-Language-Action Model for Synergizing Cognition and Action in Robotic Manipulation","work_id":"4b158d3e-3dff-4412-85cd-baa879465a5e","shared_citers":19},{"title":"PaLM-E: An Embodied Multimodal Language Model","work_id":"5b99811a-1d93-47e2-9d59-f4045a0b74a2","shared_citers":19},{"title":"GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation","work_id":"843ab5eb-2815-4db8-b3bc-890b23fa5ffa","shared_citers":17},{"title":"RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation","work_id":"9b985126-4a2f-4bdf-b014-2a7524ec634e","shared_citers":17},{"title":"SpatialVLA: Exploring Spatial Representations for Visual-Language-Action Model","work_id":"592041b3-3ca2-4836-8dd4-f8095d8a692b","shared_citers":17},{"title":"Do As I Can, Not As I Say: Grounding Language in Robotic Affordances","work_id":"037320f1-b0a9-4cbe-a639-bfb25409ce71","shared_citers":16},{"title":"3D-VLA: A 3D Vision-Language-Action Generative World Model","work_id":"aebf924c-e761-437e-9cee-f1ccc2e427bd","shared_citers":15},{"title":"Bridge Data: Boosting Generalization of Robotic Skills with Cross-Domain Datasets","work_id":"59e728c0-b6ca-4759-a8f4-02b981f2220f","shared_citers":15},{"title":"LIBERO: Benchmarking Knowledge Transfer for Lifelong Robot Learning","work_id":"662203ad-084f-42c4-8e60-977b3173755b","shared_citers":15},{"title":"LIBERO-Plus: In-depth Robustness Analysis of Vision-Language-Action Models","work_id":"e35c8c6d-977d-4af1-963a-766ba98703ce","shared_citers":14},{"title":"Flow Matching for Generative Modeling","work_id":"6edb71c4-5d64-40af-a394-9757ea051a36","shared_citers":13},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":13}],"time_series":[{"n":4,"year":2023},{"n":10,"year":2024},{"n":5,"year":2025},{"n":85,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T02:35:28.129269+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T02:35:24.082829+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"RT-1: Robotics Transformer for Real-World Control at Scale","claims":[{"claim_text":"By transferring knowledge from large, diverse, task-agnostic datasets, modern machine learning models can solve specific downstream tasks either zero-shot or with small task-specific datasets to a high level of performance. While this capability has been demonstrated in other fields such as computer vision, natural language processing or speech recognition, it remains to be shown in robotics, where the generalization capabilities of the models are particularly critical due to the difficulty of collecting real-world robotic data. We argue that one of the keys to the success of such general robo","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks RT-1: Robotics Transformer for Real-World Control at Scale because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T02:35:28.014838+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"RT-1: Robotics Transformer for Real-World Control at Scale","claims":[{"claim_text":"By transferring knowledge from large, diverse, task-agnostic datasets, modern machine learning models can solve specific downstream tasks either zero-shot or with small task-specific datasets to a high level of performance. While this capability has been demonstrated in other fields such as computer vision, natural language processing or speech recognition, it remains to be shown in robotics, where the generalization capabilities of the models are particularly critical due to the difficulty of collecting real-world robotic data. We argue that one of the keys to the success of such general robo","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks RT-1: Robotics Transformer for Real-World Control at Scale because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T02:35:17.188752+00:00"}},"summary":{"title":"RT-1: Robotics Transformer for Real-World Control at Scale","claims":[{"claim_text":"By transferring knowledge from large, diverse, task-agnostic datasets, modern machine learning models can solve specific downstream tasks either zero-shot or with small task-specific datasets to a high level of performance. While this capability has been demonstrated in other fields such as computer vision, natural language processing or speech recognition, it remains to be shown in robotics, where the generalization capabilities of the models are particularly critical due to the difficulty of collecting real-world robotic data. We argue that one of the keys to the success of such general robo","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks RT-1: Robotics Transformer for Real-World Control at Scale because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","shared_citers":62},{"title":"OpenVLA: An Open-Source Vision-Language-Action Model","work_id":"3e7e65c5-5aed-4fe9-8414-2092bcb31cc7","shared_citers":56},{"title":"$\\pi_{0.5}$: a Vision-Language-Action Model with Open-World Generalization","work_id":"d1ad7304-d09a-49bc-809e-846439f6aff9","shared_citers":43},{"title":"Octo: An Open-Source Generalist Robot Policy","work_id":"f9ca0722-8855-48c3-a27a-0eefb7e19253","shared_citers":33},{"title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","work_id":"ff438a8a-8003-4fae-9131-acd418b3597b","shared_citers":33},{"title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots","work_id":"e2db69c7-ee8a-4cb7-a761-7b8de1dfcf97","shared_citers":30},{"title":"Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware","work_id":"6fe159e0-fa73-481a-88d4-4719c15140be","shared_citers":29},{"title":"Fine-Tuning Vision-Language-Action Models: Optimizing Speed and Success","work_id":"04f46bb3-4346-47e8-bf09-c75d91f96e87","shared_citers":28},{"title":"DROID: A Large-Scale In-The-Wild Robot Manipulation Dataset","work_id":"13253de2-3d89-415c-8c2f-3adb25d4c337","shared_citers":24},{"title":"FAST: Efficient Action Tokenization for Vision-Language-Action Models","work_id":"83a8f966-6cfa-4f21-81f3-87440aae238f","shared_citers":24},{"title":"Open X-Embodiment: Robotic Learning Datasets and RT-X Models","work_id":"62f0fb6c-e6ae-4dc4-95a4-d9dd64b240e8","shared_citers":22},{"title":"RDT-1B: a Diffusion Foundation Model for Bimanual Manipulation","work_id":"12319725-bc7d-4c32-a229-ad270a7460bc","shared_citers":21},{"title":"CogACT: A Foundational Vision-Language-Action Model for Synergizing Cognition and Action in Robotic Manipulation","work_id":"4b158d3e-3dff-4412-85cd-baa879465a5e","shared_citers":19},{"title":"PaLM-E: An Embodied Multimodal Language Model","work_id":"5b99811a-1d93-47e2-9d59-f4045a0b74a2","shared_citers":19},{"title":"GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation","work_id":"843ab5eb-2815-4db8-b3bc-890b23fa5ffa","shared_citers":17},{"title":"RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation","work_id":"9b985126-4a2f-4bdf-b014-2a7524ec634e","shared_citers":17},{"title":"SpatialVLA: Exploring Spatial Representations for Visual-Language-Action Model","work_id":"592041b3-3ca2-4836-8dd4-f8095d8a692b","shared_citers":17},{"title":"Do As I Can, Not As I Say: Grounding Language in Robotic Affordances","work_id":"037320f1-b0a9-4cbe-a639-bfb25409ce71","shared_citers":16},{"title":"3D-VLA: A 3D Vision-Language-Action Generative World Model","work_id":"aebf924c-e761-437e-9cee-f1ccc2e427bd","shared_citers":15},{"title":"Bridge Data: Boosting Generalization of Robotic Skills with Cross-Domain Datasets","work_id":"59e728c0-b6ca-4759-a8f4-02b981f2220f","shared_citers":15},{"title":"LIBERO: Benchmarking Knowledge Transfer for Lifelong Robot Learning","work_id":"662203ad-084f-42c4-8e60-977b3173755b","shared_citers":15},{"title":"LIBERO-Plus: In-depth Robustness Analysis of Vision-Language-Action Models","work_id":"e35c8c6d-977d-4af1-963a-766ba98703ce","shared_citers":14},{"title":"Flow Matching for Generative Modeling","work_id":"6edb71c4-5d64-40af-a394-9757ea051a36","shared_citers":13},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":13}],"time_series":[{"n":4,"year":2023},{"n":10,"year":2024},{"n":5,"year":2025},{"n":85,"year":2026}],"dependency_candidates":[]},"authors":[{"id":"9e0bdc26-0bc7-4e92-93ac-9b2a7f12c819","orcid":null,"display_name":"Anthony Brohan","source":"manual","import_confidence":0.72},{"id":"379e406e-0cbc-4ede-b9dd-9a76a16a6da8","orcid":null,"display_name":"Chelsea Finn","source":"manual","import_confidence":0.72},{"id":"8f473aff-1ea8-4a10-86ec-122dde6225ad","orcid":null,"display_name":"Joseph Dabis","source":"manual","import_confidence":0.72},{"id":"322522a9-d4c8-4ec9-9e57-fea78173ea02","orcid":null,"display_name":"Justice Carbajal","source":"manual","import_confidence":0.72},{"id":"657aa3e6-af8f-48bd-aeb3-c63cc3bedc60","orcid":null,"display_name":"Noah Brown","source":"manual","import_confidence":0.72},{"id":"dbfa9c9c-12ca-4878-9d7d-10db6dc427ed","orcid":null,"display_name":"Yevgen Chebotar","source":"manual","import_confidence":0.72}]}}