{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:MVQLCYDAV5XYQOSFXCJZJTZ4P5","short_pith_number":"pith:MVQLCYDA","schema_version":"1.0","canonical_sha256":"6560b16060af6f883a45b89394cf3c7f69d4dc0f491bc183b2b52a5082e3020b","source":{"kind":"arxiv","id":"2502.19417","version":2},"attestation_state":"computed","paper":{"title":"Hi Robot: Open-Ended Instruction Following with Hierarchical Vision-Language-Action Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.RO","authors_text":"Adrian Li-Bell, Anna Walling, Brian Ichter, Chelsea Finn, Danny Driess, Haohuan Wang, James Tanner, Karl Pertsch, Lachy Groom, Liyiming Ke, Lucy Xiaoyang Shi, Michael Equi, Niccolo Fusai, Quan Vuong, Sergey Levine","submitted_at":"2025-02-26T18:58:41Z","abstract_excerpt":"Generalist robots that can perform a range of different tasks in open-world settings must be able to not only reason about the steps needed to accomplish their goals, but also process complex instructions, prompts, and even feedback during task execution. Intricate instructions (e.g., \"Could you make me a vegetarian sandwich?\" or \"I don't like that one\") require not just the ability to physically perform the individual steps, but the ability to situate complex commands and feedback in the physical world. In this work, we describe a system that uses vision-language models in a hierarchical stru"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2502.19417","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2025-02-26T18:58:41Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"3b1fdea721df6a4839273c19af265454d6171db77f88a82f2f1d4d419a24a8a0","abstract_canon_sha256":"2b64350ff6a13afc04f9ab60c2db11011409494aafe124453dddad25a14cfe73"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.872916Z","signature_b64":"5ov1jo/6hx8sNEsXK1RwGpHG4T72r7bVtAWm4UUR+H0Csh1P+g59n30fGDWUc9X34YMljJ49F8qAPg2mJIuJCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6560b16060af6f883a45b89394cf3c7f69d4dc0f491bc183b2b52a5082e3020b","last_reissued_at":"2026-05-17T23:38:49.872422Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.872422Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Hi Robot: Open-Ended Instruction Following with Hierarchical Vision-Language-Action Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.RO","authors_text":"Adrian Li-Bell, Anna Walling, Brian Ichter, Chelsea Finn, Danny Driess, Haohuan Wang, James Tanner, Karl Pertsch, Lachy Groom, Liyiming Ke, Lucy Xiaoyang Shi, Michael Equi, Niccolo Fusai, Quan Vuong, Sergey Levine","submitted_at":"2025-02-26T18:58:41Z","abstract_excerpt":"Generalist robots that can perform a range of different tasks in open-world settings must be able to not only reason about the steps needed to accomplish their goals, but also process complex instructions, prompts, and even feedback during task execution. Intricate instructions (e.g., \"Could you make me a vegetarian sandwich?\" or \"I don't like that one\") require not just the ability to physically perform the individual steps, but the ability to situate complex commands and feedback in the physical world. In this work, we describe a system that uses vision-language models in a hierarchical stru"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"our system can reason through complex prompts and incorporate situated feedback during task execution ('that's not trash')","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the high-level VLM can reliably map open-ended natural language and visual feedback into correct next-step decisions without hallucinating or misinterpreting physical context.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"A hierarchical VLA architecture lets robots follow complex instructions and situated feedback by separating high-level reasoning from low-level control.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"a911d2a90d0654a3b68453cba48345a116270656d7efdc74e4788fb3186b2344"},"source":{"id":"2502.19417","kind":"arxiv","version":2},"verdict":{"id":"b6905406-1385-4c55-88d8-097e0df14877","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T22:49:39.001717Z","strongest_claim":"our system can reason through complex prompts and incorporate situated feedback during task execution ('that's not trash')","one_line_summary":"A hierarchical VLA architecture lets robots follow complex instructions and situated feedback by separating high-level reasoning from low-level control.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the high-level VLM can reliably map open-ended natural language and visual feedback into correct next-step decisions without hallucinating or misinterpreting physical context.","pith_extraction_headline":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps."},"references":{"count":51,"sample":[{"doi":"","year":2024,"title":"RT-H: Action Hierarchies Using Language","work_id":"ecf7cf18-c1a8-4a6b-bc2a-fb165643aa0d","ref_index":1,"cited_arxiv_id":"2403.01823","is_internal_anchor":true},{"doi":"","year":2024,"title":"PaliGemma: A versatile 3B VLM for transfer","work_id":"df6f48b3-5792-47c7-9614-cb856ea31ad9","ref_index":2,"cited_arxiv_id":"2407.07726","is_internal_anchor":true},{"doi":"","year":2024,"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","ref_index":3,"cited_arxiv_id":"2410.24164","is_internal_anchor":true},{"doi":"","year":2022,"title":"RT-1: Robotics Transformer for Real-World Control at Scale","work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","ref_index":4,"cited_arxiv_id":"2212.06817","is_internal_anchor":true},{"doi":"","year":2023,"title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","work_id":"ff438a8a-8003-4fae-9131-acd418b3597b","ref_index":5,"cited_arxiv_id":"2307.15818","is_internal_anchor":true}],"resolved_work":51,"snapshot_sha256":"35b3c23e920f2dff07e8dada247e4fa541fc8b0e2a40cdbe3cf143930ac69cc7","internal_anchors":15},"formal_canon":{"evidence_count":1,"snapshot_sha256":"1ab0a28f36eda47538f2f25b3a78edc1a5d12c7e26d9cdde158c42a79a1a0378"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2502.19417","created_at":"2026-05-17T23:38:49.872505+00:00"},{"alias_kind":"arxiv_version","alias_value":"2502.19417v2","created_at":"2026-05-17T23:38:49.872505+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.19417","created_at":"2026-05-17T23:38:49.872505+00:00"},{"alias_kind":"pith_short_12","alias_value":"MVQLCYDAV5XY","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"MVQLCYDAV5XYQOSF","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"MVQLCYDA","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":39,"internal_anchor_count":39,"sample":[{"citing_arxiv_id":"2606.03784","citing_title":"Revisiting Embodied Chain-of-Thought for Generalizable Robot Manipulation","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2606.02735","citing_title":"See Less, Specify More: Visual Evidence Budgets for Generalizable VLAs","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.30877","citing_title":"Wall-OSS-0.5 Technical Report","ref_index":83,"is_internal_anchor":true},{"citing_arxiv_id":"2606.31958","citing_title":"Adapting Generalist Robot Policies with Semantic Reinforcement Learning","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2606.07100","citing_title":"LARA: Latent Action Representation Alignment for Vision-Language-Action Models","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.25813","citing_title":"Extending Embodied Question Answering from Perception to Decision","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2606.27295","citing_title":"LA4VLA: Learning to Act without Seeing via Language-Action Pretraining","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2503.03480","citing_title":"SafeVLA: Towards Safety Alignment of Vision-Language-Action Model via Constrained Learning","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2504.16054","citing_title":"$\\pi_{0.5}$: a Vision-Language-Action Model with Open-World Generalization","ref_index":72,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22183","citing_title":"Action with Visual Primitives","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22812","citing_title":"GesVLA: Gesture-Aware Vision-Language-Action Model Embedded Representations","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22816","citing_title":"AwareVLN: Reasoning with Self-awareness for Vision-Language Navigation","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2603.14371","citing_title":"OxyGen: Unified KV Cache Management for VLA Inference under Multi-Task Parallelism","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17486","citing_title":"DyGRO-VLA: Cross-Task Scaling of Vision-Language-Action Models via Dynamic Grouped Residual Optimization","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17522","citing_title":"RoboFlow4D: A Lightweight Flow World Model Toward Real-Time Flow-Guided Robotic Manipulation","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2507.16815","citing_title":"ThinkAct: Vision-Language-Action Reasoning via Reinforced Visual Latent Planning","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2507.01925","citing_title":"A Survey on Vision-Language-Action Models: An Action Tokenization Perspective","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2507.04447","citing_title":"DreamVLA: A Vision-Language-Action Model Dreamed with Comprehensive World Knowledge","ref_index":100,"is_internal_anchor":true},{"citing_arxiv_id":"2503.15558","citing_title":"Cosmos-Reason1: From Physical Common Sense To Embodied Reasoning","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2510.10125","citing_title":"Ctrl-World: A Controllable Generative World Model for Robot Manipulation","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2602.13193","citing_title":"Steerable Vision-Language-Action Policies for Embodied Reasoning and Hierarchical Control","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2602.20231","citing_title":"UniLACT: Depth-Aware RGB Latent Action Learning for Vision-Language-Action Models","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2603.22003","citing_title":"VP-VLA: Visual Prompting as an Interface for Vision-Language-Action Models","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2603.25044","citing_title":"ThermoAct:Thermal-Aware Vision-Language-Action Models for Robotic Perception and Decision-Making","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2511.20857","citing_title":"Evo-Memory: Benchmarking LLM Agent Test-time Learning with Self-Evolving Memory","ref_index":242,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":1,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5","json":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5.json","graph_json":"https://pith.science/api/pith-number/MVQLCYDAV5XYQOSFXCJZJTZ4P5/graph.json","events_json":"https://pith.science/api/pith-number/MVQLCYDAV5XYQOSFXCJZJTZ4P5/events.json","paper":"https://pith.science/paper/MVQLCYDA"},"agent_actions":{"view_html":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5","download_json":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5.json","view_paper":"https://pith.science/paper/MVQLCYDA","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2502.19417&json=true","fetch_graph":"https://pith.science/api/pith-number/MVQLCYDAV5XYQOSFXCJZJTZ4P5/graph.json","fetch_events":"https://pith.science/api/pith-number/MVQLCYDAV5XYQOSFXCJZJTZ4P5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/action/storage_attestation","attest_author":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/action/author_attestation","sign_citation":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/action/citation_signature","submit_replication":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/action/replication_record"}},"created_at":"2026-05-17T23:38:49.872505+00:00","updated_at":"2026-05-17T23:38:49.872505+00:00"}