{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:LO3POELOJU7SV3REL44Y4PFZO4","short_pith_number":"pith:LO3POELO","schema_version":"1.0","canonical_sha256":"5bb6f7116e4d3f2aee245f398e3cb9770240eb0cc24abb123ccd5e97e75e1c3c","source":{"kind":"arxiv","id":"2606.18955","version":1},"attestation_state":"computed","paper":{"title":"Motion-Focused Latent Action Enables Cross-Embodiment VLA Training from Human EgoVideos","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.RO"],"primary_cat":"cs.CV","authors_text":"Jian Wang, Jincheng Yu, Runze Xu, Yiluo Zhang, Yu Wang","submitted_at":"2026-06-17T11:37:59Z","abstract_excerpt":"Training generalist Vision-Language-Action(VLA) models typically requires massive, diverse robotic datasets with high-fidelity action annotations. While egocentric human manipulation videos are abundant and capture significant environmental diversity, the absence of action labels makes them difficult to use in conventional training paradigms. To address this, we propose a latent-action-based framework designed to extract general action priors from unlabeled human videos. The architecture features a Hybrid Disentangled VQ-VAE that decouples motion dynamics from environmental backgrounds through"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.18955","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-06-17T11:37:59Z","cross_cats_sorted":["cs.RO"],"title_canon_sha256":"867022c505d9da2faa4c01bacfbc4d95a82fae6600cccd5e4d50e4e24e30f17c","abstract_canon_sha256":"1b7f4d5dc38e68a3371333d2b8d1582825362f1e8c4bc98d91e1cad0aeb0266c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:11:52.850524Z","signature_b64":"n7z4/cloLRHFVWJiQPEfcXgG0kTNNwwdAJALRXQUfKLmu/ZGeuuF+iMap5VAZwe7uBPqROV5qayWCsuXH+CTDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5bb6f7116e4d3f2aee245f398e3cb9770240eb0cc24abb123ccd5e97e75e1c3c","last_reissued_at":"2026-06-19T16:11:52.850126Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:11:52.850126Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Motion-Focused Latent Action Enables Cross-Embodiment VLA Training from Human EgoVideos","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.RO"],"primary_cat":"cs.CV","authors_text":"Jian Wang, Jincheng Yu, Runze Xu, Yiluo Zhang, Yu Wang","submitted_at":"2026-06-17T11:37:59Z","abstract_excerpt":"Training generalist Vision-Language-Action(VLA) models typically requires massive, diverse robotic datasets with high-fidelity action annotations. While egocentric human manipulation videos are abundant and capture significant environmental diversity, the absence of action labels makes them difficult to use in conventional training paradigms. To address this, we propose a latent-action-based framework designed to extract general action priors from unlabeled human videos. The architecture features a Hybrid Disentangled VQ-VAE that decouples motion dynamics from environmental backgrounds through"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.18955","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.18955/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.18955","created_at":"2026-06-19T16:11:52.850186+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.18955v1","created_at":"2026-06-19T16:11:52.850186+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.18955","created_at":"2026-06-19T16:11:52.850186+00:00"},{"alias_kind":"pith_short_12","alias_value":"LO3POELOJU7S","created_at":"2026-06-19T16:11:52.850186+00:00"},{"alias_kind":"pith_short_16","alias_value":"LO3POELOJU7SV3RE","created_at":"2026-06-19T16:11:52.850186+00:00"},{"alias_kind":"pith_short_8","alias_value":"LO3POELO","created_at":"2026-06-19T16:11:52.850186+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/LO3POELOJU7SV3REL44Y4PFZO4","json":"https://pith.science/pith/LO3POELOJU7SV3REL44Y4PFZO4.json","graph_json":"https://pith.science/api/pith-number/LO3POELOJU7SV3REL44Y4PFZO4/graph.json","events_json":"https://pith.science/api/pith-number/LO3POELOJU7SV3REL44Y4PFZO4/events.json","paper":"https://pith.science/paper/LO3POELO"},"agent_actions":{"view_html":"https://pith.science/pith/LO3POELOJU7SV3REL44Y4PFZO4","download_json":"https://pith.science/pith/LO3POELOJU7SV3REL44Y4PFZO4.json","view_paper":"https://pith.science/paper/LO3POELO","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.18955&json=true","fetch_graph":"https://pith.science/api/pith-number/LO3POELOJU7SV3REL44Y4PFZO4/graph.json","fetch_events":"https://pith.science/api/pith-number/LO3POELOJU7SV3REL44Y4PFZO4/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/LO3POELOJU7SV3REL44Y4PFZO4/action/timestamp_anchor","attest_storage":"https://pith.science/pith/LO3POELOJU7SV3REL44Y4PFZO4/action/storage_attestation","attest_author":"https://pith.science/pith/LO3POELOJU7SV3REL44Y4PFZO4/action/author_attestation","sign_citation":"https://pith.science/pith/LO3POELOJU7SV3REL44Y4PFZO4/action/citation_signature","submit_replication":"https://pith.science/pith/LO3POELOJU7SV3REL44Y4PFZO4/action/replication_record"}},"created_at":"2026-06-19T16:11:52.850186+00:00","updated_at":"2026-06-19T16:11:52.850186+00:00"}