{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:KM3WXRCXQZGCJ55O4D6TXOUWBP","short_pith_number":"pith:KM3WXRCX","schema_version":"1.0","canonical_sha256":"53376bc457864c24f7aee0fd3bba960bfe41ef149162b4759a41fafe5e279d33","source":{"kind":"arxiv","id":"2512.01715","version":2},"attestation_state":"computed","paper":{"title":"Transport Discrepancy as a Reliability Signal for Vision-Language-Action Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.RO","authors_text":"Chaoyi Xu, Hao Luo, Haoqi Yuan, Qin Jin, Sipeng Zheng, Wanpeng Zhang, Ye Wang, Yicheng Feng, Zongqing Lu","submitted_at":"2025-12-01T14:21:15Z","abstract_excerpt":"Vision-language-action (VLA) models that generate continuous action chunks via flow matching lack an internal signal for judging whether a given prediction is reliable. Distribution shift and long-horizon rollouts can push backbone representations away from the region the action head decodes reliably, yet the policy has no mechanism to detect or react to this drift. We observe that the cost of transporting observation features to the action representation in a shared feature space rises precisely when such drift occurs, providing a per-step reliability estimate without extra supervision. Build"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2512.01715","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2025-12-01T14:21:15Z","cross_cats_sorted":[],"title_canon_sha256":"744d50a9e67bbd25aba92cb4a134a31f2265024a335ad7295a96d69e30429f71","abstract_canon_sha256":"7f6ac913f837bfce4b10550e2011e601cf6ef4a3725c6088292874b19e97322a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-03T00:16:51.396240Z","signature_b64":"WJXJrfwHQykH8cz7l1SoCwJDEWS81AKHNUgPvok4gU6ibLwAtZDBKJt3WKm6X9rCgwBIOVoiG9Z2zPBOrmeABg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"53376bc457864c24f7aee0fd3bba960bfe41ef149162b4759a41fafe5e279d33","last_reissued_at":"2026-07-03T00:16:51.395818Z","signature_status":"signed_v1","first_computed_at":"2026-07-03T00:16:51.395818Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Transport Discrepancy as a Reliability Signal for Vision-Language-Action Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.RO","authors_text":"Chaoyi Xu, Hao Luo, Haoqi Yuan, Qin Jin, Sipeng Zheng, Wanpeng Zhang, Ye Wang, Yicheng Feng, Zongqing Lu","submitted_at":"2025-12-01T14:21:15Z","abstract_excerpt":"Vision-language-action (VLA) models that generate continuous action chunks via flow matching lack an internal signal for judging whether a given prediction is reliable. Distribution shift and long-horizon rollouts can push backbone representations away from the region the action head decodes reliably, yet the policy has no mechanism to detect or react to this drift. We observe that the cost of transporting observation features to the action representation in a shared feature space rises precisely when such drift occurs, providing a per-step reliability estimate without extra supervision. Build"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2512.01715","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2512.01715/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2512.01715","created_at":"2026-07-03T00:16:51.395875+00:00"},{"alias_kind":"arxiv_version","alias_value":"2512.01715v2","created_at":"2026-07-03T00:16:51.395875+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.01715","created_at":"2026-07-03T00:16:51.395875+00:00"},{"alias_kind":"pith_short_12","alias_value":"KM3WXRCXQZGC","created_at":"2026-07-03T00:16:51.395875+00:00"},{"alias_kind":"pith_short_16","alias_value":"KM3WXRCXQZGCJ55O","created_at":"2026-07-03T00:16:51.395875+00:00"},{"alias_kind":"pith_short_8","alias_value":"KM3WXRCX","created_at":"2026-07-03T00:16:51.395875+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2606.19531","citing_title":"ImageWAM: Do World Action Models Really Need Video Generation, or Just Image Editing?","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2606.17924","citing_title":"PearlVLA: Progressive Embodied Action-Plan Refinement in Latent Space","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2606.07100","citing_title":"LARA: Latent Action Representation Alignment for Vision-Language-Action Models","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2607.01067","citing_title":"Human-Centric Transferable Tactile Pre-Training for Dexterous Robotic Manipulation","ref_index":69,"is_internal_anchor":true},{"citing_arxiv_id":"2606.07100","citing_title":"LARA: Latent Action Representation Alignment for Vision-Language-Action Models","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07381","citing_title":"Escaping the Diversity Trap in Robotic Manipulation via Anchor-Centric Adaptation","ref_index":70,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/KM3WXRCXQZGCJ55O4D6TXOUWBP","json":"https://pith.science/pith/KM3WXRCXQZGCJ55O4D6TXOUWBP.json","graph_json":"https://pith.science/api/pith-number/KM3WXRCXQZGCJ55O4D6TXOUWBP/graph.json","events_json":"https://pith.science/api/pith-number/KM3WXRCXQZGCJ55O4D6TXOUWBP/events.json","paper":"https://pith.science/paper/KM3WXRCX"},"agent_actions":{"view_html":"https://pith.science/pith/KM3WXRCXQZGCJ55O4D6TXOUWBP","download_json":"https://pith.science/pith/KM3WXRCXQZGCJ55O4D6TXOUWBP.json","view_paper":"https://pith.science/paper/KM3WXRCX","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2512.01715&json=true","fetch_graph":"https://pith.science/api/pith-number/KM3WXRCXQZGCJ55O4D6TXOUWBP/graph.json","fetch_events":"https://pith.science/api/pith-number/KM3WXRCXQZGCJ55O4D6TXOUWBP/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/KM3WXRCXQZGCJ55O4D6TXOUWBP/action/timestamp_anchor","attest_storage":"https://pith.science/pith/KM3WXRCXQZGCJ55O4D6TXOUWBP/action/storage_attestation","attest_author":"https://pith.science/pith/KM3WXRCXQZGCJ55O4D6TXOUWBP/action/author_attestation","sign_citation":"https://pith.science/pith/KM3WXRCXQZGCJ55O4D6TXOUWBP/action/citation_signature","submit_replication":"https://pith.science/pith/KM3WXRCXQZGCJ55O4D6TXOUWBP/action/replication_record"}},"created_at":"2026-07-03T00:16:51.395875+00:00","updated_at":"2026-07-03T00:16:51.395875+00:00"}