{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:COEFE7A34D7QPEHJJTWNZI2CWL","short_pith_number":"pith:COEFE7A3","schema_version":"1.0","canonical_sha256":"1388527c1be0ff0790e94cecdca342b2e07afbd71245ff8adaded854e6a29b1d","source":{"kind":"arxiv","id":"2606.01955","version":1},"attestation_state":"computed","paper":{"title":"WALL-WM: Carving World Action Modeling at the Event Joints","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.RO","authors_text":"Charles Yang, Chris Pan, Colin Ye, Elise Mon, Ellie Ma, Ethan Chen, Gody Li, Hang Su, Hao Wang, Howard Lu, James Wang, J.W. Gao, Lily Li, Lorien Shu, Lucy Liang, Maeve Zhang, Neo Li, Newton Von, Ping Yang, PS Zhang, Qian Wang, Regis Cheng, Roy Gan, Ryan Yu, Sage Yang, Shalfun Li, Truth Qu, Victor Yao, Vincent Chen, Yohann Tang, Yu Sun","submitted_at":"2026-06-01T09:14:51Z","abstract_excerpt":"WALL-WM is a World Action Model that shifts video-action learning from chunk-centric optimization to event-grounded Vision-Language-Action pretraining, using semantically coherent action events as the atomic unit of learning. Existing WAMs commonly initialize from multimodal or video foundation models and then optimize fixed-length action chunks conditioned directly on the current observation and instruction. Although convenient, this chunk-centric formulation creates a fundamental granularity mismatch. Language describes semantic goals and events, vision evolves through continuous scene dynam"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.01955","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.RO","submitted_at":"2026-06-01T09:14:51Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"7bf96c87557ecd3ce614ac605df94c2bc697896c4badc5e877fb904399bf7321","abstract_canon_sha256":"35351411fbd68e4485d7d460aa1d5d44076a8e107286f02fd86c3df867f12e8f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T02:05:01.810393Z","signature_b64":"Nw7zqio7i36ZxyKdopJ/evSggxfqmhS7THn3n7Ffncw/LpsGjcXa4bZuZxuFVJhT2Go0YzbyR3+krUsHGZIWAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"1388527c1be0ff0790e94cecdca342b2e07afbd71245ff8adaded854e6a29b1d","last_reissued_at":"2026-06-02T02:05:01.809966Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T02:05:01.809966Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"WALL-WM: Carving World Action Modeling at the Event Joints","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.RO","authors_text":"Charles Yang, Chris Pan, Colin Ye, Elise Mon, Ellie Ma, Ethan Chen, Gody Li, Hang Su, Hao Wang, Howard Lu, James Wang, J.W. Gao, Lily Li, Lorien Shu, Lucy Liang, Maeve Zhang, Neo Li, Newton Von, Ping Yang, PS Zhang, Qian Wang, Regis Cheng, Roy Gan, Ryan Yu, Sage Yang, Shalfun Li, Truth Qu, Victor Yao, Vincent Chen, Yohann Tang, Yu Sun","submitted_at":"2026-06-01T09:14:51Z","abstract_excerpt":"WALL-WM is a World Action Model that shifts video-action learning from chunk-centric optimization to event-grounded Vision-Language-Action pretraining, using semantically coherent action events as the atomic unit of learning. Existing WAMs commonly initialize from multimodal or video foundation models and then optimize fixed-length action chunks conditioned directly on the current observation and instruction. Although convenient, this chunk-centric formulation creates a fundamental granularity mismatch. Language describes semantic goals and events, vision evolves through continuous scene dynam"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.01955","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.01955/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.01955","created_at":"2026-06-02T02:05:01.810024+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.01955v1","created_at":"2026-06-02T02:05:01.810024+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.01955","created_at":"2026-06-02T02:05:01.810024+00:00"},{"alias_kind":"pith_short_12","alias_value":"COEFE7A34D7Q","created_at":"2026-06-02T02:05:01.810024+00:00"},{"alias_kind":"pith_short_16","alias_value":"COEFE7A34D7QPEHJ","created_at":"2026-06-02T02:05:01.810024+00:00"},{"alias_kind":"pith_short_8","alias_value":"COEFE7A3","created_at":"2026-06-02T02:05:01.810024+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/COEFE7A34D7QPEHJJTWNZI2CWL","json":"https://pith.science/pith/COEFE7A34D7QPEHJJTWNZI2CWL.json","graph_json":"https://pith.science/api/pith-number/COEFE7A34D7QPEHJJTWNZI2CWL/graph.json","events_json":"https://pith.science/api/pith-number/COEFE7A34D7QPEHJJTWNZI2CWL/events.json","paper":"https://pith.science/paper/COEFE7A3"},"agent_actions":{"view_html":"https://pith.science/pith/COEFE7A34D7QPEHJJTWNZI2CWL","download_json":"https://pith.science/pith/COEFE7A34D7QPEHJJTWNZI2CWL.json","view_paper":"https://pith.science/paper/COEFE7A3","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.01955&json=true","fetch_graph":"https://pith.science/api/pith-number/COEFE7A34D7QPEHJJTWNZI2CWL/graph.json","fetch_events":"https://pith.science/api/pith-number/COEFE7A34D7QPEHJJTWNZI2CWL/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/COEFE7A34D7QPEHJJTWNZI2CWL/action/timestamp_anchor","attest_storage":"https://pith.science/pith/COEFE7A34D7QPEHJJTWNZI2CWL/action/storage_attestation","attest_author":"https://pith.science/pith/COEFE7A34D7QPEHJJTWNZI2CWL/action/author_attestation","sign_citation":"https://pith.science/pith/COEFE7A34D7QPEHJJTWNZI2CWL/action/citation_signature","submit_replication":"https://pith.science/pith/COEFE7A34D7QPEHJJTWNZI2CWL/action/replication_record"}},"created_at":"2026-06-02T02:05:01.810024+00:00","updated_at":"2026-06-02T02:05:01.810024+00:00"}