{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:JYME5X3PIZH3HHW2Q6NRPWEO5Z","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"fdf92f640d25835c64e3b0925c955c67568a9c4430d921efe72886ad8b4d3d45","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-21T08:04:02Z","title_canon_sha256":"7cc9f0fbdad3f022736064eafce86de064a9f35eee9f802069c85b2590541a18"},"schema_version":"1.0","source":{"id":"2604.19193","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.19193","created_at":"2026-06-26T01:15:18Z"},{"alias_kind":"arxiv_version","alias_value":"2604.19193v1","created_at":"2026-06-26T01:15:18Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.19193","created_at":"2026-06-26T01:15:18Z"},{"alias_kind":"pith_short_12","alias_value":"JYME5X3PIZH3","created_at":"2026-06-26T01:15:18Z"},{"alias_kind":"pith_short_16","alias_value":"JYME5X3PIZH3HHW2","created_at":"2026-06-26T01:15:18Z"},{"alias_kind":"pith_short_8","alias_value":"JYME5X3P","created_at":"2026-06-26T01:15:18Z"}],"graph_snapshots":[{"event_id":"sha256:4defcf1100ba0e7f99a4924862144dd5c59e3623eef8adbb6c60c3bee88c6708","target":"graph","created_at":"2026-06-26T01:15:18Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"while state-of-the-art (SOTA) video models, such as Seedance 2.0, demonstrate competence on certain understanding and reasoning subtasks, they fall substantially short with logically grounded and interactive generation tasks (achieving success rates <25% and ~0%, respectively)"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the manually annotated CLVG-Bench tasks and the Adaptive Video Evaluator (AVE) accurately capture and measure 'true multimodal reasoning' in a manner that aligns with human expert perception without bias or incompleteness."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Current video models succeed on basic understanding but achieve under 25% success on logically grounded generation and near 0% on interactive generation, exposing gaps in multimodal reasoning."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"State-of-the-art video models handle basic understanding but fail on logically grounded and interactive video generation tasks."}],"snapshot_sha256":"52bbf7abac39505bb2440a3588517c05dd48cad7292fe6a01203f8deb3e12e02"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-20T03:14:49.503249Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2604.19193/integrity.json","findings":[],"snapshot_sha256":"4b67cd8f60ae41105430b0931977282d1a9ddf6cb3bb2f3d8ffd0c72a85e2779","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Despite remarkable progress toward general-purpose video models, a critical question remains unanswered: how far are these models from achieving true multimodal reasoning? Existing benchmarks fail to address this question rigorously, as they remain constrained by straightforward task designs and fragmented evaluation metrics that neglect complex multimodal reasoning. To bridge this gap, we introduce CLVG-Bench, an evaluation framework designed to probe video models' zero-shot reasoning capabilities via Context Learning in Video Generation. CLVG-Bench comprises more than 1,000 high-quality, man","authors_text":"Daoan Zhang, Dezhi YU, Jianhui Wei, Jie Tan, Songtao Jiang, Wei Xu, Xiaotian Zhang, Yan Zhang, Yichen Li, Yuan Wang, Ziyi Chen, Zuozhu Liu","cross_cats":[],"headline":"State-of-the-art video models handle basic understanding but fail on logically grounded and interactive video generation tasks.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-21T08:04:02Z","title":"How Far Are Video Models from True Multimodal Reasoning?"},"references":{"count":103,"internal_anchors":28,"resolved_work":103,"sample":[{"cited_arxiv_id":"2507.19457","doi":"","is_internal_anchor":true,"ref_index":1,"title":"GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning","work_id":"40b60d06-dc1c-4799-b75d-ff1eca653049","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Oxford University Press (1984) 6","work_id":"0359ca9c-6a9c-49c7-9b25-ad2a709eb341","year":1984},{"cited_arxiv_id":"2406.03520","doi":"","is_internal_anchor":true,"ref_index":3,"title":"VideoPhy: Evaluating Physical Commonsense for Video Generation","work_id":"27ed795c-abbe-4de1-9a7a-2ecf39c354f3","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Videophy-2: A challenging action-centric physical commonsense evaluation in video generation","work_id":"4dfe3980-dfd5-4917-95e9-179c29e4eb18","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Bordwell, D., Thompson, K., Smith, J.: Film art: An introduction, vol. 7. McGraw- Hill New York (2008) 6","work_id":"056c85b2-c274-4c5f-ab70-f8ef335f814c","year":2008}],"snapshot_sha256":"24cb676dbadefd6b7a2415aaec5b5a0848a7c04842f88fb7e9b75534b71df1e4"},"source":{"id":"2604.19193","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-10T02:41:12.991059Z","id":"6d395670-af5a-41ab-b28b-5b8cf4557c45","model_set":{"reader":"grok-4.3"},"one_line_summary":"Current video models succeed on basic understanding but achieve under 25% success on logically grounded generation and near 0% on interactive generation, exposing gaps in multimodal reasoning.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"State-of-the-art video models handle basic understanding but fail on logically grounded and interactive video generation tasks.","strongest_claim":"while state-of-the-art (SOTA) video models, such as Seedance 2.0, demonstrate competence on certain understanding and reasoning subtasks, they fall substantially short with logically grounded and interactive generation tasks (achieving success rates <25% and ~0%, respectively)","weakest_assumption":"That the manually annotated CLVG-Bench tasks and the Adaptive Video Evaluator (AVE) accurately capture and measure 'true multimodal reasoning' in a manner that aligns with human expert perception without bias or incompleteness."}},"verdict_id":"6d395670-af5a-41ab-b28b-5b8cf4557c45"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:69ce8f7366aef8e355d6feb25d881a99f55b75d00af45d0f93518cb508d47e09","target":"record","created_at":"2026-06-26T01:15:18Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"fdf92f640d25835c64e3b0925c955c67568a9c4430d921efe72886ad8b4d3d45","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-21T08:04:02Z","title_canon_sha256":"7cc9f0fbdad3f022736064eafce86de064a9f35eee9f802069c85b2590541a18"},"schema_version":"1.0","source":{"id":"2604.19193","kind":"arxiv","version":1}},"canonical_sha256":"4e184edf6f464fb39eda879b17d88eee7b5845e06b3a0ead8c45f9eb3771784f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"4e184edf6f464fb39eda879b17d88eee7b5845e06b3a0ead8c45f9eb3771784f","first_computed_at":"2026-06-26T01:15:18.907568Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-26T01:15:18.907568Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"31BHe44ITtyfofjyvb2C058d58Fm0Naqn/cxLazKCsyegbhLATwgPu6VthjCBRJqDNQ5YHaGeliMbYDOIW/lAg==","signature_status":"signed_v1","signed_at":"2026-06-26T01:15:18.908030Z","signed_message":"canonical_sha256_bytes"},"source_id":"2604.19193","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:69ce8f7366aef8e355d6feb25d881a99f55b75d00af45d0f93518cb508d47e09","sha256:4defcf1100ba0e7f99a4924862144dd5c59e3623eef8adbb6c60c3bee88c6708"],"state_sha256":"afe178d4f5de23eeb665288b17698e97f4e95320e3453c4def9572ec9c419150"}