{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:6KGN5CWDOJPCJIRMNPLXFEL6XK","short_pith_number":"pith:6KGN5CWD","schema_version":"1.0","canonical_sha256":"f28cde8ac3725e24a22c6bd772917ebaad503b6ff104cdc66de1ebeb4d7d3738","source":{"kind":"arxiv","id":"2603.14145","version":2},"attestation_state":"computed","paper":{"title":"MMOU: A Massive Multi-Task Omni Understanding and Reasoning Benchmark for Long and Complex Real-World Videos","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.CL","authors_text":"Abhinav Shrivastava, Andrew Tao, Arushi Goel, Bryan Catanzaro, Dinesh Manocha, James Case, Kaousheik Jayakumar, Karan Sapra, Katie Lyons, Kevin J. Shih, Lasha Koroshinadze, Mohammad Shoeybi, Nishit Anand, Ramani Duraiswami, Siddharth Gururani, Sreyan Ghosh, Vatsal Agarwal, Wei Ping, Yao Xu","submitted_at":"2026-03-14T22:28:38Z","abstract_excerpt":"Multimodal Large Language Models (MLLMs) have shown strong performance in visual and audio understanding when evaluated in isolation. However, their ability to jointly reason over omni-modal (visual, audio, and textual) signals in long and complex videos remains largely unexplored. We introduce MMOU, a new benchmark designed to systematically evaluate multimodal understanding and reasoning under these challenging, real-world conditions. MMOU consists of 20,000 carefully curated questions paired with 11877 web-collected videos of varying length, spanning diverse domains and exhibiting rich, tig"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2603.14145","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-03-14T22:28:38Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"736af3a480f5bdc8b6a64a41b8aaa315649bcdcf102292349865855719d95c99","abstract_canon_sha256":"616629d2b228614f8009e34e4909395d055dac1106fa6e639df36bd4871e3a79"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T02:12:47.900544Z","signature_b64":"PG4TH7rfeQFjcEXWf/2MH9nZ2kSpuk/mnZYuSECUWAUIAna8YA0aRJd3V+RlXPqsWcOm2sa4VW7c56rEuowiAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f28cde8ac3725e24a22c6bd772917ebaad503b6ff104cdc66de1ebeb4d7d3738","last_reissued_at":"2026-06-23T02:12:47.900063Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T02:12:47.900063Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"MMOU: A Massive Multi-Task Omni Understanding and Reasoning Benchmark for Long and Complex Real-World Videos","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.CL","authors_text":"Abhinav Shrivastava, Andrew Tao, Arushi Goel, Bryan Catanzaro, Dinesh Manocha, James Case, Kaousheik Jayakumar, Karan Sapra, Katie Lyons, Kevin J. Shih, Lasha Koroshinadze, Mohammad Shoeybi, Nishit Anand, Ramani Duraiswami, Siddharth Gururani, Sreyan Ghosh, Vatsal Agarwal, Wei Ping, Yao Xu","submitted_at":"2026-03-14T22:28:38Z","abstract_excerpt":"Multimodal Large Language Models (MLLMs) have shown strong performance in visual and audio understanding when evaluated in isolation. However, their ability to jointly reason over omni-modal (visual, audio, and textual) signals in long and complex videos remains largely unexplored. We introduce MMOU, a new benchmark designed to systematically evaluate multimodal understanding and reasoning under these challenging, real-world conditions. MMOU consists of 20,000 carefully curated questions paired with 11877 web-collected videos of varying length, spanning diverse domains and exhibiting rich, tig"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.14145","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2603.14145/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2603.14145","created_at":"2026-06-23T02:12:47.900121+00:00"},{"alias_kind":"arxiv_version","alias_value":"2603.14145v2","created_at":"2026-06-23T02:12:47.900121+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.14145","created_at":"2026-06-23T02:12:47.900121+00:00"},{"alias_kind":"pith_short_12","alias_value":"6KGN5CWDOJPC","created_at":"2026-06-23T02:12:47.900121+00:00"},{"alias_kind":"pith_short_16","alias_value":"6KGN5CWDOJPCJIRM","created_at":"2026-06-23T02:12:47.900121+00:00"},{"alias_kind":"pith_short_8","alias_value":"6KGN5CWD","created_at":"2026-06-23T02:12:47.900121+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"2606.00579","citing_title":"Sandboxed Coding Agents are Competitive Omni-modal Task Solvers","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17360","citing_title":"Omni-DuplexEval: Evaluating Real-time Duplex Omni-modal Interaction","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08762","citing_title":"Omni-DeepSearch: A Benchmark for Audio-Driven Omni-Modal Deep Search","ref_index":18,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/6KGN5CWDOJPCJIRMNPLXFEL6XK","json":"https://pith.science/pith/6KGN5CWDOJPCJIRMNPLXFEL6XK.json","graph_json":"https://pith.science/api/pith-number/6KGN5CWDOJPCJIRMNPLXFEL6XK/graph.json","events_json":"https://pith.science/api/pith-number/6KGN5CWDOJPCJIRMNPLXFEL6XK/events.json","paper":"https://pith.science/paper/6KGN5CWD"},"agent_actions":{"view_html":"https://pith.science/pith/6KGN5CWDOJPCJIRMNPLXFEL6XK","download_json":"https://pith.science/pith/6KGN5CWDOJPCJIRMNPLXFEL6XK.json","view_paper":"https://pith.science/paper/6KGN5CWD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2603.14145&json=true","fetch_graph":"https://pith.science/api/pith-number/6KGN5CWDOJPCJIRMNPLXFEL6XK/graph.json","fetch_events":"https://pith.science/api/pith-number/6KGN5CWDOJPCJIRMNPLXFEL6XK/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/6KGN5CWDOJPCJIRMNPLXFEL6XK/action/timestamp_anchor","attest_storage":"https://pith.science/pith/6KGN5CWDOJPCJIRMNPLXFEL6XK/action/storage_attestation","attest_author":"https://pith.science/pith/6KGN5CWDOJPCJIRMNPLXFEL6XK/action/author_attestation","sign_citation":"https://pith.science/pith/6KGN5CWDOJPCJIRMNPLXFEL6XK/action/citation_signature","submit_replication":"https://pith.science/pith/6KGN5CWDOJPCJIRMNPLXFEL6XK/action/replication_record"}},"created_at":"2026-06-23T02:12:47.900121+00:00","updated_at":"2026-06-23T02:12:47.900121+00:00"}