{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:RDI4RZGZ2IS7TIMSM7WPIBC4BX","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"8df7b623dddd4a4075d05f7a8df1784604860d9c0984f680b8368e1ffc14d47a","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2025-01-21T18:59:00Z","title_canon_sha256":"dde1b16d0f441e24c50b224a891f44b330b55b20a8347dab0577c123db7179f2"},"schema_version":"1.0","source":{"id":"2501.12386","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2501.12386","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"arxiv_version","alias_value":"2501.12386v3","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2501.12386","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"pith_short_12","alias_value":"RDI4RZGZ2IS7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"RDI4RZGZ2IS7TIMS","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"RDI4RZGZ","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8bd2d5fb4bf30ce452e2c2a67609aeb65756b87e030a99facd028aecdd20d035","target":"graph","created_at":"2026-05-17T23:38:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experimental results demonstrate this unique design of LRC greatly improves the results of video MLLM in mainstream video understanding benchmarks (short & long), enabling the MLLM to memorize significantly longer video inputs (at least 6x longer than the original), and master specialized vision capabilities like object tracking and segmentation."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The reported gains in context length, benchmark scores, and specialized vision tasks are attributable to the long and rich context modeling components rather than differences in training data volume, model scale, or benchmark selection."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"InternVideo2.5 improves video MLLMs by incorporating dense vision task annotations via direct preference optimization and compact spatiotemporal representations via adaptive hierarchical token compression, yielding better benchmark performance, 6x longer video memory, and new capabilities likeobject"},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Long and rich context modeling lets video MLLMs process at least six times longer inputs while gaining object tracking and segmentation skills."}],"snapshot_sha256":"24e1d7d83ef4e036b61f00a07dc07aaa2f66a25da1b818be50d48360e5c0ae1a"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"1bee279f6d7aa06c23cc332c99b11764921af6e981a03f17b989cd711387185a"},"paper":{"abstract_excerpt":"This paper aims to improve the performance of video multimodal large language models (MLLM) via long and rich context (LRC) modeling. As a result, we develop a new version of InternVideo2.5 with a focus on enhancing the original MLLMs' ability to perceive fine-grained details and capture long-form temporal structure in videos. Specifically, our approach incorporates dense vision task annotations into MLLMs using direct preference optimization and develops compact spatiotemporal representations through adaptive hierarchical token compression. Experimental results demonstrate this unique design ","authors_text":"Changlian Ma, Chenting Wang, Haian Huang, Jianfei Gao, Jiashuo Yu, Kai Chen, Limin Wang, Min Dou, Wenhai Wang, Xiangyu Zeng, Xinhao Li, Yali Wang, Yinan He, Yi Wang, Yu Qiao, Ziang Yan","cross_cats":[],"headline":"Long and rich context modeling lets video MLLMs process at least six times longer inputs while gaining object tracking and segmentation skills.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2025-01-21T18:59:00Z","title":"InternVideo2.5: Empowering Video MLLMs with Long and Rich Context Modeling"},"references":{"count":37,"internal_anchors":16,"resolved_work":37,"sample":[{"cited_arxiv_id":"2501.03575","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Cosmos World Foundation Model Platform for Physical AI","work_id":"a2dba24c-318d-476a-8b21-4289c265810c","year":null},{"cited_arxiv_id":"2309.16609","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"One token to seg them all: Language instructed reasoning segmentation in videos","work_id":"3968ae62-2a48-4da8-8638-9234980a83dd","year":null},{"cited_arxiv_id":"2210.09461","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Token Merging: Your ViT But Faster","work_id":"528509bc-2611-4e7f-a772-ea14d25b6dae","year":null},{"cited_arxiv_id":"2403.17297","doi":"","is_internal_anchor":true,"ref_index":5,"title":"InternLM2 Technical Report","work_id":"dfa13e0e-1c3c-4fb6-943d-a19945bacdbe","year":null}],"snapshot_sha256":"b621385528647ffa98231a61c588bc3ff5ec6999c9cb8367a04402497dbc9689"},"source":{"id":"2501.12386","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-17T02:47:18.187982Z","id":"c5a786dc-626e-4ef8-80e5-fe0d71300634","model_set":{"reader":"grok-4.3"},"one_line_summary":"InternVideo2.5 improves video MLLMs by incorporating dense vision task annotations via direct preference optimization and compact spatiotemporal representations via adaptive hierarchical token compression, yielding better benchmark performance, 6x longer video memory, and new capabilities likeobject","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Long and rich context modeling lets video MLLMs process at least six times longer inputs while gaining object tracking and segmentation skills.","strongest_claim":"Experimental results demonstrate this unique design of LRC greatly improves the results of video MLLM in mainstream video understanding benchmarks (short & long), enabling the MLLM to memorize significantly longer video inputs (at least 6x longer than the original), and master specialized vision capabilities like object tracking and segmentation.","weakest_assumption":"The reported gains in context length, benchmark scores, and specialized vision tasks are attributable to the long and rich context modeling components rather than differences in training data volume, model scale, or benchmark selection."}},"verdict_id":"c5a786dc-626e-4ef8-80e5-fe0d71300634"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:431887dbe42647cd62f5b28bdeaf9c5c18957feebd2e69695832f35b2f53476b","target":"record","created_at":"2026-05-17T23:38:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"8df7b623dddd4a4075d05f7a8df1784604860d9c0984f680b8368e1ffc14d47a","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2025-01-21T18:59:00Z","title_canon_sha256":"dde1b16d0f441e24c50b224a891f44b330b55b20a8347dab0577c123db7179f2"},"schema_version":"1.0","source":{"id":"2501.12386","kind":"arxiv","version":3}},"canonical_sha256":"88d1c8e4d9d225f9a19267ecf4045c0ddf7862abce6668f060d7fca71f012c87","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"88d1c8e4d9d225f9a19267ecf4045c0ddf7862abce6668f060d7fca71f012c87","first_computed_at":"2026-05-17T23:38:15.344963Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:15.344963Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"bPF1S2+dVntQzuvqENyLDCFdlLy6G8DvYg7w/wmVtlMkTgHf44c7nwkhbxE45tKEFcCLO9A1DmgOG1NJhXIrCA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:15.345519Z","signed_message":"canonical_sha256_bytes"},"source_id":"2501.12386","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:431887dbe42647cd62f5b28bdeaf9c5c18957feebd2e69695832f35b2f53476b","sha256:8bd2d5fb4bf30ce452e2c2a67609aeb65756b87e030a99facd028aecdd20d035"],"state_sha256":"fb206156aa1705a6026cc3200e27c562a908347cda3eb600077bd788966708e0"}