{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:7XCJPIKRCOGJ7CQH6DK2A4AIP5","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f1688adf80f2975bf8fa10833ac208bb0ccc94eaf0296ecdbc44f220013acc4a","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T09:19:22Z","title_canon_sha256":"94b40a10610ce69c1fd17eef5f72ab6985bb50e9173e1bff79511a606bf95cf8"},"schema_version":"1.0","source":{"id":"2605.13228","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13228","created_at":"2026-05-18T02:44:49Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13228v1","created_at":"2026-05-18T02:44:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13228","created_at":"2026-05-18T02:44:49Z"},{"alias_kind":"pith_short_12","alias_value":"7XCJPIKRCOGJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"7XCJPIKRCOGJ7CQH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"7XCJPIKR","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:a34cdbcba9c4950bb59632b9d2f9f4a367e43e5a667681b729f471e120787dda","target":"graph","created_at":"2026-05-18T02:44:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experiments on MVBench, MLVU, and Video-MME w/o sub. show that ReTool-Video consistently outperforms strong baselines. Further analysis demonstrates that recursive grounding and fine-grained meta tools improve the stability and effectiveness of complex video understanding."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That high-level video intents can be reliably matched or decomposed by the resolver into the 134 registered tools without introducing errors, excessive recursion, or loss of reasoning fidelity."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"ReTool-Video uses a 134-tool meta-augmented library and recursive grounding to translate abstract video intents into fine-grained multimodal operations, outperforming baselines on MVBench, MLVU, and Video-MME."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"ReTool-Video recursively grounds abstract video intents into executable tool chains using a library of 134 meta-augmented tools."}],"snapshot_sha256":"da242774fea16a4a4d0c5680a18ae5c657dc379c6a5a17aabceadd0d270362f3"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Video understanding requires active evidence seeking, motivating tool-augmented video agents for temporal reasoning, cross-modal understanding, and complex question answering. Existing video agents have improved video reasoning with retrieval, memory, frame inspection, and verifier tools, but they still face two limitations: (1) a coarse tool space that lacks fine-grained operations for compositional reasoning; and (2) a flat action space that forces high-level video intents into primitive executable tool calls. In this paper, we address these challenges with two complementary designs. First, ","authors_text":"Changjian Wang, Guohui Xiang, Jiang Zhong, Junnan Zhu, KaiWen Wei, Nayu Liu, Rongzhen Li, Ruirui Chen, Xiao Liu","cross_cats":["cs.AI"],"headline":"ReTool-Video recursively grounds abstract video intents into executable tool chains using a library of 134 meta-augmented tools.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T09:19:22Z","title":"ReTool-Video: Recursive Tool-Using Video Agents with Meta-Augmented Tool Grounding"},"references":{"count":89,"internal_anchors":11,"resolved_work":89,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Model System Cards","work_id":"48f69590-3d62-41e5-87e8-e792337e716a","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Sharegpt4video: Improving video understanding and generation with better captions.Advances in Neural Information Processing Systems, 37:19472–19495, 2024","work_id":"9b40200f-b968-41d0-b007-b4deebd1b256","year":2024},{"cited_arxiv_id":"2406.07476","doi":"","is_internal_anchor":true,"ref_index":3,"title":"VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs","work_id":"ccfc3f89-c510-45f1-8a35-ed1a56c0ae5c","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Video question answering with procedural programs","work_id":"a31fff1b-ca21-4c20-bbac-fbad471f690a","year":2024},{"cited_arxiv_id":"2507.06261","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","year":2025}],"snapshot_sha256":"64acdeda5f654aa9d140477dac2d5590cd096eefd333f32db4c0d6996eb8ab0b"},"source":{"id":"2605.13228","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T20:04:07.533132Z","id":"0a4c5d12-6012-421a-ab93-d5da07c29031","model_set":{"reader":"grok-4.3"},"one_line_summary":"ReTool-Video uses a 134-tool meta-augmented library and recursive grounding to translate abstract video intents into fine-grained multimodal operations, outperforming baselines on MVBench, MLVU, and Video-MME.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"ReTool-Video recursively grounds abstract video intents into executable tool chains using a library of 134 meta-augmented tools.","strongest_claim":"Experiments on MVBench, MLVU, and Video-MME w/o sub. show that ReTool-Video consistently outperforms strong baselines. Further analysis demonstrates that recursive grounding and fine-grained meta tools improve the stability and effectiveness of complex video understanding.","weakest_assumption":"That high-level video intents can be reliably matched or decomposed by the resolver into the 134 registered tools without introducing errors, excessive recursion, or loss of reasoning fidelity."}},"verdict_id":"0a4c5d12-6012-421a-ab93-d5da07c29031"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:9d327badbe300131eb7fad514f71de2132ca90a6a566523b8794a38e66a8971e","target":"record","created_at":"2026-05-18T02:44:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f1688adf80f2975bf8fa10833ac208bb0ccc94eaf0296ecdbc44f220013acc4a","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T09:19:22Z","title_canon_sha256":"94b40a10610ce69c1fd17eef5f72ab6985bb50e9173e1bff79511a606bf95cf8"},"schema_version":"1.0","source":{"id":"2605.13228","kind":"arxiv","version":1}},"canonical_sha256":"fdc497a151138c9f8a07f0d5a070087f6ffd930ccc1a1e41edb1c2f35c299013","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"fdc497a151138c9f8a07f0d5a070087f6ffd930ccc1a1e41edb1c2f35c299013","first_computed_at":"2026-05-18T02:44:49.613036Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:44:49.613036Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"t5aah8i1G5xMtf678yP/C1y23LNnZRneviiHsyfcga67hBMTv47qQRkMjhzg1ixe+/+/iT6c3z9UMwsD5aw5Ag==","signature_status":"signed_v1","signed_at":"2026-05-18T02:44:49.613506Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13228","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:9d327badbe300131eb7fad514f71de2132ca90a6a566523b8794a38e66a8971e","sha256:a34cdbcba9c4950bb59632b9d2f9f4a367e43e5a667681b729f471e120787dda"],"state_sha256":"8c92ccad1d02b87323e61d9aabf338302b45a8ea471ad67fff6c7da250a49547"}