{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:G3QQ7POGVFS2KAENSVZT6KAVWH","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"76a3f6d7ee634029e7ff8b78b41a96b2433bb8e5c205dadbadab367e80e0734e","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T13:40:31Z","title_canon_sha256":"322f5e2aeeb7abcf2eaa5cfb19c085f9141a433850de629f89519f63ac7d2135"},"schema_version":"1.0","source":{"id":"2605.13527","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13527","created_at":"2026-05-18T02:44:24Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13527v2","created_at":"2026-05-18T02:44:24Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13527","created_at":"2026-05-18T02:44:24Z"},{"alias_kind":"pith_short_12","alias_value":"G3QQ7POGVFS2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"G3QQ7POGVFS2KAEN","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"G3QQ7POG","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:b12874d69c61da084059ff7dc7732930a3772027f15652a3e9c4228dfaf9b06b","target":"graph","created_at":"2026-05-18T02:44:24Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experiments across GUI and game-based visual-agent benchmarks show that MMSkills consistently improve both frontier and smaller multimodal agents, suggesting that external multimodal procedural knowledge complements model-internal priors."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the generated multimodal skills (state cards and keyframes) can be consulted at inference time without excessive image context or over-anchoring to reference screenshots, as stated in the problem formalization."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MMSkills creates compact multimodal skill packages from trajectories and uses a branch-loaded agent to improve visual decision-making on GUI and game benchmarks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"MMSkills equips visual agents with reusable packages of textual procedures, state cards, and multi-view keyframes derived from public trajectories."}],"snapshot_sha256":"6990a2f14f9297a938c64c8b702134c495a23e7c1f976aac4c750ef40dcb9c18"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"186e2fdfce257022b59ff4677db6f9dbf000940cfb3297498f1499fe429f58c6"},"paper":{"abstract_excerpt":"Reusable skills have become a core substrate for improving agent capabilities, yet most existing skill packages encode reusable behavior primarily as textual prompts, executable code, or learned routines. For visual agents, however, procedural knowledge is inherently multimodal: reuse depends not only on what operation to perform, but also on recognizing the relevant state, interpreting visual evidence of progress or failure, and deciding what to do next. We formalize this requirement as multimodal procedural knowledge and address three practical challenges: (I) what a multimodal skill package","authors_text":"Jianghao Lin, Kangning Zhang, Lingyue Fu, Qingyao Li, Shijian Wang, Shuai Shao, Weinan Zhang, Weiwen Liu, Wenxiang Jiao, Yong Yu, Yuan Lu","cross_cats":[],"headline":"MMSkills equips visual agents with reusable packages of textual procedures, state cards, and multi-view keyframes derived from public trajectories.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T13:40:31Z","title":"MMSkills: Towards Multimodal Skills for General Visual Agents"},"references":{"count":40,"internal_anchors":21,"resolved_work":40,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Agent s: An open agentic framework that uses computers like a human","work_id":"743d8ba1-e742-4bc5-9035-81021a53b57a","year":null},{"cited_arxiv_id":"2204.01691","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Do As I Can, Not As I Say: Grounding Language in Robotic Affordances","work_id":"037320f1-b0a9-4cbe-a639-bfb25409ce71","year":null},{"cited_arxiv_id":"2603.02766","doi":"","is_internal_anchor":true,"ref_index":3,"title":"EvoSkill: Automated Skill Discovery for Multi-Agent Systems","work_id":"94c2c7b2-0c94-4b32-90cc-bc154d192850","year":null},{"cited_arxiv_id":"2511.21631","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","year":null},{"cited_arxiv_id":"2308.14508","doi":"","is_internal_anchor":true,"ref_index":5,"title":"LongBench: A Bilingual, Multitask Benchmark for Long Context Understanding","work_id":"ba7831c4-9427-4e0e-a5c1-4e98511f4b53","year":null}],"snapshot_sha256":"d85502b48760f2d85b2c0b9aef49e4ececd2dda8f0233f388c562f1de307f9ab"},"source":{"id":"2605.13527","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T05:55:26.129211Z","id":"def43f87-7d22-42a4-b4a8-13aef74f878d","model_set":{"reader":"grok-4.3"},"one_line_summary":"MMSkills creates compact multimodal skill packages from trajectories and uses a branch-loaded agent to improve visual decision-making on GUI and game benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"MMSkills equips visual agents with reusable packages of textual procedures, state cards, and multi-view keyframes derived from public trajectories.","strongest_claim":"Experiments across GUI and game-based visual-agent benchmarks show that MMSkills consistently improve both frontier and smaller multimodal agents, suggesting that external multimodal procedural knowledge complements model-internal priors.","weakest_assumption":"That the generated multimodal skills (state cards and keyframes) can be consulted at inference time without excessive image context or over-anchoring to reference screenshots, as stated in the problem formalization."}},"verdict_id":"def43f87-7d22-42a4-b4a8-13aef74f878d"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a00acbc7c9f2fc5dfcbd4996f750e681efbde77b73bbf27fec5d1d20fb4f1ce8","target":"record","created_at":"2026-05-18T02:44:24Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"76a3f6d7ee634029e7ff8b78b41a96b2433bb8e5c205dadbadab367e80e0734e","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T13:40:31Z","title_canon_sha256":"322f5e2aeeb7abcf2eaa5cfb19c085f9141a433850de629f89519f63ac7d2135"},"schema_version":"1.0","source":{"id":"2605.13527","kind":"arxiv","version":2}},"canonical_sha256":"36e10fbdc6a965a5008d95733f2815b1c1d841e6e556ada5757e391960041f88","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"36e10fbdc6a965a5008d95733f2815b1c1d841e6e556ada5757e391960041f88","first_computed_at":"2026-05-18T02:44:24.291370Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:44:24.291370Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"/9DtmnbpsGtw1oCdULa+pNfnp3XWbUl7lnVu3Nhdohh4417DOt9/G8jQOQLYzBf2+MSnonCgQiz0XnseuGcxBg==","signature_status":"signed_v1","signed_at":"2026-05-18T02:44:24.291803Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13527","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a00acbc7c9f2fc5dfcbd4996f750e681efbde77b73bbf27fec5d1d20fb4f1ce8","sha256:b12874d69c61da084059ff7dc7732930a3772027f15652a3e9c4228dfaf9b06b"],"state_sha256":"3c0921db74af10a3f616c2e4120609debbe9fa11b503a8d313cfbac07968385f"}