{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:QYKS73QP5M7UGDRI3L336YMWZN","short_pith_number":"pith:QYKS73QP","canonical_record":{"source":{"id":"2603.24649","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-25T17:33:58Z","cross_cats_sorted":[],"title_canon_sha256":"fec6e1ea2e5e1619efb18a52998f2632d6a64d01d7191ace85214bbaa8a0c28c","abstract_canon_sha256":"b982ecb2d05155cecaba5c4f6dbf5b75c53ba230e24fe2c3c536f6699d47585c"},"schema_version":"1.0"},"canonical_sha256":"86152fee0feb3f430e28daf7bf6196cb7a0c1d8a97479e649f107f6ed32915ce","source":{"kind":"arxiv","id":"2603.24649","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.24649","created_at":"2026-05-18T02:44:30Z"},{"alias_kind":"arxiv_version","alias_value":"2603.24649v2","created_at":"2026-05-18T02:44:30Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.24649","created_at":"2026-05-18T02:44:30Z"},{"alias_kind":"pith_short_12","alias_value":"QYKS73QP5M7U","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"QYKS73QP5M7UGDRI","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"QYKS73QP","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:QYKS73QP5M7UGDRI3L336YMWZN","target":"record","payload":{"canonical_record":{"source":{"id":"2603.24649","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-25T17:33:58Z","cross_cats_sorted":[],"title_canon_sha256":"fec6e1ea2e5e1619efb18a52998f2632d6a64d01d7191ace85214bbaa8a0c28c","abstract_canon_sha256":"b982ecb2d05155cecaba5c4f6dbf5b75c53ba230e24fe2c3c536f6699d47585c"},"schema_version":"1.0"},"canonical_sha256":"86152fee0feb3f430e28daf7bf6196cb7a0c1d8a97479e649f107f6ed32915ce","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:44:30.673104Z","signature_b64":"JIf1NVKu+u5Uu3xHFR8asZQ2MMsPJSc5R0kdmzioBAp5vwbeblgFm7LBKXpZz1RIf1qR24xfJyZiQC8ZwNDTBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"86152fee0feb3f430e28daf7bf6196cb7a0c1d8a97479e649f107f6ed32915ce","last_reissued_at":"2026-05-18T02:44:30.672619Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:44:30.672619Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2603.24649","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:44:30Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"FlbzpiGsacLn+BbRDmSxXtiWxG4IVFmb22mKJ5B7m5TmsodaWBpFaApPDXRJRs284cCKTi8vGtvoyF79M8LgDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-03T10:06:13.446825Z"},"content_sha256":"ac5e576b016831de97c2c72616a8fd17b1f72183adad1eb30ec66e2471808eac","schema_version":"1.0","event_id":"sha256:ac5e576b016831de97c2c72616a8fd17b1f72183adad1eb30ec66e2471808eac"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:QYKS73QP5M7UGDRI3L336YMWZN","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"MedOpenClaw and MedFlowBench: Auditing Medical Agents in Full-Study Workflows","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Medical agents must navigate full imaging studies and submit auditable evidence, where performance drops sharply compared to answer-only evaluation.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Benedikt Wiestler, Che Liu, Chengzhi Shen, Daguang Xu, Daniel Rueckert, Jiayuan Zhu, Jiazhen Pan, Jingpei Wu, Junde Wu, Min Xu, Weixiang Shen, Xiao Han, Yanzhu Hu, Yueming Jin, Zongyue Li","submitted_at":"2026-03-25T17:33:58Z","abstract_excerpt":"Medical imaging benchmarks often evaluate VLMs on pre-selected 2D images, slices, crops, or patches, making evaluation closer to visual recognition. Real clinical workflows impose a different burden: readers must search through complete studies, operate imaging software, navigate across slices and magnifications, and document visual evidence that can be audited. We argue that this evidence-producing workflow is a critical missing evaluation axis for medical imaging agents. To study it, we introduce MedFlowBench, a full-study benchmark for VLM agents, together with MedOpenClaw, a controlled and"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Across evaluated models, final answer-only scoring gives an overly optimistic picture: when answers must also be supported by correct evidence, performance drops substantially on complex workflows.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the automated evidence checking against withheld masks and annotations accurately captures the requirements of real clinical auditing and that the controlled runtime faithfully represents the complexity of operating actual medical imaging software.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"MedFlowBench evaluates VLM agents on full radiology and pathology studies by requiring both task answers and verifiable evidence like key slices and regions of interest, revealing that answer-only scores overestimate performance.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Medical agents must navigate full imaging studies and submit auditable evidence, where performance drops sharply compared to answer-only evaluation.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"9bbca78cc6a243a03dd1159a72e28c9c1d0c9d1be24d1e1523f7d181b1ce4567"},"source":{"id":"2603.24649","kind":"arxiv","version":2},"verdict":{"id":"b17730ae-9588-4dc6-b9f7-4e6442e750b8","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T00:04:26.523916Z","strongest_claim":"Across evaluated models, final answer-only scoring gives an overly optimistic picture: when answers must also be supported by correct evidence, performance drops substantially on complex workflows.","one_line_summary":"MedFlowBench evaluates VLM agents on full radiology and pathology studies by requiring both task answers and verifiable evidence like key slices and regions of interest, revealing that answer-only scores overestimate performance.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the automated evidence checking against withheld masks and annotations accurately captures the requirements of real clinical auditing and that the controlled runtime faithfully represents the complexity of operating actual medical imaging software.","pith_extraction_headline":"Medical agents must navigate full imaging studies and submit auditable evidence, where performance drops sharply compared to answer-only evaluation."},"references":{"count":55,"sample":[{"doi":"10.1038/sdata.2018.251","year":2018,"title":"URLhttps://www.nature.com/articles/sdata2018251","work_id":"7b69c607-ae16-42bd-8ad3-a321ba6fbf5a","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Slake: A semantically- labeled knowledge-enhanced dataset for medical visual question answering","work_id":"04093b1b-9de8-4b51-8f01-acb0fb1e0b10","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Huatuogpt-vision, towards injecting medical visual knowledge into multimodal llms at scale","work_id":"dd32b8a1-4ad4-4155-b031-c317b565c6e7","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Medxpertqa: Benchmarking expert-level medical reasoning and understanding","work_id":"8be80372-d3ea-4961-a7df-4a2807abc590","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Medxpertqa dataset card","work_id":"1c125ac9-c747-49bb-91f8-7f0df325d881","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":55,"snapshot_sha256":"c4d90cf825ddcfd3cba1004625a1138368762355a47e206479ad48dd9d3eca1d","internal_anchors":3},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"b17730ae-9588-4dc6-b9f7-4e6442e750b8"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:44:30Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"VgDaES4lUo3Rhvi8gZ6tYJ5+Qt+1rFkNjraZlI/69pfI7YGGEDduUnKlI8Tik3khu7B+lkqUhwJoWIPQcyxKAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-03T10:06:13.447375Z"},"content_sha256":"f6374b61e28eeb722b1639401bb311dbe8310fbe88b4509e336f0faa4be83c40","schema_version":"1.0","event_id":"sha256:f6374b61e28eeb722b1639401bb311dbe8310fbe88b4509e336f0faa4be83c40"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/QYKS73QP5M7UGDRI3L336YMWZN/bundle.json","state_url":"https://pith.science/pith/QYKS73QP5M7UGDRI3L336YMWZN/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/QYKS73QP5M7UGDRI3L336YMWZN/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-03T10:06:13Z","links":{"resolver":"https://pith.science/pith/QYKS73QP5M7UGDRI3L336YMWZN","bundle":"https://pith.science/pith/QYKS73QP5M7UGDRI3L336YMWZN/bundle.json","state":"https://pith.science/pith/QYKS73QP5M7UGDRI3L336YMWZN/state.json","well_known_bundle":"https://pith.science/.well-known/pith/QYKS73QP5M7UGDRI3L336YMWZN/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:QYKS73QP5M7UGDRI3L336YMWZN","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"b982ecb2d05155cecaba5c4f6dbf5b75c53ba230e24fe2c3c536f6699d47585c","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-25T17:33:58Z","title_canon_sha256":"fec6e1ea2e5e1619efb18a52998f2632d6a64d01d7191ace85214bbaa8a0c28c"},"schema_version":"1.0","source":{"id":"2603.24649","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.24649","created_at":"2026-05-18T02:44:30Z"},{"alias_kind":"arxiv_version","alias_value":"2603.24649v2","created_at":"2026-05-18T02:44:30Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.24649","created_at":"2026-05-18T02:44:30Z"},{"alias_kind":"pith_short_12","alias_value":"QYKS73QP5M7U","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"QYKS73QP5M7UGDRI","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"QYKS73QP","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:f6374b61e28eeb722b1639401bb311dbe8310fbe88b4509e336f0faa4be83c40","target":"graph","created_at":"2026-05-18T02:44:30Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Across evaluated models, final answer-only scoring gives an overly optimistic picture: when answers must also be supported by correct evidence, performance drops substantially on complex workflows."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the automated evidence checking against withheld masks and annotations accurately captures the requirements of real clinical auditing and that the controlled runtime faithfully represents the complexity of operating actual medical imaging software."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MedFlowBench evaluates VLM agents on full radiology and pathology studies by requiring both task answers and verifiable evidence like key slices and regions of interest, revealing that answer-only scores overestimate performance."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Medical agents must navigate full imaging studies and submit auditable evidence, where performance drops sharply compared to answer-only evaluation."}],"snapshot_sha256":"9bbca78cc6a243a03dd1159a72e28c9c1d0c9d1be24d1e1523f7d181b1ce4567"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Medical imaging benchmarks often evaluate VLMs on pre-selected 2D images, slices, crops, or patches, making evaluation closer to visual recognition. Real clinical workflows impose a different burden: readers must search through complete studies, operate imaging software, navigate across slices and magnifications, and document visual evidence that can be audited. We argue that this evidence-producing workflow is a critical missing evaluation axis for medical imaging agents. To study it, we introduce MedFlowBench, a full-study benchmark for VLM agents, together with MedOpenClaw, a controlled and","authors_text":"Benedikt Wiestler, Che Liu, Chengzhi Shen, Daguang Xu, Daniel Rueckert, Jiayuan Zhu, Jiazhen Pan, Jingpei Wu, Junde Wu, Min Xu, Weixiang Shen, Xiao Han, Yanzhu Hu, Yueming Jin, Zongyue Li","cross_cats":[],"headline":"Medical agents must navigate full imaging studies and submit auditable evidence, where performance drops sharply compared to answer-only evaluation.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-25T17:33:58Z","title":"MedOpenClaw and MedFlowBench: Auditing Medical Agents in Full-Study Workflows"},"references":{"count":55,"internal_anchors":3,"resolved_work":55,"sample":[{"cited_arxiv_id":"","doi":"10.1038/sdata.2018.251","is_internal_anchor":false,"ref_index":1,"title":"URLhttps://www.nature.com/articles/sdata2018251","work_id":"7b69c607-ae16-42bd-8ad3-a321ba6fbf5a","year":2018},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Slake: A semantically- labeled knowledge-enhanced dataset for medical visual question answering","work_id":"04093b1b-9de8-4b51-8f01-acb0fb1e0b10","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Huatuogpt-vision, towards injecting medical visual knowledge into multimodal llms at scale","work_id":"dd32b8a1-4ad4-4155-b031-c317b565c6e7","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Medxpertqa: Benchmarking expert-level medical reasoning and understanding","work_id":"8be80372-d3ea-4961-a7df-4a2807abc590","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Medxpertqa dataset card","work_id":"1c125ac9-c747-49bb-91f8-7f0df325d881","year":2025}],"snapshot_sha256":"c4d90cf825ddcfd3cba1004625a1138368762355a47e206479ad48dd9d3eca1d"},"source":{"id":"2603.24649","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T00:04:26.523916Z","id":"b17730ae-9588-4dc6-b9f7-4e6442e750b8","model_set":{"reader":"grok-4.3"},"one_line_summary":"MedFlowBench evaluates VLM agents on full radiology and pathology studies by requiring both task answers and verifiable evidence like key slices and regions of interest, revealing that answer-only scores overestimate performance.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Medical agents must navigate full imaging studies and submit auditable evidence, where performance drops sharply compared to answer-only evaluation.","strongest_claim":"Across evaluated models, final answer-only scoring gives an overly optimistic picture: when answers must also be supported by correct evidence, performance drops substantially on complex workflows.","weakest_assumption":"That the automated evidence checking against withheld masks and annotations accurately captures the requirements of real clinical auditing and that the controlled runtime faithfully represents the complexity of operating actual medical imaging software."}},"verdict_id":"b17730ae-9588-4dc6-b9f7-4e6442e750b8"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:ac5e576b016831de97c2c72616a8fd17b1f72183adad1eb30ec66e2471808eac","target":"record","created_at":"2026-05-18T02:44:30Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"b982ecb2d05155cecaba5c4f6dbf5b75c53ba230e24fe2c3c536f6699d47585c","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-25T17:33:58Z","title_canon_sha256":"fec6e1ea2e5e1619efb18a52998f2632d6a64d01d7191ace85214bbaa8a0c28c"},"schema_version":"1.0","source":{"id":"2603.24649","kind":"arxiv","version":2}},"canonical_sha256":"86152fee0feb3f430e28daf7bf6196cb7a0c1d8a97479e649f107f6ed32915ce","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"86152fee0feb3f430e28daf7bf6196cb7a0c1d8a97479e649f107f6ed32915ce","first_computed_at":"2026-05-18T02:44:30.672619Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:44:30.672619Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"JIf1NVKu+u5Uu3xHFR8asZQ2MMsPJSc5R0kdmzioBAp5vwbeblgFm7LBKXpZz1RIf1qR24xfJyZiQC8ZwNDTBA==","signature_status":"signed_v1","signed_at":"2026-05-18T02:44:30.673104Z","signed_message":"canonical_sha256_bytes"},"source_id":"2603.24649","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:ac5e576b016831de97c2c72616a8fd17b1f72183adad1eb30ec66e2471808eac","sha256:f6374b61e28eeb722b1639401bb311dbe8310fbe88b4509e336f0faa4be83c40"],"state_sha256":"30caec8d75aa2fb354eca8ea0fcce2b7037f157580eb6f0c0580192f77e829c0"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"lgR2DhK8rwh2YaiUfpO/bbw6SxaGRY74tra0Vg/FHwi9uBcDVR9i4OsT2AQZy9oXxVcb6hsGFyWJYTBkbTyMBQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-03T10:06:13.449690Z","bundle_sha256":"f7994f3cfc2d46d4b8f83189893a84e598f020d8f347d6456f75caf978e938e9"}}