{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:CJLLDCTGH6ZKA46QJKHD6U34WH","short_pith_number":"pith:CJLLDCTG","schema_version":"1.0","canonical_sha256":"1256b18a663fb2a073d04a8e3f537cb1f810feba2498c99f7c0440e2abf08cdd","source":{"kind":"arxiv","id":"2605.23629","version":1},"attestation_state":"computed","paper":{"title":"DDX-TRACE: A Benchmark for Medical Diagnostic Trajectories in VLMs","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Benedikt Wiestler, Daniel Rueckert, Felix Bitzer, Jiancheng Yang, Jiazhen Pan, Julian Canisius, Jun Li, Paula Ro{\\ss}m\\\"uller, Virginie Kreutzinger, Weixiang Shen","submitted_at":"2026-05-22T13:41:10Z","abstract_excerpt":"Medical diagnosis is not a single prediction from a fully specified vignette. It is a sequential workup: clinicians decide what evidence to obtain, revise a differential diagnosis, and stop when the diagnosis is sufficiently supported. Most medical AI benchmarks instead reveal the relevant context upfront and score only the final answer, making unsupported correct guesses, premature closure, inefficient workups, and poor uncertainty updating invisible. We introduce DDX-TRACE, a physician-adjudicated benchmark for multimodal neuroradiology that evaluates diagnostic trajectories under hidden evi"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.23629","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-22T13:41:10Z","cross_cats_sorted":[],"title_canon_sha256":"6f6405986432e0ecf97aa36211603da48866c89d6bdccf3dae9fc43bca9fdc54","abstract_canon_sha256":"32844bef8f94aafae56631740af8bf775e2e911af02bb06d5a01065736c98c98"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-25T02:02:22.845760Z","signature_b64":"4DSA+TDx0ub9hQCdzm/p0cERfr1/NC7UjZWeJ2nGPBUDvXhZHtXqfoUya4Px26nfLT1SgEbYPpdujuoKeq1YBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"1256b18a663fb2a073d04a8e3f537cb1f810feba2498c99f7c0440e2abf08cdd","last_reissued_at":"2026-05-25T02:02:22.844975Z","signature_status":"signed_v1","first_computed_at":"2026-05-25T02:02:22.844975Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"DDX-TRACE: A Benchmark for Medical Diagnostic Trajectories in VLMs","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Benedikt Wiestler, Daniel Rueckert, Felix Bitzer, Jiancheng Yang, Jiazhen Pan, Julian Canisius, Jun Li, Paula Ro{\\ss}m\\\"uller, Virginie Kreutzinger, Weixiang Shen","submitted_at":"2026-05-22T13:41:10Z","abstract_excerpt":"Medical diagnosis is not a single prediction from a fully specified vignette. It is a sequential workup: clinicians decide what evidence to obtain, revise a differential diagnosis, and stop when the diagnosis is sufficiently supported. Most medical AI benchmarks instead reveal the relevant context upfront and score only the final answer, making unsupported correct guesses, premature closure, inefficient workups, and poor uncertainty updating invisible. We introduce DDX-TRACE, a physician-adjudicated benchmark for multimodal neuroradiology that evaluates diagnostic trajectories under hidden evi"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.23629","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.23629/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.23629","created_at":"2026-05-25T02:02:22.845102+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.23629v1","created_at":"2026-05-25T02:02:22.845102+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.23629","created_at":"2026-05-25T02:02:22.845102+00:00"},{"alias_kind":"pith_short_12","alias_value":"CJLLDCTGH6ZK","created_at":"2026-05-25T02:02:22.845102+00:00"},{"alias_kind":"pith_short_16","alias_value":"CJLLDCTGH6ZKA46Q","created_at":"2026-05-25T02:02:22.845102+00:00"},{"alias_kind":"pith_short_8","alias_value":"CJLLDCTG","created_at":"2026-05-25T02:02:22.845102+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/CJLLDCTGH6ZKA46QJKHD6U34WH","json":"https://pith.science/pith/CJLLDCTGH6ZKA46QJKHD6U34WH.json","graph_json":"https://pith.science/api/pith-number/CJLLDCTGH6ZKA46QJKHD6U34WH/graph.json","events_json":"https://pith.science/api/pith-number/CJLLDCTGH6ZKA46QJKHD6U34WH/events.json","paper":"https://pith.science/paper/CJLLDCTG"},"agent_actions":{"view_html":"https://pith.science/pith/CJLLDCTGH6ZKA46QJKHD6U34WH","download_json":"https://pith.science/pith/CJLLDCTGH6ZKA46QJKHD6U34WH.json","view_paper":"https://pith.science/paper/CJLLDCTG","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.23629&json=true","fetch_graph":"https://pith.science/api/pith-number/CJLLDCTGH6ZKA46QJKHD6U34WH/graph.json","fetch_events":"https://pith.science/api/pith-number/CJLLDCTGH6ZKA46QJKHD6U34WH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/CJLLDCTGH6ZKA46QJKHD6U34WH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/CJLLDCTGH6ZKA46QJKHD6U34WH/action/storage_attestation","attest_author":"https://pith.science/pith/CJLLDCTGH6ZKA46QJKHD6U34WH/action/author_attestation","sign_citation":"https://pith.science/pith/CJLLDCTGH6ZKA46QJKHD6U34WH/action/citation_signature","submit_replication":"https://pith.science/pith/CJLLDCTGH6ZKA46QJKHD6U34WH/action/replication_record"}},"created_at":"2026-05-25T02:02:22.845102+00:00","updated_at":"2026-05-25T02:02:22.845102+00:00"}