{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:73VCVUBWDUMDDM2U3NAJBAPBZH","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"e09a6c3573b14f6c0d91cfe480768a67a2bdca92801dd59e864ee5f6d25e3e99","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-18T11:57:09Z","title_canon_sha256":"a2da680b0b94035cc84356880c0da6cb74af656d8733a4535c8129caebe0bf5b"},"schema_version":"1.0","source":{"id":"2601.12369","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.12369","created_at":"2026-05-20T01:05:06Z"},{"alias_kind":"arxiv_version","alias_value":"2601.12369v4","created_at":"2026-05-20T01:05:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.12369","created_at":"2026-05-20T01:05:06Z"},{"alias_kind":"pith_short_12","alias_value":"73VCVUBWDUMD","created_at":"2026-05-20T01:05:06Z"},{"alias_kind":"pith_short_16","alias_value":"73VCVUBWDUMDDM2U","created_at":"2026-05-20T01:05:06Z"},{"alias_kind":"pith_short_8","alias_value":"73VCVUBW","created_at":"2026-05-20T01:05:06Z"}],"graph_snapshots":[{"event_id":"sha256:4a478ade63f02efd0ba6c3cb4e93e383e6e6b314a56bb916f7b0e0d2691d3e54","target":"graph","created_at":"2026-05-20T01:05:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Evaluating 7 Deep Research Agents and 12 frontier LLMs reveals a dual bottleneck: capability-side, the best agent retrieves only 20.92% of expert-cited papers, and 1,000 model taxonomies show 75.9% sibling overlap, 51.2% MECE violations, and 83.4% structural imbalance, all detectable without any reference; alignment-side, all 12 LLMs converge to Sem-Path 28--29%, well below 47--58% achieved by three independent human-annotator groups on the same paper sets."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"Expert-authored taxonomies constitute an appropriate and stable gold standard against which model outputs can be meaningfully compared, and the newly introduced metrics (US-TED, US-NTED, Sem-Path) validly quantify synthesis quality independent of any single reference taxonomy."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"TaxoBench shows deep research agents retrieve 20.92% of expert-cited papers and produce taxonomies with 75.9% sibling overlap, 51.2% MECE violations, and 83.4% imbalance, while LLMs reach only 28-29% semantic path similarity versus 47-58% for human groups."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Deep research agents retrieve only 21 percent of expert-cited papers and organize taxonomies far below human alignment levels."}],"snapshot_sha256":"6f41914ad99d6bc6ddfe8c2dd1f05a9e782bbab4c3321dc1c80ae3cc433b6320"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"0f01c25fe3e4a38403ff73590bb593dde9acf9f4b3dd496ef1337edf93901504"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2601.12369/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Deep Research Agents increasingly automate survey generation, yet whether they match human experts at retrieving essential papers and organizing them into expert-like taxonomies remains unclear. Existing benchmarks emphasize writing quality or citation correctness, while standard clustering metrics ignore hierarchical structure. We introduce TaxoBench, a benchmark of 72 highly cited LLM surveys with expert-authored taxonomy trees and 3,815 papers mapped to paper categories. TaxoBench evaluates (1) retrieval via Recall/Precision/F1, and (2) organization at a leaf level (paper-to-category assign","authors_text":"Jiabao Zhuang, Jiahui Lin, Jingyi Deng, Kexin Tan, Long Ma, Maxm Pan, Mingqi Wu, Ming Zhang, Ning Luo, Qiyuan Peng, Qi Zhang, Renzhe Zheng, Shihan Dou, Tao Gui, Wenqing Jing, Xuanjing Huang, Yuhang Zhao, Yuhui Wang, Yujiong Shen, Zhenghao Xiang, Ziyu Kong","cross_cats":[],"headline":"Deep research agents retrieve only 21 percent of expert-cited papers and organize taxonomies far below human alignment levels.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-18T11:57:09Z","title":"Can Deep Research Agents Retrieve and Organize? Evaluating the Synthesis Gap with Expert Taxonomies"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.12369","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-16T13:28:44.482992Z","id":"f365c0ef-6b16-4618-b18d-7d1856e78a28","model_set":{"reader":"grok-4.3"},"one_line_summary":"TaxoBench shows deep research agents retrieve 20.92% of expert-cited papers and produce taxonomies with 75.9% sibling overlap, 51.2% MECE violations, and 83.4% imbalance, while LLMs reach only 28-29% semantic path similarity versus 47-58% for human groups.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Deep research agents retrieve only 21 percent of expert-cited papers and organize taxonomies far below human alignment levels.","strongest_claim":"Evaluating 7 Deep Research Agents and 12 frontier LLMs reveals a dual bottleneck: capability-side, the best agent retrieves only 20.92% of expert-cited papers, and 1,000 model taxonomies show 75.9% sibling overlap, 51.2% MECE violations, and 83.4% structural imbalance, all detectable without any reference; alignment-side, all 12 LLMs converge to Sem-Path 28--29%, well below 47--58% achieved by three independent human-annotator groups on the same paper sets.","weakest_assumption":"Expert-authored taxonomies constitute an appropriate and stable gold standard against which model outputs can be meaningfully compared, and the newly introduced metrics (US-TED, US-NTED, Sem-Path) validly quantify synthesis quality independent of any single reference taxonomy."}},"verdict_id":"f365c0ef-6b16-4618-b18d-7d1856e78a28"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:1b5c1ae8dac9e57082ae86ee6dd2db4e584994975b76894ab688abf032d15297","target":"record","created_at":"2026-05-20T01:05:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"e09a6c3573b14f6c0d91cfe480768a67a2bdca92801dd59e864ee5f6d25e3e99","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-18T11:57:09Z","title_canon_sha256":"a2da680b0b94035cc84356880c0da6cb74af656d8733a4535c8129caebe0bf5b"},"schema_version":"1.0","source":{"id":"2601.12369","kind":"arxiv","version":4}},"canonical_sha256":"feea2ad0361d1831b354db409081e1c9cd2b1e64caaca074ea42f148c04cf851","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"feea2ad0361d1831b354db409081e1c9cd2b1e64caaca074ea42f148c04cf851","first_computed_at":"2026-05-20T01:05:06.339939Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T01:05:06.339939Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"R3mRbxjfckYonczXN9GBI/4IdTJm9efrJtuqpZJN8pDd980lb7U6S/ueOh5z6wzZ5pE5ALLfqE2yQx+sIwZ8Bg==","signature_status":"signed_v1","signed_at":"2026-05-20T01:05:06.340842Z","signed_message":"canonical_sha256_bytes"},"source_id":"2601.12369","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:1b5c1ae8dac9e57082ae86ee6dd2db4e584994975b76894ab688abf032d15297","sha256:4a478ade63f02efd0ba6c3cb4e93e383e6e6b314a56bb916f7b0e0d2691d3e54"],"state_sha256":"466e70b76ee2b690e24cf87fa06190637f76f7bbb0887688458aab240b60af0c"}