{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:UPQH2SVHWS72PX3SAOIY7JORZ5","short_pith_number":"pith:UPQH2SVH","schema_version":"1.0","canonical_sha256":"a3e07d4aa7b4bfa7df7203918fa5d1cf70cfa1a941dd8f7f58989df9325f9beb","source":{"kind":"arxiv","id":"2601.06676","version":2},"attestation_state":"computed","paper":{"title":"One Interaction Is Worth a Thousand Guesses: Benchmarking the Interactive Capabilities of Deep Research Agents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.HC"],"primary_cat":"cs.CL","authors_text":"Anthony K. H. Tung, Jun Yu, Qiang Huang, Wei Chen, Xiaoya Xie, Yingchaojie Feng, Zhaorui Yang","submitted_at":"2026-01-10T20:29:12Z","abstract_excerpt":"Deep research agents powered by Large Language Models (LLMs) can perform multi-step reasoning, web exploration, and long-form report generation. However, existing systems remain largely autonomous, assuming fully specified user intent and evaluating only final outputs. In practice, research goals are often underspecified and evolve during exploration, yet current benchmarks neither model dynamic user feedback nor measure interaction costs. To address this gap, we introduce IDRBench, the first Interactive Deep Research Benchmark for systematically evaluating the interactive capabilities of deep"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2601.06676","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-10T20:29:12Z","cross_cats_sorted":["cs.AI","cs.HC"],"title_canon_sha256":"c885979338c39f00185fe0433cb8f6fa6a2f58ac68939a0c6ee62b2fe58be573","abstract_canon_sha256":"436580e76e1f0d86a483802ee05c2149abebab5205adf0220be6e444f49e0063"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T02:12:45.840920Z","signature_b64":"GRYHj6z/Z2RZ5AyBCiWxPirubjpv64pYZNFVtnZp7HSHmU+Ef3yogNBvFrMMw7lTU4YBeqaTCRl3vu+TgkONCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a3e07d4aa7b4bfa7df7203918fa5d1cf70cfa1a941dd8f7f58989df9325f9beb","last_reissued_at":"2026-06-23T02:12:45.840436Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T02:12:45.840436Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"One Interaction Is Worth a Thousand Guesses: Benchmarking the Interactive Capabilities of Deep Research Agents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.HC"],"primary_cat":"cs.CL","authors_text":"Anthony K. H. Tung, Jun Yu, Qiang Huang, Wei Chen, Xiaoya Xie, Yingchaojie Feng, Zhaorui Yang","submitted_at":"2026-01-10T20:29:12Z","abstract_excerpt":"Deep research agents powered by Large Language Models (LLMs) can perform multi-step reasoning, web exploration, and long-form report generation. However, existing systems remain largely autonomous, assuming fully specified user intent and evaluating only final outputs. In practice, research goals are often underspecified and evolve during exploration, yet current benchmarks neither model dynamic user feedback nor measure interaction costs. To address this gap, we introduce IDRBench, the first Interactive Deep Research Benchmark for systematically evaluating the interactive capabilities of deep"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.06676","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.06676/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2601.06676","created_at":"2026-06-23T02:12:45.840498+00:00"},{"alias_kind":"arxiv_version","alias_value":"2601.06676v2","created_at":"2026-06-23T02:12:45.840498+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.06676","created_at":"2026-06-23T02:12:45.840498+00:00"},{"alias_kind":"pith_short_12","alias_value":"UPQH2SVHWS72","created_at":"2026-06-23T02:12:45.840498+00:00"},{"alias_kind":"pith_short_16","alias_value":"UPQH2SVHWS72PX3S","created_at":"2026-06-23T02:12:45.840498+00:00"},{"alias_kind":"pith_short_8","alias_value":"UPQH2SVH","created_at":"2026-06-23T02:12:45.840498+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.18661","citing_title":"AI for Auto-Research: Roadmap & User Guide","ref_index":40,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/UPQH2SVHWS72PX3SAOIY7JORZ5","json":"https://pith.science/pith/UPQH2SVHWS72PX3SAOIY7JORZ5.json","graph_json":"https://pith.science/api/pith-number/UPQH2SVHWS72PX3SAOIY7JORZ5/graph.json","events_json":"https://pith.science/api/pith-number/UPQH2SVHWS72PX3SAOIY7JORZ5/events.json","paper":"https://pith.science/paper/UPQH2SVH"},"agent_actions":{"view_html":"https://pith.science/pith/UPQH2SVHWS72PX3SAOIY7JORZ5","download_json":"https://pith.science/pith/UPQH2SVHWS72PX3SAOIY7JORZ5.json","view_paper":"https://pith.science/paper/UPQH2SVH","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2601.06676&json=true","fetch_graph":"https://pith.science/api/pith-number/UPQH2SVHWS72PX3SAOIY7JORZ5/graph.json","fetch_events":"https://pith.science/api/pith-number/UPQH2SVHWS72PX3SAOIY7JORZ5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/UPQH2SVHWS72PX3SAOIY7JORZ5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/UPQH2SVHWS72PX3SAOIY7JORZ5/action/storage_attestation","attest_author":"https://pith.science/pith/UPQH2SVHWS72PX3SAOIY7JORZ5/action/author_attestation","sign_citation":"https://pith.science/pith/UPQH2SVHWS72PX3SAOIY7JORZ5/action/citation_signature","submit_replication":"https://pith.science/pith/UPQH2SVHWS72PX3SAOIY7JORZ5/action/replication_record"}},"created_at":"2026-06-23T02:12:45.840498+00:00","updated_at":"2026-06-23T02:12:45.840498+00:00"}