{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:366XJWZLCBUHTHUFWJHINIHT5Z","short_pith_number":"pith:366XJWZL","schema_version":"1.0","canonical_sha256":"dfbd74db2b1068799e85b24e86a0f3ee781c9b3d8f0481bd796860b90f145920","source":{"kind":"arxiv","id":"2606.29537","version":1},"attestation_state":"computed","paper":{"title":"OSWorld2.0: Benchmarking Computer Use Agents on Long-Horizon Real-World Tasks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Alex Su, Bowen Wang, Boyuan Zheng, Cheng Chen, Dayiheng Liu, Dunjie Lu, Frederic Sala, Haikong Lu, Haoyuan Wu, Hao Zou, Jiamin Song, Jiaqi Deng, Jiayang Sun, Junyang Lin, Kaiqian Cui, Manpreet Kaur, Mengqi Yuan, Peng Qi, Qi Zhen, Saaket Agashe, Siva Reddy, Tao Yu, Tianbao Xie, Vincent Sunn Chen, Weiming Wu, Xiao Yu, Xin Eric Wang, Xing Han Lu, Xinyuan Wang, Xinzhuang Xiong, Yitong Li, Yuhao Yang, Yu Su, Zhengyang Qi, Zhou Yu, Zilong Zhou","submitted_at":"2026-06-28T17:59:17Z","abstract_excerpt":"Existing computer-use benchmarks fail to capture the realism, complexity, and long-horizon demands of real-world computer use, limiting their ability to reveal the limitations of frontier agents. We introduce OSWorld 2.0, a benchmark of 108 long-horizon computer-use workflows across everyday and professional tasks, designed to capture complex and challenging real-world phenomena. Each task represents a realistic end-to-end workflow that takes human users a median of about 1.6 hours to complete and requires an average of 318 tool calls with Claude Opus 4.7 using maximum thinking, compared with "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.29537","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-06-28T17:59:17Z","cross_cats_sorted":[],"title_canon_sha256":"64ee39f18891dda70a1c2616919325f386c1bdcbec15cb4e328a821d53a317b5","abstract_canon_sha256":"2fc77f2e5f8279b38eef24ba449811155bd8b55035e794ed1da558ae5aec16c6"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T01:18:11.015895Z","signature_b64":"Fn4aGlLh/5TQrodcincYICiVrMW64kETZ4CUy9yiuv3HyIgCXU0nKd4Tc3WZKpXupHh8H//ugzuPpSyse9dJAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"dfbd74db2b1068799e85b24e86a0f3ee781c9b3d8f0481bd796860b90f145920","last_reissued_at":"2026-06-30T01:18:11.015300Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T01:18:11.015300Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"OSWorld2.0: Benchmarking Computer Use Agents on Long-Horizon Real-World Tasks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Alex Su, Bowen Wang, Boyuan Zheng, Cheng Chen, Dayiheng Liu, Dunjie Lu, Frederic Sala, Haikong Lu, Haoyuan Wu, Hao Zou, Jiamin Song, Jiaqi Deng, Jiayang Sun, Junyang Lin, Kaiqian Cui, Manpreet Kaur, Mengqi Yuan, Peng Qi, Qi Zhen, Saaket Agashe, Siva Reddy, Tao Yu, Tianbao Xie, Vincent Sunn Chen, Weiming Wu, Xiao Yu, Xin Eric Wang, Xing Han Lu, Xinyuan Wang, Xinzhuang Xiong, Yitong Li, Yuhao Yang, Yu Su, Zhengyang Qi, Zhou Yu, Zilong Zhou","submitted_at":"2026-06-28T17:59:17Z","abstract_excerpt":"Existing computer-use benchmarks fail to capture the realism, complexity, and long-horizon demands of real-world computer use, limiting their ability to reveal the limitations of frontier agents. We introduce OSWorld 2.0, a benchmark of 108 long-horizon computer-use workflows across everyday and professional tasks, designed to capture complex and challenging real-world phenomena. Each task represents a realistic end-to-end workflow that takes human users a median of about 1.6 hours to complete and requires an average of 318 tool calls with Claude Opus 4.7 using maximum thinking, compared with "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.29537","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.29537/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.29537","created_at":"2026-06-30T01:18:11.015386+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.29537v1","created_at":"2026-06-30T01:18:11.015386+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.29537","created_at":"2026-06-30T01:18:11.015386+00:00"},{"alias_kind":"pith_short_12","alias_value":"366XJWZLCBUH","created_at":"2026-06-30T01:18:11.015386+00:00"},{"alias_kind":"pith_short_16","alias_value":"366XJWZLCBUHTHUF","created_at":"2026-06-30T01:18:11.015386+00:00"},{"alias_kind":"pith_short_8","alias_value":"366XJWZL","created_at":"2026-06-30T01:18:11.015386+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/366XJWZLCBUHTHUFWJHINIHT5Z","json":"https://pith.science/pith/366XJWZLCBUHTHUFWJHINIHT5Z.json","graph_json":"https://pith.science/api/pith-number/366XJWZLCBUHTHUFWJHINIHT5Z/graph.json","events_json":"https://pith.science/api/pith-number/366XJWZLCBUHTHUFWJHINIHT5Z/events.json","paper":"https://pith.science/paper/366XJWZL"},"agent_actions":{"view_html":"https://pith.science/pith/366XJWZLCBUHTHUFWJHINIHT5Z","download_json":"https://pith.science/pith/366XJWZLCBUHTHUFWJHINIHT5Z.json","view_paper":"https://pith.science/paper/366XJWZL","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.29537&json=true","fetch_graph":"https://pith.science/api/pith-number/366XJWZLCBUHTHUFWJHINIHT5Z/graph.json","fetch_events":"https://pith.science/api/pith-number/366XJWZLCBUHTHUFWJHINIHT5Z/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/366XJWZLCBUHTHUFWJHINIHT5Z/action/timestamp_anchor","attest_storage":"https://pith.science/pith/366XJWZLCBUHTHUFWJHINIHT5Z/action/storage_attestation","attest_author":"https://pith.science/pith/366XJWZLCBUHTHUFWJHINIHT5Z/action/author_attestation","sign_citation":"https://pith.science/pith/366XJWZLCBUHTHUFWJHINIHT5Z/action/citation_signature","submit_replication":"https://pith.science/pith/366XJWZLCBUHTHUFWJHINIHT5Z/action/replication_record"}},"created_at":"2026-06-30T01:18:11.015386+00:00","updated_at":"2026-06-30T01:18:11.015386+00:00"}