{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:FO7JATZRJTFKMYD4DF5YMZPUCL","short_pith_number":"pith:FO7JATZR","schema_version":"1.0","canonical_sha256":"2bbe904f314ccaa6607c197b8665f412e7bdb88c9cae630afdcf89d7b50104f1","source":{"kind":"arxiv","id":"2606.14397","version":2},"attestation_state":"computed","paper":{"title":"Running the Gauntlet: Re-evaluating the Capabilities of Agents Beyond Familiar Environments","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Adam Mahdi, Adel Bibi, Arkadiusz Drohomirecki, Baoyuan Wu, Chris Russell, Christopher Summerfield, Damian Rynczak, Fazl Barez, Grzegorz Biziel, Guohao Li, Hanna Yershova, Kai Rawal, Kumail Alhamoud, Michal Zakrzewski, Mykola Vysotskyi, Philip Torr, Runqi Lin, Sebastian Montagna, Shreyansh Padarha, Taras Rumezhak, Volodymyr Karpiv, William Lugoloobi, Xander Davies, Yarin Gal, Zihao Fu","submitted_at":"2026-06-12T12:32:24Z","abstract_excerpt":"As agentic systems continue to evolve and are widely deployed in real-world scenarios, there is a growing demand to faithfully evaluate their capabilities. However, current benchmarks are typically built on popular applications with relatively simple tasks and focus on a narrow set of capabilities while overlooking broader dimensions, resulting in saturated performance on modern agents and failing to probe their limitations. To this end, we introduce GauntletBench, a web-based benchmark for evaluating agent generalisation in challenging scenarios, focusing on three underexplored capabilities ("},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.14397","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-12T12:32:24Z","cross_cats_sorted":[],"title_canon_sha256":"56cb9ed1841154eb10f5f6770d450227212f5a2bf148479fd89a998e926d1ff4","abstract_canon_sha256":"b6dc9c773d678ffc9abf9bf8d9c04a2b5860fd03e2d9cb092d2968e58ab1b588"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-26T01:15:54.463858Z","signature_b64":"UAogOmJnRJIxfX/dC8AT7u9jNi7bbnK1Qtu2ISY6CHvIgaTINzUBbliAq7C+UMhaCRXUWaVdm3I78oP2iGs8DA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2bbe904f314ccaa6607c197b8665f412e7bdb88c9cae630afdcf89d7b50104f1","last_reissued_at":"2026-06-26T01:15:54.463363Z","signature_status":"signed_v1","first_computed_at":"2026-06-26T01:15:54.463363Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Running the Gauntlet: Re-evaluating the Capabilities of Agents Beyond Familiar Environments","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Adam Mahdi, Adel Bibi, Arkadiusz Drohomirecki, Baoyuan Wu, Chris Russell, Christopher Summerfield, Damian Rynczak, Fazl Barez, Grzegorz Biziel, Guohao Li, Hanna Yershova, Kai Rawal, Kumail Alhamoud, Michal Zakrzewski, Mykola Vysotskyi, Philip Torr, Runqi Lin, Sebastian Montagna, Shreyansh Padarha, Taras Rumezhak, Volodymyr Karpiv, William Lugoloobi, Xander Davies, Yarin Gal, Zihao Fu","submitted_at":"2026-06-12T12:32:24Z","abstract_excerpt":"As agentic systems continue to evolve and are widely deployed in real-world scenarios, there is a growing demand to faithfully evaluate their capabilities. However, current benchmarks are typically built on popular applications with relatively simple tasks and focus on a narrow set of capabilities while overlooking broader dimensions, resulting in saturated performance on modern agents and failing to probe their limitations. To this end, we introduce GauntletBench, a web-based benchmark for evaluating agent generalisation in challenging scenarios, focusing on three underexplored capabilities ("},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.14397","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.14397/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.14397","created_at":"2026-06-26T01:15:54.463417+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.14397v2","created_at":"2026-06-26T01:15:54.463417+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.14397","created_at":"2026-06-26T01:15:54.463417+00:00"},{"alias_kind":"pith_short_12","alias_value":"FO7JATZRJTFK","created_at":"2026-06-26T01:15:54.463417+00:00"},{"alias_kind":"pith_short_16","alias_value":"FO7JATZRJTFKMYD4","created_at":"2026-06-26T01:15:54.463417+00:00"},{"alias_kind":"pith_short_8","alias_value":"FO7JATZR","created_at":"2026-06-26T01:15:54.463417+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/FO7JATZRJTFKMYD4DF5YMZPUCL","json":"https://pith.science/pith/FO7JATZRJTFKMYD4DF5YMZPUCL.json","graph_json":"https://pith.science/api/pith-number/FO7JATZRJTFKMYD4DF5YMZPUCL/graph.json","events_json":"https://pith.science/api/pith-number/FO7JATZRJTFKMYD4DF5YMZPUCL/events.json","paper":"https://pith.science/paper/FO7JATZR"},"agent_actions":{"view_html":"https://pith.science/pith/FO7JATZRJTFKMYD4DF5YMZPUCL","download_json":"https://pith.science/pith/FO7JATZRJTFKMYD4DF5YMZPUCL.json","view_paper":"https://pith.science/paper/FO7JATZR","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.14397&json=true","fetch_graph":"https://pith.science/api/pith-number/FO7JATZRJTFKMYD4DF5YMZPUCL/graph.json","fetch_events":"https://pith.science/api/pith-number/FO7JATZRJTFKMYD4DF5YMZPUCL/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/FO7JATZRJTFKMYD4DF5YMZPUCL/action/timestamp_anchor","attest_storage":"https://pith.science/pith/FO7JATZRJTFKMYD4DF5YMZPUCL/action/storage_attestation","attest_author":"https://pith.science/pith/FO7JATZRJTFKMYD4DF5YMZPUCL/action/author_attestation","sign_citation":"https://pith.science/pith/FO7JATZRJTFKMYD4DF5YMZPUCL/action/citation_signature","submit_replication":"https://pith.science/pith/FO7JATZRJTFKMYD4DF5YMZPUCL/action/replication_record"}},"created_at":"2026-06-26T01:15:54.463417+00:00","updated_at":"2026-06-26T01:15:54.463417+00:00"}