{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:KLYE5RSM4VWGTG5O5F6NYV5H2T","short_pith_number":"pith:KLYE5RSM","canonical_record":{"source":{"id":"2605.14167","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T22:41:29Z","cross_cats_sorted":["cs.CY"],"title_canon_sha256":"fb6cd0e18708611d294f82d9479787dd97a031acc511c440a12a78bdf6b1abbe","abstract_canon_sha256":"4d43e96ec2bf5edffbda7ffeb2ba3bf8e09b76bf0aad52231b3e7632afae60f3"},"schema_version":"1.0"},"canonical_sha256":"52f04ec64ce56c699baee97cdc57a7d4cbb79ae60c0b8998ee3696fd1e4c1a1e","source":{"kind":"arxiv","id":"2605.14167","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14167","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14167v1","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14167","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"pith_short_12","alias_value":"KLYE5RSM4VWG","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"KLYE5RSM4VWGTG5O","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"KLYE5RSM","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:KLYE5RSM4VWGTG5O5F6NYV5H2T","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14167","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T22:41:29Z","cross_cats_sorted":["cs.CY"],"title_canon_sha256":"fb6cd0e18708611d294f82d9479787dd97a031acc511c440a12a78bdf6b1abbe","abstract_canon_sha256":"4d43e96ec2bf5edffbda7ffeb2ba3bf8e09b76bf0aad52231b3e7632afae60f3"},"schema_version":"1.0"},"canonical_sha256":"52f04ec64ce56c699baee97cdc57a7d4cbb79ae60c0b8998ee3696fd1e4c1a1e","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:11.399843Z","signature_b64":"A7PnbFWWicgwNb0lXs6dRvNRwpJncR5eR2edLZRQ3WbYfzfVJnmN+OrQSsRxpyIs6L5HWGziyuCWt44vhb7zCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"52f04ec64ce56c699baee97cdc57a7d4cbb79ae60c0b8998ee3696fd1e4c1a1e","last_reissued_at":"2026-05-17T23:39:11.398338Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:11.398338Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14167","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:11Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"G2cvk4XaMmV71m9V8+Eron2St4/8YB1I7njcUuIPa3HbDWpMHbSM5xUJwZW1X+L2hlHBl4i8cT07A9Svz+hwDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-28T17:58:58.688656Z"},"content_sha256":"b9f49d1c3979daaa80f03d0b3219ca3cc2d8c1f8118bda4e4765aaf98314003c","schema_version":"1.0","event_id":"sha256:b9f49d1c3979daaa80f03d0b3219ca3cc2d8c1f8118bda4e4765aaf98314003c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:KLYE5RSM4VWGTG5O5F6NYV5H2T","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"The Evaluation Trap: Benchmark Design as Theoretical Commitment","license":"http://creativecommons.org/licenses/by/4.0/","headline":"AI benchmarks embed unexamined theoretical assumptions that redefine capabilities to match what they can easily measure.","cross_cats":["cs.CY"],"primary_cat":"cs.AI","authors_text":"Theodore J Kalaitzidis","submitted_at":"2026-05-13T22:41:29Z","abstract_excerpt":"Every AI benchmark operationalizes theoretical assumptions about the capability it claims to assess. When assumptions function as unexamined commitments, benchmarks stabilize the dominant paradigm by narrowing what counts as progress. Over time, narrow evaluation reorganizes capability concepts: architectures and definitions are selected for benchmark legibility until evaluation ceases to track an independent object and instead produces a version of the target defined by its own operational assumptions. The result is a trap: evaluation frameworks treat self-reinforcing assessments as valid, bo"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Narrow evaluation reorganizes capability concepts: architectures and definitions are selected for benchmark legibility until evaluation ceases to track an independent object and instead produces a version of the target defined by its own operational assumptions.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That evaluation criteria can be derived directly from technical capability claims in a way that avoids introducing new unexamined assumptions of its own, allowing the audit to reliably discriminate claimed capabilities from proxy behaviors.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"AI benchmarks trap progress by operationalizing assumptions that redefine capabilities around the benchmarks themselves, and Epistematics provides an audit procedure to detect when evaluations cannot discriminate claimed capabilities from proxy behaviors.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"AI benchmarks embed unexamined theoretical assumptions that redefine capabilities to match what they can easily measure.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"32b9255068d00cb6c8ef90b9a557a4369ae6ebbb5c17053e2edff28bb9b28eec"},"source":{"id":"2605.14167","kind":"arxiv","version":1},"verdict":{"id":"d782276e-5709-46bb-8a80-2c32631c3d36","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T04:47:58.038806Z","strongest_claim":"Narrow evaluation reorganizes capability concepts: architectures and definitions are selected for benchmark legibility until evaluation ceases to track an independent object and instead produces a version of the target defined by its own operational assumptions.","one_line_summary":"AI benchmarks trap progress by operationalizing assumptions that redefine capabilities around the benchmarks themselves, and Epistematics provides an audit procedure to detect when evaluations cannot discriminate claimed capabilities from proxy behaviors.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That evaluation criteria can be derived directly from technical capability claims in a way that avoids introducing new unexamined assumptions of its own, allowing the audit to reliably discriminate claimed capabilities from proxy behaviors.","pith_extraction_headline":"AI benchmarks embed unexamined theoretical assumptions that redefine capabilities to match what they can easily measure."},"references":{"count":28,"sample":[{"doi":"","year":null,"title":"Agre, Philip E. , title =","work_id":"aa0c43e3-1fae-4e60-af18-84bef1a7b3a9","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"and Star, Susan Leigh , title =","work_id":"76b62c49-8ca1-46e9-b157-b362ec6484fc","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Cartwright, Nancy , title =","work_id":"97f60656-485f-4a7d-8fbb-4b553fe7c394","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1911,"title":"On the Measure of Intelligence","work_id":"d8980a59-aa48-447b-8852-b7aca2b41b2c","ref_index":4,"cited_arxiv_id":"1911.01547","is_internal_anchor":true},{"doi":"","year":null,"title":"Cognition , volume =","work_id":"6d5b6f1a-69d4-48e5-923e-d0c08d64c2fa","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":28,"snapshot_sha256":"d8b76a68c3b43a8efe93fa301cf8edafb74a7df00c13c26a02ef7620abb942dd","internal_anchors":1},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"d782276e-5709-46bb-8a80-2c32631c3d36"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:11Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"tMOSmYLDEJnX2kEQj39fE38ajOigRBs5CfcRkpt26mIr3XJ6UWxdeQmx5WWrvPJsecWGIdRfXbV6P0H7lzvOAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-28T17:58:58.689153Z"},"content_sha256":"f730737929748877ce941f4bc27a7bfe06ef3d24ecded798acd6d43647ac1c19","schema_version":"1.0","event_id":"sha256:f730737929748877ce941f4bc27a7bfe06ef3d24ecded798acd6d43647ac1c19"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/KLYE5RSM4VWGTG5O5F6NYV5H2T/bundle.json","state_url":"https://pith.science/pith/KLYE5RSM4VWGTG5O5F6NYV5H2T/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/KLYE5RSM4VWGTG5O5F6NYV5H2T/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-28T17:58:58Z","links":{"resolver":"https://pith.science/pith/KLYE5RSM4VWGTG5O5F6NYV5H2T","bundle":"https://pith.science/pith/KLYE5RSM4VWGTG5O5F6NYV5H2T/bundle.json","state":"https://pith.science/pith/KLYE5RSM4VWGTG5O5F6NYV5H2T/state.json","well_known_bundle":"https://pith.science/.well-known/pith/KLYE5RSM4VWGTG5O5F6NYV5H2T/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:KLYE5RSM4VWGTG5O5F6NYV5H2T","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"4d43e96ec2bf5edffbda7ffeb2ba3bf8e09b76bf0aad52231b3e7632afae60f3","cross_cats_sorted":["cs.CY"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T22:41:29Z","title_canon_sha256":"fb6cd0e18708611d294f82d9479787dd97a031acc511c440a12a78bdf6b1abbe"},"schema_version":"1.0","source":{"id":"2605.14167","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14167","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14167v1","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14167","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"pith_short_12","alias_value":"KLYE5RSM4VWG","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"KLYE5RSM4VWGTG5O","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"KLYE5RSM","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:f730737929748877ce941f4bc27a7bfe06ef3d24ecded798acd6d43647ac1c19","target":"graph","created_at":"2026-05-17T23:39:11Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Narrow evaluation reorganizes capability concepts: architectures and definitions are selected for benchmark legibility until evaluation ceases to track an independent object and instead produces a version of the target defined by its own operational assumptions."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That evaluation criteria can be derived directly from technical capability claims in a way that avoids introducing new unexamined assumptions of its own, allowing the audit to reliably discriminate claimed capabilities from proxy behaviors."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"AI benchmarks trap progress by operationalizing assumptions that redefine capabilities around the benchmarks themselves, and Epistematics provides an audit procedure to detect when evaluations cannot discriminate claimed capabilities from proxy behaviors."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"AI benchmarks embed unexamined theoretical assumptions that redefine capabilities to match what they can easily measure."}],"snapshot_sha256":"32b9255068d00cb6c8ef90b9a557a4369ae6ebbb5c17053e2edff28bb9b28eec"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Every AI benchmark operationalizes theoretical assumptions about the capability it claims to assess. When assumptions function as unexamined commitments, benchmarks stabilize the dominant paradigm by narrowing what counts as progress. Over time, narrow evaluation reorganizes capability concepts: architectures and definitions are selected for benchmark legibility until evaluation ceases to track an independent object and instead produces a version of the target defined by its own operational assumptions. The result is a trap: evaluation frameworks treat self-reinforcing assessments as valid, bo","authors_text":"Theodore J Kalaitzidis","cross_cats":["cs.CY"],"headline":"AI benchmarks embed unexamined theoretical assumptions that redefine capabilities to match what they can easily measure.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T22:41:29Z","title":"The Evaluation Trap: Benchmark Design as Theoretical Commitment"},"references":{"count":28,"internal_anchors":1,"resolved_work":28,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Agre, Philip E. , title =","work_id":"aa0c43e3-1fae-4e60-af18-84bef1a7b3a9","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"and Star, Susan Leigh , title =","work_id":"76b62c49-8ca1-46e9-b157-b362ec6484fc","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Cartwright, Nancy , title =","work_id":"97f60656-485f-4a7d-8fbb-4b553fe7c394","year":null},{"cited_arxiv_id":"1911.01547","doi":"","is_internal_anchor":true,"ref_index":4,"title":"On the Measure of Intelligence","work_id":"d8980a59-aa48-447b-8852-b7aca2b41b2c","year":1911},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Cognition , volume =","work_id":"6d5b6f1a-69d4-48e5-923e-d0c08d64c2fa","year":null}],"snapshot_sha256":"d8b76a68c3b43a8efe93fa301cf8edafb74a7df00c13c26a02ef7620abb942dd"},"source":{"id":"2605.14167","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T04:47:58.038806Z","id":"d782276e-5709-46bb-8a80-2c32631c3d36","model_set":{"reader":"grok-4.3"},"one_line_summary":"AI benchmarks trap progress by operationalizing assumptions that redefine capabilities around the benchmarks themselves, and Epistematics provides an audit procedure to detect when evaluations cannot discriminate claimed capabilities from proxy behaviors.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"AI benchmarks embed unexamined theoretical assumptions that redefine capabilities to match what they can easily measure.","strongest_claim":"Narrow evaluation reorganizes capability concepts: architectures and definitions are selected for benchmark legibility until evaluation ceases to track an independent object and instead produces a version of the target defined by its own operational assumptions.","weakest_assumption":"That evaluation criteria can be derived directly from technical capability claims in a way that avoids introducing new unexamined assumptions of its own, allowing the audit to reliably discriminate claimed capabilities from proxy behaviors."}},"verdict_id":"d782276e-5709-46bb-8a80-2c32631c3d36"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:b9f49d1c3979daaa80f03d0b3219ca3cc2d8c1f8118bda4e4765aaf98314003c","target":"record","created_at":"2026-05-17T23:39:11Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"4d43e96ec2bf5edffbda7ffeb2ba3bf8e09b76bf0aad52231b3e7632afae60f3","cross_cats_sorted":["cs.CY"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T22:41:29Z","title_canon_sha256":"fb6cd0e18708611d294f82d9479787dd97a031acc511c440a12a78bdf6b1abbe"},"schema_version":"1.0","source":{"id":"2605.14167","kind":"arxiv","version":1}},"canonical_sha256":"52f04ec64ce56c699baee97cdc57a7d4cbb79ae60c0b8998ee3696fd1e4c1a1e","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"52f04ec64ce56c699baee97cdc57a7d4cbb79ae60c0b8998ee3696fd1e4c1a1e","first_computed_at":"2026-05-17T23:39:11.398338Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:11.398338Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"A7PnbFWWicgwNb0lXs6dRvNRwpJncR5eR2edLZRQ3WbYfzfVJnmN+OrQSsRxpyIs6L5HWGziyuCWt44vhb7zCg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:11.399843Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14167","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:b9f49d1c3979daaa80f03d0b3219ca3cc2d8c1f8118bda4e4765aaf98314003c","sha256:f730737929748877ce941f4bc27a7bfe06ef3d24ecded798acd6d43647ac1c19"],"state_sha256":"ca380b55f3531185440e7f1d91710ab578c464bab0cefb67bafa82a54ce5b6be"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"AjIHlCQCWF+BkyKeNAsYlmpDZS0Yj/ZO5xjUutRp5E3rmbHdR6AcFTW61HAmx0uVSybGKTfB5U7g7CV8ltYABA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-28T17:58:58.691429Z","bundle_sha256":"733671d0083860f6c14f454b7989f2d9062bf7b019bf72d36d6ce268bf385e15"}}