{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:23LIPQ3BTXVB3JPQKK4A4YYOA6","short_pith_number":"pith:23LIPQ3B","canonical_record":{"source":{"id":"2602.02898","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-02-02T23:11:09Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"7d77d6ce1829ade14ca7c7785c86af58e432282f40b3bc8a131a4d272db84adb","abstract_canon_sha256":"4f0b3fa9c1d64d5131697e6d3c75b8895e5e75e463a805a08310ca2fe5563334"},"schema_version":"1.0"},"canonical_sha256":"d6d687c3619dea1da5f052b80e630e07bf2b1061f67f27e49bc0df0b193988de","source":{"kind":"arxiv","id":"2602.02898","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.02898","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"arxiv_version","alias_value":"2602.02898v2","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.02898","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"pith_short_12","alias_value":"23LIPQ3BTXVB","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"pith_short_16","alias_value":"23LIPQ3BTXVB3JPQ","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"pith_short_8","alias_value":"23LIPQ3B","created_at":"2026-05-28T01:04:36Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:23LIPQ3BTXVB3JPQKK4A4YYOA6","target":"record","payload":{"canonical_record":{"source":{"id":"2602.02898","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-02-02T23:11:09Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"7d77d6ce1829ade14ca7c7785c86af58e432282f40b3bc8a131a4d272db84adb","abstract_canon_sha256":"4f0b3fa9c1d64d5131697e6d3c75b8895e5e75e463a805a08310ca2fe5563334"},"schema_version":"1.0"},"canonical_sha256":"d6d687c3619dea1da5f052b80e630e07bf2b1061f67f27e49bc0df0b193988de","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-28T01:04:36.228126Z","signature_b64":"PPpcDz8aq42dYeBzT1xpyHiXwI8Cx3hKTyGsD1f17k3HkLQi6JVS+ubYDFXbcAq670dlaImlZNlCAXwy73ViBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d6d687c3619dea1da5f052b80e630e07bf2b1061f67f27e49bc0df0b193988de","last_reissued_at":"2026-05-28T01:04:36.227719Z","signature_status":"signed_v1","first_computed_at":"2026-05-28T01:04:36.227719Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.02898","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-28T01:04:36Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"f9g20fgK3cFfJSEywLzOSt/UgrvTsPc7M6YSA8OAJj492HszGsdOvSrjpIZiEPzj2Dp8OqeijizumypYRmw9AA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-04T19:34:47.518920Z"},"content_sha256":"dfbd3823f33fc8d75fd1afdcdbbe6ce0466b9d2d9cc3e6dbd2499b171124e2ec","schema_version":"1.0","event_id":"sha256:dfbd3823f33fc8d75fd1afdcdbbe6ce0466b9d2d9cc3e6dbd2499b171124e2ec"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:23LIPQ3BTXVB3JPQKK4A4YYOA6","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Aligning Language Model Benchmarks with Pairwise Preferences","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.AI","authors_text":"Ahmed Alaa, Hannah Cyberey, Jonathan Richard Schwarz, Marco Gutierrez, Thomas Hartvigsen, Xinyi Leng","submitted_at":"2026-02-02T23:11:09Z","abstract_excerpt":"Language model benchmarks are pervasive and computationally-efficient proxies for real-world performance. However, many recent works find that benchmarks often fail to predict real utility. Towards bridging this gap, we introduce benchmark alignment, where we use limited amounts of information about model performance to automatically update offline benchmarks, aiming to produce new static benchmarks that predict model pairwise preferences in given test settings. We then propose BenchAlign, the first solution to this problem, which learns preference-aligned weight- ings for benchmark questions "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.02898","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.02898/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-28T01:04:36Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"4JHBFU4dxwkit+Bpk/mM7KiozvSeuLP5XjD1vE2rkspxFZlRfsqaquNhGubL7+iHKUYiWbn+EzIJe+O8IkwyAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-04T19:34:47.519294Z"},"content_sha256":"d1f1d7dc6b7908e3e8aa16a5009d9a0284ccc72a129db873d074ab6180b16e31","schema_version":"1.0","event_id":"sha256:d1f1d7dc6b7908e3e8aa16a5009d9a0284ccc72a129db873d074ab6180b16e31"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/23LIPQ3BTXVB3JPQKK4A4YYOA6/bundle.json","state_url":"https://pith.science/pith/23LIPQ3BTXVB3JPQKK4A4YYOA6/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/23LIPQ3BTXVB3JPQKK4A4YYOA6/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-04T19:34:47Z","links":{"resolver":"https://pith.science/pith/23LIPQ3BTXVB3JPQKK4A4YYOA6","bundle":"https://pith.science/pith/23LIPQ3BTXVB3JPQKK4A4YYOA6/bundle.json","state":"https://pith.science/pith/23LIPQ3BTXVB3JPQKK4A4YYOA6/state.json","well_known_bundle":"https://pith.science/.well-known/pith/23LIPQ3BTXVB3JPQKK4A4YYOA6/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:23LIPQ3BTXVB3JPQKK4A4YYOA6","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"4f0b3fa9c1d64d5131697e6d3c75b8895e5e75e463a805a08310ca2fe5563334","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-02-02T23:11:09Z","title_canon_sha256":"7d77d6ce1829ade14ca7c7785c86af58e432282f40b3bc8a131a4d272db84adb"},"schema_version":"1.0","source":{"id":"2602.02898","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.02898","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"arxiv_version","alias_value":"2602.02898v2","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.02898","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"pith_short_12","alias_value":"23LIPQ3BTXVB","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"pith_short_16","alias_value":"23LIPQ3BTXVB3JPQ","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"pith_short_8","alias_value":"23LIPQ3B","created_at":"2026-05-28T01:04:36Z"}],"graph_snapshots":[{"event_id":"sha256:d1f1d7dc6b7908e3e8aa16a5009d9a0284ccc72a129db873d074ab6180b16e31","target":"graph","created_at":"2026-05-28T01:04:36Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2602.02898/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Language model benchmarks are pervasive and computationally-efficient proxies for real-world performance. However, many recent works find that benchmarks often fail to predict real utility. Towards bridging this gap, we introduce benchmark alignment, where we use limited amounts of information about model performance to automatically update offline benchmarks, aiming to produce new static benchmarks that predict model pairwise preferences in given test settings. We then propose BenchAlign, the first solution to this problem, which learns preference-aligned weight- ings for benchmark questions ","authors_text":"Ahmed Alaa, Hannah Cyberey, Jonathan Richard Schwarz, Marco Gutierrez, Thomas Hartvigsen, Xinyi Leng","cross_cats":["cs.CL"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-02-02T23:11:09Z","title":"Aligning Language Model Benchmarks with Pairwise Preferences"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.02898","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:dfbd3823f33fc8d75fd1afdcdbbe6ce0466b9d2d9cc3e6dbd2499b171124e2ec","target":"record","created_at":"2026-05-28T01:04:36Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"4f0b3fa9c1d64d5131697e6d3c75b8895e5e75e463a805a08310ca2fe5563334","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-02-02T23:11:09Z","title_canon_sha256":"7d77d6ce1829ade14ca7c7785c86af58e432282f40b3bc8a131a4d272db84adb"},"schema_version":"1.0","source":{"id":"2602.02898","kind":"arxiv","version":2}},"canonical_sha256":"d6d687c3619dea1da5f052b80e630e07bf2b1061f67f27e49bc0df0b193988de","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"d6d687c3619dea1da5f052b80e630e07bf2b1061f67f27e49bc0df0b193988de","first_computed_at":"2026-05-28T01:04:36.227719Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-28T01:04:36.227719Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"PPpcDz8aq42dYeBzT1xpyHiXwI8Cx3hKTyGsD1f17k3HkLQi6JVS+ubYDFXbcAq670dlaImlZNlCAXwy73ViBA==","signature_status":"signed_v1","signed_at":"2026-05-28T01:04:36.228126Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.02898","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:dfbd3823f33fc8d75fd1afdcdbbe6ce0466b9d2d9cc3e6dbd2499b171124e2ec","sha256:d1f1d7dc6b7908e3e8aa16a5009d9a0284ccc72a129db873d074ab6180b16e31"],"state_sha256":"d56d4bb768a0e6cb3185b0b19482fc462ad2d10c571a75cb0fb6ef727c28a5f6"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"1uMf4IqWS4HiXUq7CCo3xJU6oTvH/Hd9EWAegu4qiDWCS9VxebTAptfb5ngrsGrk8IUR2+Fx4SpwOWlXjejmAA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-04T19:34:47.521289Z","bundle_sha256":"9406b33d87be066b888ce8344c33e92bfe347ae48d2628f486445b879d5ac75f"}}