{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:65R7KA66TKKJELWLEL6GCNEDY5","short_pith_number":"pith:65R7KA66","canonical_record":{"source":{"id":"2605.02122","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-04T01:03:48Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"207f94c72b77219b78e201130c30d6d1464da1323029d97f049bada399110d29","abstract_canon_sha256":"5d3e78a0e5f93f1b7a658e9ec2bb4ec9d46f3592a37701079bba0462aebb27e9"},"schema_version":"1.0"},"canonical_sha256":"f763f503de9a94922ecb22fc613483c77e919378d6ab724a677a01b80d15fa60","source":{"kind":"arxiv","id":"2605.02122","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.02122","created_at":"2026-06-02T03:05:05Z"},{"alias_kind":"arxiv_version","alias_value":"2605.02122v2","created_at":"2026-06-02T03:05:05Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.02122","created_at":"2026-06-02T03:05:05Z"},{"alias_kind":"pith_short_12","alias_value":"65R7KA66TKKJ","created_at":"2026-06-02T03:05:05Z"},{"alias_kind":"pith_short_16","alias_value":"65R7KA66TKKJELWL","created_at":"2026-06-02T03:05:05Z"},{"alias_kind":"pith_short_8","alias_value":"65R7KA66","created_at":"2026-06-02T03:05:05Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:65R7KA66TKKJELWLEL6GCNEDY5","target":"record","payload":{"canonical_record":{"source":{"id":"2605.02122","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-04T01:03:48Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"207f94c72b77219b78e201130c30d6d1464da1323029d97f049bada399110d29","abstract_canon_sha256":"5d3e78a0e5f93f1b7a658e9ec2bb4ec9d46f3592a37701079bba0462aebb27e9"},"schema_version":"1.0"},"canonical_sha256":"f763f503de9a94922ecb22fc613483c77e919378d6ab724a677a01b80d15fa60","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T03:05:05.527916Z","signature_b64":"OmKt/61zGXugUQKRcDPj/VcfkCB7I+hNxU0fy4qhwczu19fjvADhF38nktuDV8xMu0VwMSuCCRxdF/wnnFZXBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f763f503de9a94922ecb22fc613483c77e919378d6ab724a677a01b80d15fa60","last_reissued_at":"2026-06-02T03:05:05.527338Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T03:05:05.527338Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.02122","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-02T03:05:05Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"2vZwh/o02IltCQiK5VHBwWzJijBwmdm59N0300P8rLmygkuhJEAtTKH8syG2KA6CRcE1AcqmvqYdTQKhFKgbCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-28T09:26:28.364733Z"},"content_sha256":"55b971d82a6197751a1f2a59f8276754cedd155ccee94de70eb40aba8d91318e","schema_version":"1.0","event_id":"sha256:55b971d82a6197751a1f2a59f8276754cedd155ccee94de70eb40aba8d91318e"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:65R7KA66TKKJELWLEL6GCNEDY5","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"STABLEVAL: Disagreement-Aware and Stable Evaluation of AI Systems","license":"http://creativecommons.org/licenses/by/4.0/","headline":"STABLEVAL models latent item correctness and annotator confusion to produce stable AI system rankings where majority vote fails.","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Akash Bonagiri, Angelina Lai, Devang Borkar, Gerard Janno Anderias, Gezheng Kang, Houman Homayoun, Ishant Gandhi, Saee Patil, Setareh Rafatirad","submitted_at":"2026-05-04T01:03:48Z","abstract_excerpt":"Human evaluation remains the primary standard for assessing modern AI systems, yet annotator disagreement, bias, and variability make system rankings fragile under standard majority vote aggregation. Majority vote discards annotator reliability and item-level ambiguity, often yielding unstable comparisons across annotator subsets. We introduce STABLEVAL, a disagreement-aware evaluation framework that models latent item correctness and annotator-specific confusion patterns to produce posterior expected item credit and calibrated agent-level scores. Unlike label-denoising approaches such as Dawi"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Across controlled synthetic experiments and multiple real-world human-annotated benchmarks, majority vote exhibits increasing score error and ranking instability under annotator heterogeneity and adversarial noise, while STABLEVAL yields more stable and statistically grounded system rankings.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the chosen probabilistic model of latent item correctness and annotator-specific confusion patterns will produce posteriors that genuinely reflect real-world stability rather than artifacts of the modeling assumptions or fitting procedure.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"STABLEVAL models latent correctness and annotator confusion to deliver more stable and uncertainty-aware AI system rankings than majority-vote aggregation.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"STABLEVAL models latent item correctness and annotator confusion to produce stable AI system rankings where majority vote fails.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"8662a2ab4e73be1dc8a4b48646162d6fd642cde0293a7ae4ba74ec7b45e5297f"},"source":{"id":"2605.02122","kind":"arxiv","version":2},"verdict":{"id":"1caad419-2501-4122-ab2c-da5aef72723c","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-09T16:49:03.882664Z","strongest_claim":"Across controlled synthetic experiments and multiple real-world human-annotated benchmarks, majority vote exhibits increasing score error and ranking instability under annotator heterogeneity and adversarial noise, while STABLEVAL yields more stable and statistically grounded system rankings.","one_line_summary":"STABLEVAL models latent correctness and annotator confusion to deliver more stable and uncertainty-aware AI system rankings than majority-vote aggregation.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the chosen probabilistic model of latent item correctness and annotator-specific confusion patterns will produce posteriors that genuinely reflect real-world stability rather than artifacts of the modeling assumptions or fitting procedure.","pith_extraction_headline":"STABLEVAL models latent item correctness and annotator confusion to produce stable AI system rankings where majority vote fails."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.02122/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"ai_meta_artifact","ran_at":"2026-05-20T16:37:36.509355Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_title_agreement","ran_at":"2026-05-20T04:01:22.628342Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T16:39:22.359550Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"7fb07c8fa32cbb26bcf995b50ce9979d3fd4ad7c0fcb6edb4dd21220811ffd92"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"1caad419-2501-4122-ab2c-da5aef72723c"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-02T03:05:05Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"1q31y+t4/A/mdQreTneSYPMqllait0dHmCqyB3uixQGIf58qPcsxNdZ43aJnQmssj8wx5Q25UuR4e+xbLTKLBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-28T09:26:28.365220Z"},"content_sha256":"71ac32f83e84caacbf3b48be61ff48504092e1463788918d7d1f15cfb34d7d60","schema_version":"1.0","event_id":"sha256:71ac32f83e84caacbf3b48be61ff48504092e1463788918d7d1f15cfb34d7d60"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/65R7KA66TKKJELWLEL6GCNEDY5/bundle.json","state_url":"https://pith.science/pith/65R7KA66TKKJELWLEL6GCNEDY5/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/65R7KA66TKKJELWLEL6GCNEDY5/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-28T09:26:28Z","links":{"resolver":"https://pith.science/pith/65R7KA66TKKJELWLEL6GCNEDY5","bundle":"https://pith.science/pith/65R7KA66TKKJELWLEL6GCNEDY5/bundle.json","state":"https://pith.science/pith/65R7KA66TKKJELWLEL6GCNEDY5/state.json","well_known_bundle":"https://pith.science/.well-known/pith/65R7KA66TKKJELWLEL6GCNEDY5/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:65R7KA66TKKJELWLEL6GCNEDY5","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5d3e78a0e5f93f1b7a658e9ec2bb4ec9d46f3592a37701079bba0462aebb27e9","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-04T01:03:48Z","title_canon_sha256":"207f94c72b77219b78e201130c30d6d1464da1323029d97f049bada399110d29"},"schema_version":"1.0","source":{"id":"2605.02122","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.02122","created_at":"2026-06-02T03:05:05Z"},{"alias_kind":"arxiv_version","alias_value":"2605.02122v2","created_at":"2026-06-02T03:05:05Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.02122","created_at":"2026-06-02T03:05:05Z"},{"alias_kind":"pith_short_12","alias_value":"65R7KA66TKKJ","created_at":"2026-06-02T03:05:05Z"},{"alias_kind":"pith_short_16","alias_value":"65R7KA66TKKJELWL","created_at":"2026-06-02T03:05:05Z"},{"alias_kind":"pith_short_8","alias_value":"65R7KA66","created_at":"2026-06-02T03:05:05Z"}],"graph_snapshots":[{"event_id":"sha256:71ac32f83e84caacbf3b48be61ff48504092e1463788918d7d1f15cfb34d7d60","target":"graph","created_at":"2026-06-02T03:05:05Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Across controlled synthetic experiments and multiple real-world human-annotated benchmarks, majority vote exhibits increasing score error and ranking instability under annotator heterogeneity and adversarial noise, while STABLEVAL yields more stable and statistically grounded system rankings."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the chosen probabilistic model of latent item correctness and annotator-specific confusion patterns will produce posteriors that genuinely reflect real-world stability rather than artifacts of the modeling assumptions or fitting procedure."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"STABLEVAL models latent correctness and annotator confusion to deliver more stable and uncertainty-aware AI system rankings than majority-vote aggregation."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"STABLEVAL models latent item correctness and annotator confusion to produce stable AI system rankings where majority vote fails."}],"snapshot_sha256":"8662a2ab4e73be1dc8a4b48646162d6fd642cde0293a7ae4ba74ec7b45e5297f"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-20T16:37:36.509355Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-20T04:01:22.628342Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T16:39:22.359550Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2605.02122/integrity.json","findings":[],"snapshot_sha256":"7fb07c8fa32cbb26bcf995b50ce9979d3fd4ad7c0fcb6edb4dd21220811ffd92","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Human evaluation remains the primary standard for assessing modern AI systems, yet annotator disagreement, bias, and variability make system rankings fragile under standard majority vote aggregation. Majority vote discards annotator reliability and item-level ambiguity, often yielding unstable comparisons across annotator subsets. We introduce STABLEVAL, a disagreement-aware evaluation framework that models latent item correctness and annotator-specific confusion patterns to produce posterior expected item credit and calibrated agent-level scores. Unlike label-denoising approaches such as Dawi","authors_text":"Akash Bonagiri, Angelina Lai, Devang Borkar, Gerard Janno Anderias, Gezheng Kang, Houman Homayoun, Ishant Gandhi, Saee Patil, Setareh Rafatirad","cross_cats":["cs.AI"],"headline":"STABLEVAL models latent item correctness and annotator confusion to produce stable AI system rankings where majority vote fails.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-04T01:03:48Z","title":"STABLEVAL: Disagreement-Aware and Stable Evaluation of AI Systems"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.02122","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-09T16:49:03.882664Z","id":"1caad419-2501-4122-ab2c-da5aef72723c","model_set":{"reader":"grok-4.3"},"one_line_summary":"STABLEVAL models latent correctness and annotator confusion to deliver more stable and uncertainty-aware AI system rankings than majority-vote aggregation.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"STABLEVAL models latent item correctness and annotator confusion to produce stable AI system rankings where majority vote fails.","strongest_claim":"Across controlled synthetic experiments and multiple real-world human-annotated benchmarks, majority vote exhibits increasing score error and ranking instability under annotator heterogeneity and adversarial noise, while STABLEVAL yields more stable and statistically grounded system rankings.","weakest_assumption":"That the chosen probabilistic model of latent item correctness and annotator-specific confusion patterns will produce posteriors that genuinely reflect real-world stability rather than artifacts of the modeling assumptions or fitting procedure."}},"verdict_id":"1caad419-2501-4122-ab2c-da5aef72723c"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:55b971d82a6197751a1f2a59f8276754cedd155ccee94de70eb40aba8d91318e","target":"record","created_at":"2026-06-02T03:05:05Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5d3e78a0e5f93f1b7a658e9ec2bb4ec9d46f3592a37701079bba0462aebb27e9","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-04T01:03:48Z","title_canon_sha256":"207f94c72b77219b78e201130c30d6d1464da1323029d97f049bada399110d29"},"schema_version":"1.0","source":{"id":"2605.02122","kind":"arxiv","version":2}},"canonical_sha256":"f763f503de9a94922ecb22fc613483c77e919378d6ab724a677a01b80d15fa60","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"f763f503de9a94922ecb22fc613483c77e919378d6ab724a677a01b80d15fa60","first_computed_at":"2026-06-02T03:05:05.527338Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-02T03:05:05.527338Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"OmKt/61zGXugUQKRcDPj/VcfkCB7I+hNxU0fy4qhwczu19fjvADhF38nktuDV8xMu0VwMSuCCRxdF/wnnFZXBA==","signature_status":"signed_v1","signed_at":"2026-06-02T03:05:05.527916Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.02122","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:55b971d82a6197751a1f2a59f8276754cedd155ccee94de70eb40aba8d91318e","sha256:71ac32f83e84caacbf3b48be61ff48504092e1463788918d7d1f15cfb34d7d60"],"state_sha256":"8b3bfb8967b8c92b41422365e5da9580e93a51b01303daf7f4bd63ed3614b36a"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"3n3U83AdFbygINzFGyTMiXd7yTyYfw/P7dKl3oXMnsnxLJVoxFmD5VB+iqyGv4X35wVXD2KdT+a0a9RZGv9/AQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-28T09:26:28.367657Z","bundle_sha256":"7d4b3810a1465535b292ee72bd5021ec437088bad606e39e726aeb70329a73e9"}}