{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:XKVMUCVMEQOBIVTNG63ILYASG5","short_pith_number":"pith:XKVMUCVM","schema_version":"1.0","canonical_sha256":"baaaca0aac241c14566d37b685e0123745aa53385a5ab9c59190c2677b498cf9","source":{"kind":"arxiv","id":"2606.03043","version":1},"attestation_state":"computed","paper":{"title":"The Geometry of LLM-as-Judge: Why Inter-LLM Consensus Is Not Human Alignment","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Hamna Hamna, Kalika Bali, Sourabrata Mukherjee, Sunayana Sitaram","submitted_at":"2026-06-02T02:26:18Z","abstract_excerpt":"LMs-as-judges are now standard, yet judges agree strongly with one another while agreeing only weakly with humans. We test whether this reflects shared signal or shared bias by measuring four geometric quantities on the standard LLM-as-judge stack across four community-built Indic datasets, eight Indic languages, and 41 LLM judges: score spread, effective rank, principal angle to the human subspace, and stacked correlations among judges and humans, all with bootstrap confidence intervals.\n  On subjective rubrics, judges use less than half the human score range ($\\sigma_J / \\sigma_H \\approx 0.3"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.03043","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-02T02:26:18Z","cross_cats_sorted":[],"title_canon_sha256":"901d046ab92e2ba6acaca27b815ab38a90bf35d51945c2d3f43a6534c8f52e67","abstract_canon_sha256":"594a7adda51ef47e23587df6bca955ad5d3ebd7e74ec0a7489d68fabb1ddc715"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-03T01:05:30.090719Z","signature_b64":"8Sdff0Tc91k0DNhPgcEJjANIzF659NeTtsvbHoknsE1RcJhOuJPn4wbbcXJ4OJjIZ54Ugugfbr0k40itgWI2DQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"baaaca0aac241c14566d37b685e0123745aa53385a5ab9c59190c2677b498cf9","last_reissued_at":"2026-06-03T01:05:30.090327Z","signature_status":"signed_v1","first_computed_at":"2026-06-03T01:05:30.090327Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"The Geometry of LLM-as-Judge: Why Inter-LLM Consensus Is Not Human Alignment","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Hamna Hamna, Kalika Bali, Sourabrata Mukherjee, Sunayana Sitaram","submitted_at":"2026-06-02T02:26:18Z","abstract_excerpt":"LMs-as-judges are now standard, yet judges agree strongly with one another while agreeing only weakly with humans. We test whether this reflects shared signal or shared bias by measuring four geometric quantities on the standard LLM-as-judge stack across four community-built Indic datasets, eight Indic languages, and 41 LLM judges: score spread, effective rank, principal angle to the human subspace, and stacked correlations among judges and humans, all with bootstrap confidence intervals.\n  On subjective rubrics, judges use less than half the human score range ($\\sigma_J / \\sigma_H \\approx 0.3"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.03043","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.03043/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.03043","created_at":"2026-06-03T01:05:30.090382+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.03043v1","created_at":"2026-06-03T01:05:30.090382+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.03043","created_at":"2026-06-03T01:05:30.090382+00:00"},{"alias_kind":"pith_short_12","alias_value":"XKVMUCVMEQOB","created_at":"2026-06-03T01:05:30.090382+00:00"},{"alias_kind":"pith_short_16","alias_value":"XKVMUCVMEQOBIVTN","created_at":"2026-06-03T01:05:30.090382+00:00"},{"alias_kind":"pith_short_8","alias_value":"XKVMUCVM","created_at":"2026-06-03T01:05:30.090382+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XKVMUCVMEQOBIVTNG63ILYASG5","json":"https://pith.science/pith/XKVMUCVMEQOBIVTNG63ILYASG5.json","graph_json":"https://pith.science/api/pith-number/XKVMUCVMEQOBIVTNG63ILYASG5/graph.json","events_json":"https://pith.science/api/pith-number/XKVMUCVMEQOBIVTNG63ILYASG5/events.json","paper":"https://pith.science/paper/XKVMUCVM"},"agent_actions":{"view_html":"https://pith.science/pith/XKVMUCVMEQOBIVTNG63ILYASG5","download_json":"https://pith.science/pith/XKVMUCVMEQOBIVTNG63ILYASG5.json","view_paper":"https://pith.science/paper/XKVMUCVM","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.03043&json=true","fetch_graph":"https://pith.science/api/pith-number/XKVMUCVMEQOBIVTNG63ILYASG5/graph.json","fetch_events":"https://pith.science/api/pith-number/XKVMUCVMEQOBIVTNG63ILYASG5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XKVMUCVMEQOBIVTNG63ILYASG5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XKVMUCVMEQOBIVTNG63ILYASG5/action/storage_attestation","attest_author":"https://pith.science/pith/XKVMUCVMEQOBIVTNG63ILYASG5/action/author_attestation","sign_citation":"https://pith.science/pith/XKVMUCVMEQOBIVTNG63ILYASG5/action/citation_signature","submit_replication":"https://pith.science/pith/XKVMUCVMEQOBIVTNG63ILYASG5/action/replication_record"}},"created_at":"2026-06-03T01:05:30.090382+00:00","updated_at":"2026-06-03T01:05:30.090382+00:00"}