{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:6YELB6EVCMTZR5DC6737CTWO6O","short_pith_number":"pith:6YELB6EV","schema_version":"1.0","canonical_sha256":"f608b0f895132798f462f7f7f14ecef3a359351a4058b08628bad9d2ab1c06e3","source":{"kind":"arxiv","id":"2606.29920","version":1},"attestation_state":"computed","paper":{"title":"Can LLM-as-a-Judge Reliably Verify Rubrics in Agentic Scenarios?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Bin Xu, Guanzhong He, Hao Peng, Haotian Xia, Juanzi Li, Lei Hou, Richeng Xuan, Songyuanyi Lu, Xintong Shi, Yangda Peng, Yixian Liu, Yuhong Liu, Yunjia Qi, Zhichao Hu","submitted_at":"2026-06-29T07:57:23Z","abstract_excerpt":"Rubric-based scoring has become a widely used paradigm in model evaluation, typically with LLM-as-a-Judge (LaaJ) for rubric scoring. However, the reliability of LaaJ for rubric scoring remains underexplored. This concern is especially pronounced in agentic scenarios, where long, complex outputs further challenge reliable scoring. To address this, we conduct a systematic meta-evaluation of LaaJ reliability for rubric verification. We introduce RuVerBench, the first benchmark for assessing LaaJ reliability in rubric verification for agentic scenarios. RuVerBench covers two prevalent agentic doma"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.29920","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-06-29T07:57:23Z","cross_cats_sorted":[],"title_canon_sha256":"318c8c78c39965eb5c55d4683305c53259c8f318a0089e3684c8ce50365d1cc5","abstract_canon_sha256":"1e4255e850ee6f9d7e8ba8ede56f2ee2e6238ff32b812ad46a7bcd2c067e536e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T02:17:41.068048Z","signature_b64":"FH4XmzTZsrkI1f/0UoxZLwI5/mLhFg0aicAWBpawpWa/I+IteH1Kt4mV3mfvhHR5irEL7c+a6YOXpB5YDffIDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f608b0f895132798f462f7f7f14ecef3a359351a4058b08628bad9d2ab1c06e3","last_reissued_at":"2026-06-30T02:17:41.067446Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T02:17:41.067446Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Can LLM-as-a-Judge Reliably Verify Rubrics in Agentic Scenarios?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Bin Xu, Guanzhong He, Hao Peng, Haotian Xia, Juanzi Li, Lei Hou, Richeng Xuan, Songyuanyi Lu, Xintong Shi, Yangda Peng, Yixian Liu, Yuhong Liu, Yunjia Qi, Zhichao Hu","submitted_at":"2026-06-29T07:57:23Z","abstract_excerpt":"Rubric-based scoring has become a widely used paradigm in model evaluation, typically with LLM-as-a-Judge (LaaJ) for rubric scoring. However, the reliability of LaaJ for rubric scoring remains underexplored. This concern is especially pronounced in agentic scenarios, where long, complex outputs further challenge reliable scoring. To address this, we conduct a systematic meta-evaluation of LaaJ reliability for rubric verification. We introduce RuVerBench, the first benchmark for assessing LaaJ reliability in rubric verification for agentic scenarios. RuVerBench covers two prevalent agentic doma"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.29920","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.29920/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.29920","created_at":"2026-06-30T02:17:41.067531+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.29920v1","created_at":"2026-06-30T02:17:41.067531+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.29920","created_at":"2026-06-30T02:17:41.067531+00:00"},{"alias_kind":"pith_short_12","alias_value":"6YELB6EVCMTZ","created_at":"2026-06-30T02:17:41.067531+00:00"},{"alias_kind":"pith_short_16","alias_value":"6YELB6EVCMTZR5DC","created_at":"2026-06-30T02:17:41.067531+00:00"},{"alias_kind":"pith_short_8","alias_value":"6YELB6EV","created_at":"2026-06-30T02:17:41.067531+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/6YELB6EVCMTZR5DC6737CTWO6O","json":"https://pith.science/pith/6YELB6EVCMTZR5DC6737CTWO6O.json","graph_json":"https://pith.science/api/pith-number/6YELB6EVCMTZR5DC6737CTWO6O/graph.json","events_json":"https://pith.science/api/pith-number/6YELB6EVCMTZR5DC6737CTWO6O/events.json","paper":"https://pith.science/paper/6YELB6EV"},"agent_actions":{"view_html":"https://pith.science/pith/6YELB6EVCMTZR5DC6737CTWO6O","download_json":"https://pith.science/pith/6YELB6EVCMTZR5DC6737CTWO6O.json","view_paper":"https://pith.science/paper/6YELB6EV","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.29920&json=true","fetch_graph":"https://pith.science/api/pith-number/6YELB6EVCMTZR5DC6737CTWO6O/graph.json","fetch_events":"https://pith.science/api/pith-number/6YELB6EVCMTZR5DC6737CTWO6O/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/6YELB6EVCMTZR5DC6737CTWO6O/action/timestamp_anchor","attest_storage":"https://pith.science/pith/6YELB6EVCMTZR5DC6737CTWO6O/action/storage_attestation","attest_author":"https://pith.science/pith/6YELB6EVCMTZR5DC6737CTWO6O/action/author_attestation","sign_citation":"https://pith.science/pith/6YELB6EVCMTZR5DC6737CTWO6O/action/citation_signature","submit_replication":"https://pith.science/pith/6YELB6EVCMTZR5DC6737CTWO6O/action/replication_record"}},"created_at":"2026-06-30T02:17:41.067531+00:00","updated_at":"2026-06-30T02:17:41.067531+00:00"}