{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:GUTIB7LEI5Z3QKKOTGC77TVXQF","short_pith_number":"pith:GUTIB7LE","schema_version":"1.0","canonical_sha256":"352680fd644773b8294e9985ffceb78165941e3ccb7e1bf61f22c4a80480d62a","source":{"kind":"arxiv","id":"2606.22961","version":1},"attestation_state":"computed","paper":{"title":"LLM-as-a-Judge for Reliable and Explainable Offline Evaluation in Top-K Recommendation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.IR","authors_text":"Chen Ma, Haiming Jin, Junyi Zhou, Qiao Xiang, Xiaokun Zhang, Yue Que","submitted_at":"2026-06-22T07:42:01Z","abstract_excerpt":"Recommendation evaluation plays a crucial role in guiding the refinement and deployment of recommender systems. Most existing trials rely on offline evaluation using Top-K metrics computed over holdout user behaviors. However, we identify two fundamental limitations that undermine their ability to deliver reliable and explainable evaluations. Regarding reliability, offline evaluation treats observed user feedback as a proxy of true preferences and enforces rigid ID matching between the proxy and recommendation. In practice, feedback collections are inherently shaped by incomplete and biased it"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.22961","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.IR","submitted_at":"2026-06-22T07:42:01Z","cross_cats_sorted":[],"title_canon_sha256":"6de0ecd58d98eddde76d7fdf8eaa6a2d9486527915a9dbea1f98fbd5062bba71","abstract_canon_sha256":"32ae260d128df3e3aa6e6b148088eda93659dc540ea9543c397480feae2781e0"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T03:14:05.361056Z","signature_b64":"3lEYSByszgPmYVaLRZ3+gh1iJ4oJY/RS73Bw38MpEWPTAgqygEydUqSoUjQ4ikxTKcOEulPiywy+uwZ3qac7DQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"352680fd644773b8294e9985ffceb78165941e3ccb7e1bf61f22c4a80480d62a","last_reissued_at":"2026-06-23T03:14:05.360711Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T03:14:05.360711Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"LLM-as-a-Judge for Reliable and Explainable Offline Evaluation in Top-K Recommendation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.IR","authors_text":"Chen Ma, Haiming Jin, Junyi Zhou, Qiao Xiang, Xiaokun Zhang, Yue Que","submitted_at":"2026-06-22T07:42:01Z","abstract_excerpt":"Recommendation evaluation plays a crucial role in guiding the refinement and deployment of recommender systems. Most existing trials rely on offline evaluation using Top-K metrics computed over holdout user behaviors. However, we identify two fundamental limitations that undermine their ability to deliver reliable and explainable evaluations. Regarding reliability, offline evaluation treats observed user feedback as a proxy of true preferences and enforces rigid ID matching between the proxy and recommendation. In practice, feedback collections are inherently shaped by incomplete and biased it"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.22961","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.22961/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.22961","created_at":"2026-06-23T03:14:05.360768+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.22961v1","created_at":"2026-06-23T03:14:05.360768+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.22961","created_at":"2026-06-23T03:14:05.360768+00:00"},{"alias_kind":"pith_short_12","alias_value":"GUTIB7LEI5Z3","created_at":"2026-06-23T03:14:05.360768+00:00"},{"alias_kind":"pith_short_16","alias_value":"GUTIB7LEI5Z3QKKO","created_at":"2026-06-23T03:14:05.360768+00:00"},{"alias_kind":"pith_short_8","alias_value":"GUTIB7LE","created_at":"2026-06-23T03:14:05.360768+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/GUTIB7LEI5Z3QKKOTGC77TVXQF","json":"https://pith.science/pith/GUTIB7LEI5Z3QKKOTGC77TVXQF.json","graph_json":"https://pith.science/api/pith-number/GUTIB7LEI5Z3QKKOTGC77TVXQF/graph.json","events_json":"https://pith.science/api/pith-number/GUTIB7LEI5Z3QKKOTGC77TVXQF/events.json","paper":"https://pith.science/paper/GUTIB7LE"},"agent_actions":{"view_html":"https://pith.science/pith/GUTIB7LEI5Z3QKKOTGC77TVXQF","download_json":"https://pith.science/pith/GUTIB7LEI5Z3QKKOTGC77TVXQF.json","view_paper":"https://pith.science/paper/GUTIB7LE","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.22961&json=true","fetch_graph":"https://pith.science/api/pith-number/GUTIB7LEI5Z3QKKOTGC77TVXQF/graph.json","fetch_events":"https://pith.science/api/pith-number/GUTIB7LEI5Z3QKKOTGC77TVXQF/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/GUTIB7LEI5Z3QKKOTGC77TVXQF/action/timestamp_anchor","attest_storage":"https://pith.science/pith/GUTIB7LEI5Z3QKKOTGC77TVXQF/action/storage_attestation","attest_author":"https://pith.science/pith/GUTIB7LEI5Z3QKKOTGC77TVXQF/action/author_attestation","sign_citation":"https://pith.science/pith/GUTIB7LEI5Z3QKKOTGC77TVXQF/action/citation_signature","submit_replication":"https://pith.science/pith/GUTIB7LEI5Z3QKKOTGC77TVXQF/action/replication_record"}},"created_at":"2026-06-23T03:14:05.360768+00:00","updated_at":"2026-06-23T03:14:05.360768+00:00"}