{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:ISXL2LIVGPUQZK7WVCNWC47HLP","short_pith_number":"pith:ISXL2LIV","schema_version":"1.0","canonical_sha256":"44aebd2d1533e90cabf6a89b6173e75bd591e3267231cbad1b403bae4043db9c","source":{"kind":"arxiv","id":"2605.15588","version":1},"attestation_state":"computed","paper":{"title":"Calibrating LLMs with Semantic-level Reward","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Dongxia Wu, Fengfei Yu, Rose Yu, Ruijia Niu, Yian Ma","submitted_at":"2026-05-15T03:55:11Z","abstract_excerpt":"As large language models (LLMs) are deployed in consequential settings such as medical question answering and legal reasoning, the ability to estimate when their outputs are likely to be correct is essential for safe and reliable use, requiring well-calibrated uncertainty. Standard reinforcement learning with verifiable rewards (RLVR) trains models with a binary correctness reward that is indifferent to confidence, providing no penalty for confident but wrong predictions and thereby degrading calibration. Recent work addresses this by training models to produce verbalized confidence scores alo"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.15588","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-15T03:55:11Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"dcba01a3fbb10ee7ab4994076d6ed9ddde7e7091b5bc94eaef462d18e700a6ef","abstract_canon_sha256":"66fd3a229447d27590bdb7abf4586ddcaaab28cbfeac706c777e3088a595af14"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:01:06.858267Z","signature_b64":"nck2E/nm0clHaHUqTmjwjfXk/4JMFHQi3k5zXBVBL+6LxG4rLcz52ksLdDihvQ92/A+zyRq1kvFJbXGvej6gDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"44aebd2d1533e90cabf6a89b6173e75bd591e3267231cbad1b403bae4043db9c","last_reissued_at":"2026-05-20T00:01:06.857475Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:01:06.857475Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Calibrating LLMs with Semantic-level Reward","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Dongxia Wu, Fengfei Yu, Rose Yu, Ruijia Niu, Yian Ma","submitted_at":"2026-05-15T03:55:11Z","abstract_excerpt":"As large language models (LLMs) are deployed in consequential settings such as medical question answering and legal reasoning, the ability to estimate when their outputs are likely to be correct is essential for safe and reliable use, requiring well-calibrated uncertainty. Standard reinforcement learning with verifiable rewards (RLVR) trains models with a binary correctness reward that is indifferent to confidence, providing no penalty for confident but wrong predictions and thereby degrading calibration. Recent work addresses this by training models to produce verbalized confidence scores alo"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.15588","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.15588/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"ai_meta_artifact","ran_at":"2026-05-19T19:34:35.243460Z","status":"skipped","version":"1.0.0","findings_count":0},{"name":"claim_evidence","ran_at":"2026-05-19T17:41:56.065039Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"7bfc34297c00b094d6976d37fc21c7ea8469f580c12a1ec8b796a379f13b9786"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.15588","created_at":"2026-05-20T00:01:06.857605+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.15588v1","created_at":"2026-05-20T00:01:06.857605+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.15588","created_at":"2026-05-20T00:01:06.857605+00:00"},{"alias_kind":"pith_short_12","alias_value":"ISXL2LIVGPUQ","created_at":"2026-05-20T00:01:06.857605+00:00"},{"alias_kind":"pith_short_16","alias_value":"ISXL2LIVGPUQZK7W","created_at":"2026-05-20T00:01:06.857605+00:00"},{"alias_kind":"pith_short_8","alias_value":"ISXL2LIV","created_at":"2026-05-20T00:01:06.857605+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ISXL2LIVGPUQZK7WVCNWC47HLP","json":"https://pith.science/pith/ISXL2LIVGPUQZK7WVCNWC47HLP.json","graph_json":"https://pith.science/api/pith-number/ISXL2LIVGPUQZK7WVCNWC47HLP/graph.json","events_json":"https://pith.science/api/pith-number/ISXL2LIVGPUQZK7WVCNWC47HLP/events.json","paper":"https://pith.science/paper/ISXL2LIV"},"agent_actions":{"view_html":"https://pith.science/pith/ISXL2LIVGPUQZK7WVCNWC47HLP","download_json":"https://pith.science/pith/ISXL2LIVGPUQZK7WVCNWC47HLP.json","view_paper":"https://pith.science/paper/ISXL2LIV","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.15588&json=true","fetch_graph":"https://pith.science/api/pith-number/ISXL2LIVGPUQZK7WVCNWC47HLP/graph.json","fetch_events":"https://pith.science/api/pith-number/ISXL2LIVGPUQZK7WVCNWC47HLP/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ISXL2LIVGPUQZK7WVCNWC47HLP/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ISXL2LIVGPUQZK7WVCNWC47HLP/action/storage_attestation","attest_author":"https://pith.science/pith/ISXL2LIVGPUQZK7WVCNWC47HLP/action/author_attestation","sign_citation":"https://pith.science/pith/ISXL2LIVGPUQZK7WVCNWC47HLP/action/citation_signature","submit_replication":"https://pith.science/pith/ISXL2LIVGPUQZK7WVCNWC47HLP/action/replication_record"}},"created_at":"2026-05-20T00:01:06.857605+00:00","updated_at":"2026-05-20T00:01:06.857605+00:00"}