{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:4C5WDQICWM6QPCNENRTJX5RBY3","short_pith_number":"pith:4C5WDQIC","schema_version":"1.0","canonical_sha256":"e0bb61c102b33d0789a46c669bf621c6dd3460192fbe9ce914d3b4bb1e36dbdc","source":{"kind":"arxiv","id":"2606.23403","version":1},"attestation_state":"computed","paper":{"title":"Litmus: Zero-Label, Code-Driven Metric Specification for Evaluating AI Systems","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Apoorva Sharma, Kevin Paul, Prajjwal Gupta, Prasang Gupta, Sumanth Chundru, Vishal Bhutani, Waqar Sarguroh","submitted_at":"2026-06-22T14:26:48Z","abstract_excerpt":"As agentic LLM systems move from prototypes to deployment across increasingly diverse domains, evaluating them has become both more important and more difficult. The challenge is not only that individual metrics may be unreliable, but that evaluation goals are often left implicit. Without a clear account of what a system is expected to do, how it can fail, and which failures matter, metric choices become difficult to justify, interpret, or validate. We present Litmus, a zero-label system that designs evaluation and monitoring metrics for AI pipelines by eliciting evaluation intent from source "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.23403","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-22T14:26:48Z","cross_cats_sorted":[],"title_canon_sha256":"b4ba5a2201843d274c80e716b6007a6cda15cfcfcad2d86006b5dabd90feaa2a","abstract_canon_sha256":"93691792e876ef0df1b74e3937b153b231b04b8062cd204733722d9e6c84bda4"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T03:14:19.145629Z","signature_b64":"rTHEGG7TOEBKvM7BaCQKY0XOUTFpMC1mAZTkjFUtZEneZi7lCjCPBedsTvZ7GEkfFTqznuacXR6OsadIsouiAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e0bb61c102b33d0789a46c669bf621c6dd3460192fbe9ce914d3b4bb1e36dbdc","last_reissued_at":"2026-06-23T03:14:19.145221Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T03:14:19.145221Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Litmus: Zero-Label, Code-Driven Metric Specification for Evaluating AI Systems","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Apoorva Sharma, Kevin Paul, Prajjwal Gupta, Prasang Gupta, Sumanth Chundru, Vishal Bhutani, Waqar Sarguroh","submitted_at":"2026-06-22T14:26:48Z","abstract_excerpt":"As agentic LLM systems move from prototypes to deployment across increasingly diverse domains, evaluating them has become both more important and more difficult. The challenge is not only that individual metrics may be unreliable, but that evaluation goals are often left implicit. Without a clear account of what a system is expected to do, how it can fail, and which failures matter, metric choices become difficult to justify, interpret, or validate. We present Litmus, a zero-label system that designs evaluation and monitoring metrics for AI pipelines by eliciting evaluation intent from source "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.23403","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.23403/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.23403","created_at":"2026-06-23T03:14:19.145273+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.23403v1","created_at":"2026-06-23T03:14:19.145273+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.23403","created_at":"2026-06-23T03:14:19.145273+00:00"},{"alias_kind":"pith_short_12","alias_value":"4C5WDQICWM6Q","created_at":"2026-06-23T03:14:19.145273+00:00"},{"alias_kind":"pith_short_16","alias_value":"4C5WDQICWM6QPCNE","created_at":"2026-06-23T03:14:19.145273+00:00"},{"alias_kind":"pith_short_8","alias_value":"4C5WDQIC","created_at":"2026-06-23T03:14:19.145273+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4C5WDQICWM6QPCNENRTJX5RBY3","json":"https://pith.science/pith/4C5WDQICWM6QPCNENRTJX5RBY3.json","graph_json":"https://pith.science/api/pith-number/4C5WDQICWM6QPCNENRTJX5RBY3/graph.json","events_json":"https://pith.science/api/pith-number/4C5WDQICWM6QPCNENRTJX5RBY3/events.json","paper":"https://pith.science/paper/4C5WDQIC"},"agent_actions":{"view_html":"https://pith.science/pith/4C5WDQICWM6QPCNENRTJX5RBY3","download_json":"https://pith.science/pith/4C5WDQICWM6QPCNENRTJX5RBY3.json","view_paper":"https://pith.science/paper/4C5WDQIC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.23403&json=true","fetch_graph":"https://pith.science/api/pith-number/4C5WDQICWM6QPCNENRTJX5RBY3/graph.json","fetch_events":"https://pith.science/api/pith-number/4C5WDQICWM6QPCNENRTJX5RBY3/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4C5WDQICWM6QPCNENRTJX5RBY3/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4C5WDQICWM6QPCNENRTJX5RBY3/action/storage_attestation","attest_author":"https://pith.science/pith/4C5WDQICWM6QPCNENRTJX5RBY3/action/author_attestation","sign_citation":"https://pith.science/pith/4C5WDQICWM6QPCNENRTJX5RBY3/action/citation_signature","submit_replication":"https://pith.science/pith/4C5WDQICWM6QPCNENRTJX5RBY3/action/replication_record"}},"created_at":"2026-06-23T03:14:19.145273+00:00","updated_at":"2026-06-23T03:14:19.145273+00:00"}