{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:2LPQHPGHTBM5B6DJGEOAWNQUMD","short_pith_number":"pith:2LPQHPGH","schema_version":"1.0","canonical_sha256":"d2df03bcc79859d0f869311c0b361460eabd530635fa69c50ea713a4e346b007","source":{"kind":"arxiv","id":"2606.25388","version":1},"attestation_state":"computed","paper":{"title":"TabClean: Reusable LLM-Synthesized Programs for Tabular Data Cleaning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.DB","authors_text":"Bharat Bhargava, Chunwei Liu, Riteng Zhang, Yibo Wang, Yinghao He, Yongye Su","submitted_at":"2026-06-24T04:35:35Z","abstract_excerpt":"Reliable analytics and machine-learning pipelines depend on clean tabular data, yet production tables often contain missing values, typographical errors, inconsistent formats, violated dependencies, unit mismatches, and ambiguous categorical values. Existing cleaning systems make different trade-offs. Constraint-based systems need experts to specify rules. Learning-based systems need labels or retraining. Recent LLM-based cleaners reduce setup effort, but many call an LLM on rows, cells, or repeated workflow steps, so their cost grows with table size and with every recurring batch.\n  We presen"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.25388","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DB","submitted_at":"2026-06-24T04:35:35Z","cross_cats_sorted":[],"title_canon_sha256":"4c6a11f93ec9f853ddc4d16c60cb4463f9c32283c61bfee9e89d20dec0c222e3","abstract_canon_sha256":"6dc067d6264035f3d7ffcc0b819d3c01e5eacae20f8a0f0590b63f7b90636d56"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-25T01:18:04.004148Z","signature_b64":"MhcUiU6qwrNIeZAeD5Z1RErJaj7O5zXf2T9CZ8mE9b25xXMPFnVehN6G+QQNW6kNa40yhhutk6cJ07VC0+uuBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d2df03bcc79859d0f869311c0b361460eabd530635fa69c50ea713a4e346b007","last_reissued_at":"2026-06-25T01:18:04.003608Z","signature_status":"signed_v1","first_computed_at":"2026-06-25T01:18:04.003608Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"TabClean: Reusable LLM-Synthesized Programs for Tabular Data Cleaning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.DB","authors_text":"Bharat Bhargava, Chunwei Liu, Riteng Zhang, Yibo Wang, Yinghao He, Yongye Su","submitted_at":"2026-06-24T04:35:35Z","abstract_excerpt":"Reliable analytics and machine-learning pipelines depend on clean tabular data, yet production tables often contain missing values, typographical errors, inconsistent formats, violated dependencies, unit mismatches, and ambiguous categorical values. Existing cleaning systems make different trade-offs. Constraint-based systems need experts to specify rules. Learning-based systems need labels or retraining. Recent LLM-based cleaners reduce setup effort, but many call an LLM on rows, cells, or repeated workflow steps, so their cost grows with table size and with every recurring batch.\n  We presen"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.25388","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.25388/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.25388","created_at":"2026-06-25T01:18:04.003671+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.25388v1","created_at":"2026-06-25T01:18:04.003671+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.25388","created_at":"2026-06-25T01:18:04.003671+00:00"},{"alias_kind":"pith_short_12","alias_value":"2LPQHPGHTBM5","created_at":"2026-06-25T01:18:04.003671+00:00"},{"alias_kind":"pith_short_16","alias_value":"2LPQHPGHTBM5B6DJ","created_at":"2026-06-25T01:18:04.003671+00:00"},{"alias_kind":"pith_short_8","alias_value":"2LPQHPGH","created_at":"2026-06-25T01:18:04.003671+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/2LPQHPGHTBM5B6DJGEOAWNQUMD","json":"https://pith.science/pith/2LPQHPGHTBM5B6DJGEOAWNQUMD.json","graph_json":"https://pith.science/api/pith-number/2LPQHPGHTBM5B6DJGEOAWNQUMD/graph.json","events_json":"https://pith.science/api/pith-number/2LPQHPGHTBM5B6DJGEOAWNQUMD/events.json","paper":"https://pith.science/paper/2LPQHPGH"},"agent_actions":{"view_html":"https://pith.science/pith/2LPQHPGHTBM5B6DJGEOAWNQUMD","download_json":"https://pith.science/pith/2LPQHPGHTBM5B6DJGEOAWNQUMD.json","view_paper":"https://pith.science/paper/2LPQHPGH","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.25388&json=true","fetch_graph":"https://pith.science/api/pith-number/2LPQHPGHTBM5B6DJGEOAWNQUMD/graph.json","fetch_events":"https://pith.science/api/pith-number/2LPQHPGHTBM5B6DJGEOAWNQUMD/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/2LPQHPGHTBM5B6DJGEOAWNQUMD/action/timestamp_anchor","attest_storage":"https://pith.science/pith/2LPQHPGHTBM5B6DJGEOAWNQUMD/action/storage_attestation","attest_author":"https://pith.science/pith/2LPQHPGHTBM5B6DJGEOAWNQUMD/action/author_attestation","sign_citation":"https://pith.science/pith/2LPQHPGHTBM5B6DJGEOAWNQUMD/action/citation_signature","submit_replication":"https://pith.science/pith/2LPQHPGHTBM5B6DJGEOAWNQUMD/action/replication_record"}},"created_at":"2026-06-25T01:18:04.003671+00:00","updated_at":"2026-06-25T01:18:04.003671+00:00"}