{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:3L2HJXP4BXKIT3Q7MODPLU6B4G","short_pith_number":"pith:3L2HJXP4","schema_version":"1.0","canonical_sha256":"daf474ddfc0dd489ee1f6386f5d3c1e1a8cd08b462a5533c28ec599c12511717","source":{"kind":"arxiv","id":"2604.01904","version":2},"attestation_state":"computed","paper":{"title":"Combating Data Laundering in LLM Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CR","authors_text":"Feng Liu, Muxing Li, Sharon Li, Zesheng Ye","submitted_at":"2026-04-02T11:19:49Z","abstract_excerpt":"Data rights owners can detect unauthorized data use in large language model (LLM) training by querying with proprietary samples. Often, superior performance (e.g., higher confidence or lower loss) on a sample relative to the untrained data implies it was part of the training corpus, as LLMs tend to perform better on data they have seen during training. However, this detection becomes fragile under data laundering, a practice of transforming the stylistic form of proprietary data, while preserving critical information to obfuscate data provenance. When an LLM is trained exclusively on such laun"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2604.01904","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2026-04-02T11:19:49Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"a7bddefc960448db626ab16f960d7ac9086ae91a0266abd765d2fd8b00c87c51","abstract_canon_sha256":"5ca247c6849c621acd1e5ef22424cdef2696c3a9dc1ee2e2863cf08e5660ae69"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-29T02:05:44.152201Z","signature_b64":"ne301cyHlsiKlPHcjWaAxwSCmViumMUBNox5osAIM3DgXkoigA6FXgXA7id2g3hurrBzobMa5t01QeQ0T2B1Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"daf474ddfc0dd489ee1f6386f5d3c1e1a8cd08b462a5533c28ec599c12511717","last_reissued_at":"2026-05-29T02:05:44.151203Z","signature_status":"signed_v1","first_computed_at":"2026-05-29T02:05:44.151203Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Combating Data Laundering in LLM Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CR","authors_text":"Feng Liu, Muxing Li, Sharon Li, Zesheng Ye","submitted_at":"2026-04-02T11:19:49Z","abstract_excerpt":"Data rights owners can detect unauthorized data use in large language model (LLM) training by querying with proprietary samples. Often, superior performance (e.g., higher confidence or lower loss) on a sample relative to the untrained data implies it was part of the training corpus, as LLMs tend to perform better on data they have seen during training. However, this detection becomes fragile under data laundering, a practice of transforming the stylistic form of proprietary data, while preserving critical information to obfuscate data provenance. When an LLM is trained exclusively on such laun"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2604.01904","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.01904/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2604.01904","created_at":"2026-05-29T02:05:44.151350+00:00"},{"alias_kind":"arxiv_version","alias_value":"2604.01904v2","created_at":"2026-05-29T02:05:44.151350+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.01904","created_at":"2026-05-29T02:05:44.151350+00:00"},{"alias_kind":"pith_short_12","alias_value":"3L2HJXP4BXKI","created_at":"2026-05-29T02:05:44.151350+00:00"},{"alias_kind":"pith_short_16","alias_value":"3L2HJXP4BXKIT3Q7","created_at":"2026-05-29T02:05:44.151350+00:00"},{"alias_kind":"pith_short_8","alias_value":"3L2HJXP4","created_at":"2026-05-29T02:05:44.151350+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/3L2HJXP4BXKIT3Q7MODPLU6B4G","json":"https://pith.science/pith/3L2HJXP4BXKIT3Q7MODPLU6B4G.json","graph_json":"https://pith.science/api/pith-number/3L2HJXP4BXKIT3Q7MODPLU6B4G/graph.json","events_json":"https://pith.science/api/pith-number/3L2HJXP4BXKIT3Q7MODPLU6B4G/events.json","paper":"https://pith.science/paper/3L2HJXP4"},"agent_actions":{"view_html":"https://pith.science/pith/3L2HJXP4BXKIT3Q7MODPLU6B4G","download_json":"https://pith.science/pith/3L2HJXP4BXKIT3Q7MODPLU6B4G.json","view_paper":"https://pith.science/paper/3L2HJXP4","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2604.01904&json=true","fetch_graph":"https://pith.science/api/pith-number/3L2HJXP4BXKIT3Q7MODPLU6B4G/graph.json","fetch_events":"https://pith.science/api/pith-number/3L2HJXP4BXKIT3Q7MODPLU6B4G/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/3L2HJXP4BXKIT3Q7MODPLU6B4G/action/timestamp_anchor","attest_storage":"https://pith.science/pith/3L2HJXP4BXKIT3Q7MODPLU6B4G/action/storage_attestation","attest_author":"https://pith.science/pith/3L2HJXP4BXKIT3Q7MODPLU6B4G/action/author_attestation","sign_citation":"https://pith.science/pith/3L2HJXP4BXKIT3Q7MODPLU6B4G/action/citation_signature","submit_replication":"https://pith.science/pith/3L2HJXP4BXKIT3Q7MODPLU6B4G/action/replication_record"}},"created_at":"2026-05-29T02:05:44.151350+00:00","updated_at":"2026-05-29T02:05:44.151350+00:00"}