{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:L4FF2GRS4U3JRTEF565XYU7RAB","short_pith_number":"pith:L4FF2GRS","schema_version":"1.0","canonical_sha256":"5f0a5d1a32e53698cc85efbb7c53f100542ff13c1ca0696bd719601b5090e720","source":{"kind":"arxiv","id":"2606.03001","version":1},"attestation_state":"computed","paper":{"title":"FOLD: Fuzzy Online Deduplication for Very Large Evolving Datasets via Approximate Nearest Neighbor Search","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.DC","authors_text":"Constantin Adam, Eyal de Lara, Nelson Bore, Oana Balmau, Pritish Mishra","submitted_at":"2026-06-02T01:16:26Z","abstract_excerpt":"Fuzzy deduplication is key to constructing large language model training corpora. However, classic Locality-Sensitive Hashing pipelines scale poorly as corpora grow and are ill-suited to continuous ingestion. We present FOLD (Fuzzy Online Deduplication), an online fuzzy deduplication system that delivers high recall and throughput for evolving datasets. FOLD maintains an incrementally updated HNSW index over admitted documents, retrieving a small, high-quality candidate neighborhood for each incoming document instead of repeatedly rebuilding global buckets or rescanning the accumulated corpus."},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.03001","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DC","submitted_at":"2026-06-02T01:16:26Z","cross_cats_sorted":[],"title_canon_sha256":"8dddb7b2b66836ee7f0b0d0035336ef43d02efac865141380690bc9cad7cc5af","abstract_canon_sha256":"069b11c42876d5ae5c4e826bf27e36843607a3485fa170aaf9d02ee48b3af97e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-03T01:05:28.989750Z","signature_b64":"LjvpBGrieeHY4txaygqw8XBCiRiHcF6LN0zs8f4iigbvpqikCiv9f+FfQDo6jyLF0MHhXc3vwYlN95cNi30XAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5f0a5d1a32e53698cc85efbb7c53f100542ff13c1ca0696bd719601b5090e720","last_reissued_at":"2026-06-03T01:05:28.989308Z","signature_status":"signed_v1","first_computed_at":"2026-06-03T01:05:28.989308Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"FOLD: Fuzzy Online Deduplication for Very Large Evolving Datasets via Approximate Nearest Neighbor Search","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.DC","authors_text":"Constantin Adam, Eyal de Lara, Nelson Bore, Oana Balmau, Pritish Mishra","submitted_at":"2026-06-02T01:16:26Z","abstract_excerpt":"Fuzzy deduplication is key to constructing large language model training corpora. However, classic Locality-Sensitive Hashing pipelines scale poorly as corpora grow and are ill-suited to continuous ingestion. We present FOLD (Fuzzy Online Deduplication), an online fuzzy deduplication system that delivers high recall and throughput for evolving datasets. FOLD maintains an incrementally updated HNSW index over admitted documents, retrieving a small, high-quality candidate neighborhood for each incoming document instead of repeatedly rebuilding global buckets or rescanning the accumulated corpus."},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.03001","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.03001/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.03001","created_at":"2026-06-03T01:05:28.989383+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.03001v1","created_at":"2026-06-03T01:05:28.989383+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.03001","created_at":"2026-06-03T01:05:28.989383+00:00"},{"alias_kind":"pith_short_12","alias_value":"L4FF2GRS4U3J","created_at":"2026-06-03T01:05:28.989383+00:00"},{"alias_kind":"pith_short_16","alias_value":"L4FF2GRS4U3JRTEF","created_at":"2026-06-03T01:05:28.989383+00:00"},{"alias_kind":"pith_short_8","alias_value":"L4FF2GRS","created_at":"2026-06-03T01:05:28.989383+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/L4FF2GRS4U3JRTEF565XYU7RAB","json":"https://pith.science/pith/L4FF2GRS4U3JRTEF565XYU7RAB.json","graph_json":"https://pith.science/api/pith-number/L4FF2GRS4U3JRTEF565XYU7RAB/graph.json","events_json":"https://pith.science/api/pith-number/L4FF2GRS4U3JRTEF565XYU7RAB/events.json","paper":"https://pith.science/paper/L4FF2GRS"},"agent_actions":{"view_html":"https://pith.science/pith/L4FF2GRS4U3JRTEF565XYU7RAB","download_json":"https://pith.science/pith/L4FF2GRS4U3JRTEF565XYU7RAB.json","view_paper":"https://pith.science/paper/L4FF2GRS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.03001&json=true","fetch_graph":"https://pith.science/api/pith-number/L4FF2GRS4U3JRTEF565XYU7RAB/graph.json","fetch_events":"https://pith.science/api/pith-number/L4FF2GRS4U3JRTEF565XYU7RAB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/L4FF2GRS4U3JRTEF565XYU7RAB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/L4FF2GRS4U3JRTEF565XYU7RAB/action/storage_attestation","attest_author":"https://pith.science/pith/L4FF2GRS4U3JRTEF565XYU7RAB/action/author_attestation","sign_citation":"https://pith.science/pith/L4FF2GRS4U3JRTEF565XYU7RAB/action/citation_signature","submit_replication":"https://pith.science/pith/L4FF2GRS4U3JRTEF565XYU7RAB/action/replication_record"}},"created_at":"2026-06-03T01:05:28.989383+00:00","updated_at":"2026-06-03T01:05:28.989383+00:00"}