{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:QVD4L32Z2F5ALHWMTX4IZCJSKQ","short_pith_number":"pith:QVD4L32Z","canonical_record":{"source":{"id":"2606.24998","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-23T16:02:40Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"f315da6f562fb1d59436e1186c94297f650d9bd8b235c1536e56ee49241d251c","abstract_canon_sha256":"01946e2fa0e7047c2ae67b60a0c81192c32dc99b4036e8586f173f4018aec15e"},"schema_version":"1.0"},"canonical_sha256":"8547c5ef59d17a059ecc9df88c8932541eb817378dfcf055f3946ab3bf6f7b87","source":{"kind":"arxiv","id":"2606.24998","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.24998","created_at":"2026-06-25T00:17:47Z"},{"alias_kind":"arxiv_version","alias_value":"2606.24998v1","created_at":"2026-06-25T00:17:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.24998","created_at":"2026-06-25T00:17:47Z"},{"alias_kind":"pith_short_12","alias_value":"QVD4L32Z2F5A","created_at":"2026-06-25T00:17:47Z"},{"alias_kind":"pith_short_16","alias_value":"QVD4L32Z2F5ALHWM","created_at":"2026-06-25T00:17:47Z"},{"alias_kind":"pith_short_8","alias_value":"QVD4L32Z","created_at":"2026-06-25T00:17:47Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:QVD4L32Z2F5ALHWMTX4IZCJSKQ","target":"record","payload":{"canonical_record":{"source":{"id":"2606.24998","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-23T16:02:40Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"f315da6f562fb1d59436e1186c94297f650d9bd8b235c1536e56ee49241d251c","abstract_canon_sha256":"01946e2fa0e7047c2ae67b60a0c81192c32dc99b4036e8586f173f4018aec15e"},"schema_version":"1.0"},"canonical_sha256":"8547c5ef59d17a059ecc9df88c8932541eb817378dfcf055f3946ab3bf6f7b87","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-25T00:17:47.875772Z","signature_b64":"QkLNXLQrz4ZW/9Caj7AzVynF9S7x8Up6vchOH9FKhJtzveVtaptWod+lnEDPRtc7D232uiojTkKUMncvgO0ZDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8547c5ef59d17a059ecc9df88c8932541eb817378dfcf055f3946ab3bf6f7b87","last_reissued_at":"2026-06-25T00:17:47.875367Z","signature_status":"signed_v1","first_computed_at":"2026-06-25T00:17:47.875367Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2606.24998","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-25T00:17:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Z7e78tuKtnas15rS+LdFTHlvvT2AXxnQ678YIbgfvl955hBeh0g8fws3b+ylGGSKssBxYQ4fSK9YPY527B8nBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-27T01:42:51.187513Z"},"content_sha256":"274d4dfc0856c9aeb4c0bc6ec0f782d392a87306b06e1c20cb083055b0bec51f","schema_version":"1.0","event_id":"sha256:274d4dfc0856c9aeb4c0bc6ec0f782d392a87306b06e1c20cb083055b0bec51f"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:QVD4L32Z2F5ALHWMTX4IZCJSKQ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Internal Data Repetition Destroys Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Bo He, David Donoho, Jessica Chudnovsky, Joshua Kazdan, Mehmet Donmez, Noam Levi, Rylan Schaeffer, Sanmi Koyejo, Yegor Denisov-Blanch","submitted_at":"2026-06-23T16:02:40Z","abstract_excerpt":"Language models are running out of high-quality training data, and even aggressively deduplicated corpora retain some amount of repetition. Earlier controlled studies predated Chinchilla-style scaling laws and could only measure the cost of repetition indirectly. We revisit repetition in the Chinchilla era, using a fitted no-repetition scaling law to report Compute-Equivalent Gain and Compute-Equivalent Loss. We show that under this modernized paradigm, repetition damage is systematic in three ways. First, holding compute allocated to repeated data constant, eval loss peaks at an intermediate "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.24998","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.24998/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-25T00:17:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Hlp/oBmbA4rEcfe6o9VfGnhbjiwrTPFrZcB8Ke2JKIKmc1Q7Jsm1p+5ehUBqPbwuzc8hzDxGupiqWtX7YRj4Bg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-27T01:42:51.187890Z"},"content_sha256":"aef23d8bb6662b199a34a6b2cef0a566dc924213133fd4bf60aebc1684d2be20","schema_version":"1.0","event_id":"sha256:aef23d8bb6662b199a34a6b2cef0a566dc924213133fd4bf60aebc1684d2be20"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/QVD4L32Z2F5ALHWMTX4IZCJSKQ/bundle.json","state_url":"https://pith.science/pith/QVD4L32Z2F5ALHWMTX4IZCJSKQ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/QVD4L32Z2F5ALHWMTX4IZCJSKQ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-27T01:42:51Z","links":{"resolver":"https://pith.science/pith/QVD4L32Z2F5ALHWMTX4IZCJSKQ","bundle":"https://pith.science/pith/QVD4L32Z2F5ALHWMTX4IZCJSKQ/bundle.json","state":"https://pith.science/pith/QVD4L32Z2F5ALHWMTX4IZCJSKQ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/QVD4L32Z2F5ALHWMTX4IZCJSKQ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:QVD4L32Z2F5ALHWMTX4IZCJSKQ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"01946e2fa0e7047c2ae67b60a0c81192c32dc99b4036e8586f173f4018aec15e","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-23T16:02:40Z","title_canon_sha256":"f315da6f562fb1d59436e1186c94297f650d9bd8b235c1536e56ee49241d251c"},"schema_version":"1.0","source":{"id":"2606.24998","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.24998","created_at":"2026-06-25T00:17:47Z"},{"alias_kind":"arxiv_version","alias_value":"2606.24998v1","created_at":"2026-06-25T00:17:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.24998","created_at":"2026-06-25T00:17:47Z"},{"alias_kind":"pith_short_12","alias_value":"QVD4L32Z2F5A","created_at":"2026-06-25T00:17:47Z"},{"alias_kind":"pith_short_16","alias_value":"QVD4L32Z2F5ALHWM","created_at":"2026-06-25T00:17:47Z"},{"alias_kind":"pith_short_8","alias_value":"QVD4L32Z","created_at":"2026-06-25T00:17:47Z"}],"graph_snapshots":[{"event_id":"sha256:aef23d8bb6662b199a34a6b2cef0a566dc924213133fd4bf60aebc1684d2be20","target":"graph","created_at":"2026-06-25T00:17:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.24998/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Language models are running out of high-quality training data, and even aggressively deduplicated corpora retain some amount of repetition. Earlier controlled studies predated Chinchilla-style scaling laws and could only measure the cost of repetition indirectly. We revisit repetition in the Chinchilla era, using a fitted no-repetition scaling law to report Compute-Equivalent Gain and Compute-Equivalent Loss. We show that under this modernized paradigm, repetition damage is systematic in three ways. First, holding compute allocated to repeated data constant, eval loss peaks at an intermediate ","authors_text":"Bo He, David Donoho, Jessica Chudnovsky, Joshua Kazdan, Mehmet Donmez, Noam Levi, Rylan Schaeffer, Sanmi Koyejo, Yegor Denisov-Blanch","cross_cats":["cs.AI"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-23T16:02:40Z","title":"Internal Data Repetition Destroys Language Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.24998","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:274d4dfc0856c9aeb4c0bc6ec0f782d392a87306b06e1c20cb083055b0bec51f","target":"record","created_at":"2026-06-25T00:17:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"01946e2fa0e7047c2ae67b60a0c81192c32dc99b4036e8586f173f4018aec15e","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-23T16:02:40Z","title_canon_sha256":"f315da6f562fb1d59436e1186c94297f650d9bd8b235c1536e56ee49241d251c"},"schema_version":"1.0","source":{"id":"2606.24998","kind":"arxiv","version":1}},"canonical_sha256":"8547c5ef59d17a059ecc9df88c8932541eb817378dfcf055f3946ab3bf6f7b87","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"8547c5ef59d17a059ecc9df88c8932541eb817378dfcf055f3946ab3bf6f7b87","first_computed_at":"2026-06-25T00:17:47.875367Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-25T00:17:47.875367Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"QkLNXLQrz4ZW/9Caj7AzVynF9S7x8Up6vchOH9FKhJtzveVtaptWod+lnEDPRtc7D232uiojTkKUMncvgO0ZDw==","signature_status":"signed_v1","signed_at":"2026-06-25T00:17:47.875772Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.24998","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:274d4dfc0856c9aeb4c0bc6ec0f782d392a87306b06e1c20cb083055b0bec51f","sha256:aef23d8bb6662b199a34a6b2cef0a566dc924213133fd4bf60aebc1684d2be20"],"state_sha256":"1020e0b08a11ef0fae5c8d70c96fde76171bb14478a81be7e860bc01ce3bb04a"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"HKVbjR2REaEYbBMyPmaHRdYcR3mqAC3tWtkEArzIMO9NHXgufJK63Gmyz4t2KGUJaE5CnXZAwOMHIKSJC36eAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-27T01:42:51.189857Z","bundle_sha256":"4c8ff68f3777abd5703f409a37562786b53626e1e6a7a137a93156a963cbe5fb"}}