{"work":{"id":"1ac90585-1330-4f90-8836-6382fa63c4eb","openalex_id":null,"doi":null,"arxiv_id":"2406.17557","raw_key":null,"title":"The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale","authors":null,"authors_text":"Guilherme Penedo, Hynek Kydl\'i\v{c}ek, Loubna Ben allal, Anton Lozhkov, Margaret Mitchell, Colin Raffel","year":2024,"venue":"cs.CL","abstract":"The performance of a large language model (LLM) depends heavily on the quality and size of its pretraining dataset. However, the pretraining datasets for state-of-the-art open LLMs like Llama 3 and Mixtral are not publicly available and very little is known about how they were created. In this work, we introduce FineWeb, a 15-trillion token dataset derived from 96 Common Crawl snapshots that produces better-performing LLMs than other open pretraining datasets. To advance the understanding of how best to curate high-quality pretraining datasets, we carefully document and ablate all of the design choices used in FineWeb, including in-depth investigations of deduplication and filtering strategies. In addition, we introduce FineWeb-Edu, a 1.3-trillion token collection of educational text filtered from FineWeb. LLMs pretrained on FineWeb-Edu exhibit dramatically better performance on knowledge- and reasoning-intensive benchmarks like MMLU and ARC. \nAlong with our datasets, we publicly release our data curation codebase and all of the models trained during our ablation experiments.","external_url":"https://arxiv.org/abs/2406.17557","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-14T20:19:27.378405+00:00","pith_arxiv_id":"2406.17557","created_at":"2026-05-08T23:39:24.027703+00:00","updated_at":"2026-05-14T20:19:27.378405+00:00","title_quality_ok":true,"display_title":"The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale","render_title":"The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale"},"hub":{"state":{"work_id":"1ac90585-1330-4f90-8836-6382fa63c4eb","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":30,"external_cited_by_count":null,"distinct_field_count":7,"first_pith_cited_at":"2023-03-31T17:28:46+00:00","last_pith_cited_at":"2026-05-13T09:17:51+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-14T20:46:11.861124+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":1}],"polarity_counts":[{"context_polarity":"background","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}