{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:A536HLN7UJUF763LWO53KFKHUV","short_pith_number":"pith:A536HLN7","schema_version":"1.0","canonical_sha256":"0777e3adbfa2685ffb6bb3bbb51547a555b8cee1800b2893176cd77160efda46","source":{"kind":"arxiv","id":"1911.02116","version":2},"attestation_state":"computed","paper":{"title":"Unsupervised Cross-lingual Representation Learning at Scale","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Pretraining multilingual language models on 100 languages with over two terabytes of data leads to large gains on cross-lingual benchmarks.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Alexis Conneau, Edouard Grave, Francisco Guzm\\'an, Guillaume Wenzek, Kartikay Khandelwal, Luke Zettlemoyer, Myle Ott, Naman Goyal, Veselin Stoyanov, Vishrav Chaudhary","submitted_at":"2019-11-05T22:42:00Z","abstract_excerpt":"This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +14.6% average accuracy on XNLI, +13% average F1 score on MLQA, and +2.4% F1 score on NER. XLM-R performs particularly well on low-resource languages, improving 15.7% in XNL"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"1911.02116","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-11-05T22:42:00Z","cross_cats_sorted":[],"title_canon_sha256":"f1c1e325d47d6ee88301d33ccbf5082b8804a1f894c8786e31e3003ca0f104c5","abstract_canon_sha256":"fbed3f020eb6d7cdcebb15ee2da04eef8c1db21877b60207edcbbd0b72267088"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:47.315954Z","signature_b64":"97Gr+OMQXw9x7d3vjD9Y+VEp3uGnyWLQP3WtH2hDNyUn8y0ASXc68mfuztz3r7vQiwOecsqt1kkd0fsUopFeAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0777e3adbfa2685ffb6bb3bbb51547a555b8cee1800b2893176cd77160efda46","last_reissued_at":"2026-05-17T23:38:47.315378Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:47.315378Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Unsupervised Cross-lingual Representation Learning at Scale","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Pretraining multilingual language models on 100 languages with over two terabytes of data leads to large gains on cross-lingual benchmarks.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Alexis Conneau, Edouard Grave, Francisco Guzm\\'an, Guillaume Wenzek, Kartikay Khandelwal, Luke Zettlemoyer, Myle Ott, Naman Goyal, Veselin Stoyanov, Vishrav Chaudhary","submitted_at":"2019-11-05T22:42:00Z","abstract_excerpt":"This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +14.6% average accuracy on XNLI, +13% average F1 score on MLQA, and +2.4% F1 score on NER. XLM-R performs particularly well on low-resource languages, improving 15.7% in XNL"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the observed gains are caused by the increased scale of pretraining data and languages rather than by differences in data filtering, hyperparameter choices, or evaluation protocol details not visible in the abstract.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"XLM-R, pretrained on 100 languages with 2TB of CommonCrawl data, improves average XNLI accuracy by 14.6 points and MLQA F1 by 13 points over mBERT while matching strong monolingual models on GLUE.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Pretraining multilingual language models on 100 languages with over two terabytes of data leads to large gains on cross-lingual benchmarks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"ef67d3374724e211b0fc5e8581c310acceec5c663fec28ffa6b47a67a67b4027"},"source":{"id":"1911.02116","kind":"arxiv","version":2},"verdict":{"id":"5d44db8b-ea6a-41b0-a378-480aa65173b2","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T16:18:54.635933Z","strongest_claim":"This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks.","one_line_summary":"XLM-R, pretrained on 100 languages with 2TB of CommonCrawl data, improves average XNLI accuracy by 14.6 points and MLQA F1 by 13 points over mBERT while matching strong monolingual models on GLUE.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the observed gains are caused by the increased scale of pretraining data and languages rather than by differences in data filtering, hyperparameter choices, or evaluation protocol details not visible in the abstract.","pith_extraction_headline":"Pretraining multilingual language models on 100 languages with over two terabytes of data leads to large gains on cross-lingual benchmarks."},"references":{"count":12,"sample":[{"doi":"","year":1907,"title":"Massively multilingual neural machine translation in the wild: Findings and challenges","work_id":"1f743ee4-68c2-4ada-b981-6f62054e2525","ref_index":1,"cited_arxiv_id":"1907.05019","is_internal_anchor":true},{"doi":"","year":2017,"title":"Bag of tricks for efﬁcient text classiﬁcation.EACL 2017, page","work_id":"194fdea9-8a27-4838-8c7b-97928d0f3ecb","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Exploring the limits of language modeling","work_id":"a9dbcb7a-e48d-42a4-8d60-a8f723751a97","ref_index":3,"cited_arxiv_id":"1602.02410","is_internal_anchor":true},{"doi":"","year":1910,"title":"arXiv preprint arXiv:1910.07475","work_id":"da4ff338-4ef3-4d81-bf0d-cb5b178e77df","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1907,"title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach","work_id":"41fe12c4-e538-4890-a244-480650ed3078","ref_index":5,"cited_arxiv_id":"1907.11692","is_internal_anchor":true}],"resolved_work":12,"snapshot_sha256":"15389c1c0e33c1bb9a3d62030e84c68491ebec291274483c3db97663d92cfc21","internal_anchors":6},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1911.02116","created_at":"2026-05-17T23:38:47.315466+00:00"},{"alias_kind":"arxiv_version","alias_value":"1911.02116v2","created_at":"2026-05-17T23:38:47.315466+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1911.02116","created_at":"2026-05-17T23:38:47.315466+00:00"},{"alias_kind":"pith_short_12","alias_value":"A536HLN7UJUF","created_at":"2026-05-18T12:33:12.712433+00:00"},{"alias_kind":"pith_short_16","alias_value":"A536HLN7UJUF763L","created_at":"2026-05-18T12:33:12.712433+00:00"},{"alias_kind":"pith_short_8","alias_value":"A536HLN7","created_at":"2026-05-18T12:33:12.712433+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":36,"internal_anchor_count":36,"sample":[{"citing_arxiv_id":"2606.18389","citing_title":"Want Better Synthetic Data? Steer It: Activation Steering for Low-Resource Language Generation","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2607.02259","citing_title":"BamiBERT: A New BERT-based Language Model for Vietnamese","ref_index":64,"is_internal_anchor":true},{"citing_arxiv_id":"2606.07313","citing_title":"SV-Detect: AI-generated Text Detection with Steering Vectors","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2606.01671","citing_title":"When Meaning Travels: A Granular Lens on Hybrid-MoE's Role in Idiomatic Understanding for Language Models","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2605.24718","citing_title":"The Tokenizer Tax Across 25 European Languages: Domain Invariance, Cross-Lingual Few-Shot Effects, and the Ukrainian Penalty","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2606.16009","citing_title":"Bridging the Usability Gap: Lessons from Interpreting Studies for Machine Interpreting Design","ref_index":129,"is_internal_anchor":true},{"citing_arxiv_id":"2409.11022","citing_title":"DynamicNER: A Dynamic, Multilingual, and Fine-Grained Dataset for LLM-based Named Entity Recognition","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2411.05527","citing_title":"How Good is Your Wikipedia? Auditing Data Quality for Low-resource and Multilingual NLP","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2412.04497","citing_title":"Opportunities and Challenges of Large Language Models for Low-Resource Languages in Humanities Research","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2412.20760","citing_title":"Attributing Culture-Conditioned Generations to Pretraining Corpora","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2502.00414","citing_title":"Social media polarization during conflict: Insights from an ideological stance dataset on Israel-Palestine Reddit comments","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2502.15972","citing_title":"When Cultures Meet: Multicultural Text-to-Image Generation","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2602.16608","citing_title":"Explainable AI: Context-Aware Layer-Wise Integrated Gradients for Explaining Transformer Models","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19274","citing_title":"Lost in Interpretation: The Plausibility-Faithfulness Trade-off in Cross-Lingual Explanations","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2508.01486","citing_title":"Human-Centered Supervision for Sentiment Analysis in Telugu: A Systematic Inquiry Beyond Accuracy","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2509.16621","citing_title":"The Role of Vocabularies in Learning Sparse Representations for Ranking","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2510.14274","citing_title":"Retrofitting Small Multilingual Models for Retrieval: Matching 7B Performance with 300M Parameters","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2305.16264","citing_title":"Scaling Data-Constrained Language Models","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08549","citing_title":"VerifAI: A Verifiable Open-Source Search Engine for Biomedical Question Answering","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2102.04664","citing_title":"CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14503","citing_title":"Not All RAGs Are Created Equal: A Component-Wise Empirical Study for Software Engineering Tasks","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13225","citing_title":"Mix, Don't Tune: Bilingual Pre-Training Outperforms Hyperparameter Search in Data-Constrained Settings","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2604.03673","citing_title":"'Layer su Layer': Identifying and Disambiguating the Italian NPN Construction in BERT's family","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2406.17557","citing_title":"The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2505.06111","citing_title":"UniVLA: Learning to Act Anywhere with Task-centric Latent Actions","ref_index":18,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/A536HLN7UJUF763LWO53KFKHUV","json":"https://pith.science/pith/A536HLN7UJUF763LWO53KFKHUV.json","graph_json":"https://pith.science/api/pith-number/A536HLN7UJUF763LWO53KFKHUV/graph.json","events_json":"https://pith.science/api/pith-number/A536HLN7UJUF763LWO53KFKHUV/events.json","paper":"https://pith.science/paper/A536HLN7"},"agent_actions":{"view_html":"https://pith.science/pith/A536HLN7UJUF763LWO53KFKHUV","download_json":"https://pith.science/pith/A536HLN7UJUF763LWO53KFKHUV.json","view_paper":"https://pith.science/paper/A536HLN7","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1911.02116&json=true","fetch_graph":"https://pith.science/api/pith-number/A536HLN7UJUF763LWO53KFKHUV/graph.json","fetch_events":"https://pith.science/api/pith-number/A536HLN7UJUF763LWO53KFKHUV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/A536HLN7UJUF763LWO53KFKHUV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/A536HLN7UJUF763LWO53KFKHUV/action/storage_attestation","attest_author":"https://pith.science/pith/A536HLN7UJUF763LWO53KFKHUV/action/author_attestation","sign_citation":"https://pith.science/pith/A536HLN7UJUF763LWO53KFKHUV/action/citation_signature","submit_replication":"https://pith.science/pith/A536HLN7UJUF763LWO53KFKHUV/action/replication_record"}},"created_at":"2026-05-17T23:38:47.315466+00:00","updated_at":"2026-05-17T23:38:47.315466+00:00"}