{"work":{"id":"52b3c9a6-2a27-45a7-ba2b-ebe4b5bb5a5f","openalex_id":null,"doi":null,"arxiv_id":"2006.16668","raw_key":null,"title":"GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding","authors":null,"authors_text":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang","year":2020,"venue":"cs.CL","abstract":"Neural network scaling has been critical for improving the model quality in many real-world machine learning applications with vast amounts of training data and compute. Although this trend of scaling is affirmed to be a sure-fire approach for better model quality, there are challenges on the path such as the computation cost, ease of programming, and efficient implementation on parallel devices. GShard is a module composed of a set of lightweight annotation APIs and an extension to the XLA compiler. It provides an elegant way to express a wide range of parallel computation patterns with minimal changes to the existing model code. GShard enabled us to scale up multilingual neural machine translation Transformer model with Sparsely-Gated Mixture-of-Experts beyond 600 billion parameters using automatic sharding. We demonstrate that such a giant model can efficiently be trained on 2048 TPU v3 accelerators in 4 days to achieve far superior quality for translation from 100 languages to English compared to the prior art.","external_url":"https://arxiv.org/abs/2006.16668","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-15T02:39:33.288183+00:00","pith_arxiv_id":"2006.16668","created_at":"2026-05-08T17:13:38.690845+00:00","updated_at":"2026-05-15T02:39:33.288183+00:00","title_quality_ok":true,"display_title":"GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding","render_title":"GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding"},
"hub":{"state":{"work_id":"52b3c9a6-2a27-45a7-ba2b-ebe4b5bb5a5f","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":57,"external_cited_by_count":null,"distinct_field_count":9,"first_pith_cited_at":"2020-12-31T19:00:10+00:00","last_pith_cited_at":"2026-05-14T06:33:41+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-15T04:07:31.051837+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":1},{"context_role":"dataset","n":1},{"context_role":"method","n":1}],"polarity_counts":[{"context_polarity":"background","n":1},{"context_polarity":"use_dataset","n":1},{"context_polarity":"use_method","n":1}],
"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T12:20:41.534475+00:00"},
"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer","work_id":"2c6b3f6d-54e4-4df7-baa7-475a490799af","shared_citers":32},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":23},{"title":"Mixtral of Experts","work_id":"0de8c352-9daa-4e1e-8c7b-3d0dec69f369","shared_citers":22},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":18},{"title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","work_id":"c888e6d1-0b1d-43d6-9ef5-f0912a0efa1b","shared_citers":16},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":14},{"title":"DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models","work_id":"a9888d6d-bf47-4324-9834-7cc12ac3a78c","shared_citers":13},{"title":"arXiv preprint arXiv:2408.15664 , year=","work_id":"267500ca-1512-478f-8a1b-6ecbdb09771d","shared_citers":8},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":8},{"title":"gpt-oss-120b & gpt-oss-20b Model Card","work_id":"178c1f7e-4f19-4392-a45d-45a6dfa88ead","shared_citers":7},{"title":"Kimi K2: Open Agentic Intelligence","work_id":"7f18284c-12d3-4137-bea1-1da97e8cf3c1","shared_citers":7},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":7},{"title":"Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity","work_id":"f43c4955-a965-4897-a11b-c4b25d2aeaa8","shared_citers":7},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":7},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":7},{"title":"DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model","work_id":"1e1df141-cac8-47fd-b068-c4c96e51e331","shared_citers":6},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":6},{"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","shared_citers":6},{"title":"ST-MoE: Designing Stable and Transferable Sparse Expert Models","work_id":"b7581741-3f43-4528-a7d0-3af9e51a4d9f","shared_citers":6},{"title":"Training Compute-Optimal Large Language Models","work_id":"b2faf28d-86b7-429c-bc42-469458efc246","shared_citers":6},{"title":"arXiv preprint arXiv:1806.03377 , year=","work_id":"335ca03b-43f7-43d8-af32-3eaeb6735100","shared_citers":5},{"title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","work_id":"ed240a10-5b19-406c-baa5-30803f465785","shared_citers":5},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":5},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":5}],"time_series":[{"n":1,"year":2020},{"n":1,"year":2021},{"n":2,"year":2022},{"n":1,"year":2023},{"n":1,"year":2024},{"n":2,"year":2025},{"n":46,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T12:20:30.595830+00:00"},
"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T12:20:34.178522+00:00"},
"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding","claims":[{"claim_text":"Neural network scaling has been critical for improving the model quality in many real-world machine learning applications with vast amounts of training data and compute. Although this trend of scaling is affirmed to be a sure-fire approach for better model quality, there are challenges on the path such as the computation cost, ease of programming, and efficient implementation on parallel devices. GShard is a module composed of a set of lightweight annotation APIs and an extension to the XLA compiler. It provides an elegant way to express a wide range of parallel computation patterns with minim","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T12:20:38.381954+00:00"}},
"summary":{"title":"GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding","claims":[{"claim_text":"Neural network scaling has been critical for improving the model quality in many real-world machine learning applications with vast amounts of training data and compute. Although this trend of scaling is affirmed to be a sure-fire approach for better model quality, there are challenges on the path such as the computation cost, ease of programming, and efficient implementation on parallel devices. GShard is a module composed of a set of lightweight annotation APIs and an extension to the XLA compiler. It provides an elegant way to express a wide range of parallel computation patterns with minim","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding because it crossed a citation-hub threshold.","role_counts":[]},
"graph":{"co_cited":[{"title":"Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer","work_id":"2c6b3f6d-54e4-4df7-baa7-475a490799af","shared_citers":32},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":23},{"title":"Mixtral of Experts","work_id":"0de8c352-9daa-4e1e-8c7b-3d0dec69f369","shared_citers":22},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":18},{"title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","work_id":"c888e6d1-0b1d-43d6-9ef5-f0912a0efa1b","shared_citers":16},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":14},{"title":"DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models","work_id":"a9888d6d-bf47-4324-9834-7cc12ac3a78c","shared_citers":13},{"title":"arXiv preprint arXiv:2408.15664 , year=","work_id":"267500ca-1512-478f-8a1b-6ecbdb09771d","shared_citers":8},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":8},{"title":"gpt-oss-120b & gpt-oss-20b Model Card","work_id":"178c1f7e-4f19-4392-a45d-45a6dfa88ead","shared_citers":7},{"title":"Kimi K2: Open Agentic Intelligence","work_id":"7f18284c-12d3-4137-bea1-1da97e8cf3c1","shared_citers":7},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":7},{"title":"Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity","work_id":"f43c4955-a965-4897-a11b-c4b25d2aeaa8","shared_citers":7},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":7},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":7},{"title":"DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model","work_id":"1e1df141-cac8-47fd-b068-c4c96e51e331","shared_citers":6},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":6},{"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","shared_citers":6},{"title":"ST-MoE: Designing Stable and Transferable Sparse Expert Models","work_id":"b7581741-3f43-4528-a7d0-3af9e51a4d9f","shared_citers":6},{"title":"Training Compute-Optimal Large Language Models","work_id":"b2faf28d-86b7-429c-bc42-469458efc246","shared_citers":6},{"title":"arXiv preprint arXiv:1806.03377 , year=","work_id":"335ca03b-43f7-43d8-af32-3eaeb6735100","shared_citers":5},{"title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","work_id":"ed240a10-5b19-406c-baa5-30803f465785","shared_citers":5},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":5},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":5}],"time_series":[{"n":1,"year":2020},{"n":1,"year":2021},{"n":2,"year":2022},{"n":1,"year":2023},{"n":1,"year":2024},{"n":2,"year":2025},{"n":46,"year":2026}],"dependency_candidates":[]},"authors":[]}}