{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:GNKSV2C6QQIC3YO2F6U7UIF5QC","short_pith_number":"pith:GNKSV2C6","schema_version":"1.0","canonical_sha256":"33552ae85e84102de1da2fa9fa20bd80be0578a3e35f6450dfd39ed6050fbdcb","source":{"kind":"arxiv","id":"1811.06965","version":5},"attestation_state":"computed","paper":{"title":"GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Ankur Bapna, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Mia Xu Chen, Orhan Firat, Quoc V. Le, Yanping Huang, Yonghui Wu, Youlong Cheng, Zhifeng Chen","submitted_at":"2018-11-16T18:43:28Z","abstract_excerpt":"Scaling up deep neural network capacity has been known as an effective approach to improving model quality for several different machine learning tasks. In many cases, increasing model capacity beyond the memory limit of a single accelerator has required developing special algorithms or infrastructure. These solutions are often architecture-specific and do not transfer to other tasks. To address the need for efficient and task-independent model parallelism, we introduce GPipe, a pipeline parallelism library that allows scaling any network that can be expressed as a sequence of layers. By pipel"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1811.06965","kind":"arxiv","version":5},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2018-11-16T18:43:28Z","cross_cats_sorted":[],"title_canon_sha256":"73caf0ac66a6aa17f025ba2c9550a3b5f977f262bb8852d0ac65767463dbd379","abstract_canon_sha256":"ef503a6378564ffaa6fb37481575251ffd1c7a99abb42d7a3f8f2253d87bba5d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:32.066513Z","signature_b64":"S/gHBLlpTSlqOW01k5Ph4p/N0A2tQ1BIFAnjoKOQIBvhTnMMJbgUxcMVonUlF2ia6UsTDkkVhHtx8NU0GBM1Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"33552ae85e84102de1da2fa9fa20bd80be0578a3e35f6450dfd39ed6050fbdcb","last_reissued_at":"2026-05-17T23:39:32.065902Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:32.065902Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Ankur Bapna, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Mia Xu Chen, Orhan Firat, Quoc V. Le, Yanping Huang, Yonghui Wu, Youlong Cheng, Zhifeng Chen","submitted_at":"2018-11-16T18:43:28Z","abstract_excerpt":"Scaling up deep neural network capacity has been known as an effective approach to improving model quality for several different machine learning tasks. In many cases, increasing model capacity beyond the memory limit of a single accelerator has required developing special algorithms or infrastructure. These solutions are often architecture-specific and do not transfer to other tasks. To address the need for efficient and task-independent model parallelism, we introduce GPipe, a pipeline parallelism library that allows scaling any network that can be expressed as a sequence of layers. By pipel"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1811.06965","kind":"arxiv","version":5},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1811.06965","created_at":"2026-05-17T23:39:32.065998+00:00"},{"alias_kind":"arxiv_version","alias_value":"1811.06965v5","created_at":"2026-05-17T23:39:32.065998+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1811.06965","created_at":"2026-05-17T23:39:32.065998+00:00"},{"alias_kind":"pith_short_12","alias_value":"GNKSV2C6QQIC","created_at":"2026-05-18T12:32:25.280505+00:00"},{"alias_kind":"pith_short_16","alias_value":"GNKSV2C6QQIC3YO2","created_at":"2026-05-18T12:32:25.280505+00:00"},{"alias_kind":"pith_short_8","alias_value":"GNKSV2C6","created_at":"2026-05-18T12:32:25.280505+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":21,"internal_anchor_count":10,"sample":[{"citing_arxiv_id":"2605.18710","citing_title":"Mosaic: Towards Efficient Training of Multimodal Models with Spatial Resource Multiplexing","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18750","citing_title":"A Readiness-Driven Runtime for Pipeline-Parallel Training under Runtime Variability","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2508.21613","citing_title":"Chameleon: Adaptive Fault Tolerance for Distributed Training via Real-time Policy Selection","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2105.04663","citing_title":"GSPMD: General and Scalable Parallelization for ML Computation Graphs","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2510.15596","citing_title":"PRISM: Probabilistic Runtime Insights and Scalable Performance Modeling for Large-Scale Distributed Training","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2102.01293","citing_title":"Scaling Laws for Transfer","ref_index":86,"is_internal_anchor":true},{"citing_arxiv_id":"1910.04867","citing_title":"A Large-scale Study of Representation Learning with the Visual Task Adaptation Benchmark","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2010.01412","citing_title":"Sharpness-Aware Minimization for Efficiently Improving Generalization","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"1910.02054","citing_title":"ZeRO: Memory Optimizations Toward Training Trillion Parameter Models","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13434","citing_title":"Rescaled Asynchronous SGD: Optimal Distributed Optimization under Data and System Heterogeneity","ref_index":115,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02473","citing_title":"Analyzing Reverse Address Translation Overheads in Multi-GPU Scale-Up Pods","ref_index":48,"is_internal_anchor":false},{"citing_arxiv_id":"2208.07339","citing_title":"LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale","ref_index":72,"is_internal_anchor":false},{"citing_arxiv_id":"2605.11111","citing_title":"ShardTensor: Domain Parallelism for Scientific Machine Learning","ref_index":60,"is_internal_anchor":false},{"citing_arxiv_id":"2309.14509","citing_title":"DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models","ref_index":19,"is_internal_anchor":false},{"citing_arxiv_id":"2604.27085","citing_title":"Efficient Training on Multiple Consumer GPUs with RoundPipe","ref_index":18,"is_internal_anchor":false},{"citing_arxiv_id":"1910.10683","citing_title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","ref_index":29,"is_internal_anchor":false},{"citing_arxiv_id":"2112.00861","citing_title":"A General Language Assistant as a Laboratory for Alignment","ref_index":117,"is_internal_anchor":false},{"citing_arxiv_id":"1909.08053","citing_title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","ref_index":11,"is_internal_anchor":false},{"citing_arxiv_id":"2602.02276","citing_title":"Kimi K2.5: Visual Agentic Intelligence","ref_index":30,"is_internal_anchor":false},{"citing_arxiv_id":"2207.05221","citing_title":"Language Models (Mostly) Know What They Know","ref_index":175,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05049","citing_title":"Piper: Efficient Large-Scale MoE Training via Resource Modeling and Pipelined Hybrid Parallelism","ref_index":29,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/GNKSV2C6QQIC3YO2F6U7UIF5QC","json":"https://pith.science/pith/GNKSV2C6QQIC3YO2F6U7UIF5QC.json","graph_json":"https://pith.science/api/pith-number/GNKSV2C6QQIC3YO2F6U7UIF5QC/graph.json","events_json":"https://pith.science/api/pith-number/GNKSV2C6QQIC3YO2F6U7UIF5QC/events.json","paper":"https://pith.science/paper/GNKSV2C6"},"agent_actions":{"view_html":"https://pith.science/pith/GNKSV2C6QQIC3YO2F6U7UIF5QC","download_json":"https://pith.science/pith/GNKSV2C6QQIC3YO2F6U7UIF5QC.json","view_paper":"https://pith.science/paper/GNKSV2C6","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1811.06965&json=true","fetch_graph":"https://pith.science/api/pith-number/GNKSV2C6QQIC3YO2F6U7UIF5QC/graph.json","fetch_events":"https://pith.science/api/pith-number/GNKSV2C6QQIC3YO2F6U7UIF5QC/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/GNKSV2C6QQIC3YO2F6U7UIF5QC/action/timestamp_anchor","attest_storage":"https://pith.science/pith/GNKSV2C6QQIC3YO2F6U7UIF5QC/action/storage_attestation","attest_author":"https://pith.science/pith/GNKSV2C6QQIC3YO2F6U7UIF5QC/action/author_attestation","sign_citation":"https://pith.science/pith/GNKSV2C6QQIC3YO2F6U7UIF5QC/action/citation_signature","submit_replication":"https://pith.science/pith/GNKSV2C6QQIC3YO2F6U7UIF5QC/action/replication_record"}},"created_at":"2026-05-17T23:39:32.065998+00:00","updated_at":"2026-05-17T23:39:32.065998+00:00"}