{"total":11,"items":[{"citing_arxiv_id":"2605.13434","ref_index":116,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rescaled Asynchronous SGD: Optimal Distributed Optimization under Data and System Heterogeneity","primary_cat":"cs.LG","submitted_at":"2026-05-13T12:27:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Rescaled ASGD recovers convergence to the true global objective by rescaling worker stepsizes proportional to computation times, matching the known time lower bound in the leading term under non-convex smoothness and bounded heterogeneity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27085","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Efficient Training on Multiple Consumer GPUs with RoundPipe","primary_cat":"cs.DC","submitted_at":"2026-04-29T18:26:13+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"RoundPipe achieves near-zero-bubble pipeline parallelism for LLM training on consumer GPUs by dynamically dispatching computation stages round-robin, yielding 1.48-2.16x speedups and enabling 235B model fine-tuning on 8x RTX 4090.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.20534","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Kimi K2: Open Agentic Intelligence","primary_cat":"cs.LG","submitted_at":"2025-07-28T05:35:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Kimi K2 is a 1-trillion-parameter MoE model that leads open-source non-thinking models on agentic benchmarks including 65.8 on SWE-Bench Verified and 66.1 on Tau2-Bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.06066","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models","primary_cat":"cs.CL","submitted_at":"2024-01-11T17:31:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DeepSeekMoE 2B matches GShard 2.9B performance and approaches a dense 2B model; the 16B version matches LLaMA2-7B at 40% compute by using fine-grained expert segmentation plus shared experts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.14509","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models","primary_cat":"cs.LG","submitted_at":"2023-09-25T20:15:57+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeepSpeed-Ulysses keeps communication volume constant for sequence-parallel attention when sequence length and device count scale together, delivering 2.5x faster training on 4x longer sequences than prior 
SOTA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2304.11277","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel","primary_cat":"cs.DC","submitted_at":"2023-04-21T23:52:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PyTorch Fully Sharded Data Parallel enables training of significantly larger models than Distributed Data Parallel with comparable speed and near-linear TFLOPS scaling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2208.07339","ref_index":73,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale","primary_cat":"cs.LG","submitted_at":"2022-08-15T17:08:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLM.int8() performs 8-bit inference for transformers up to 175B parameters with no accuracy loss by combining vector-wise quantization for most features with 16-bit mixed-precision handling of systematic outlier dimensions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2202.08906","ref_index":62,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ST-MoE: Designing Stable and Transferable Sparse Expert Models","primary_cat":"cs.CL","submitted_at":"2022-02-17T21:39:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ST-MoE introduces stability techniques for sparse expert models, allowing a 269B-parameter model to achieve state-of-the-art transfer learning results across reasoning, summarization, and QA tasks at the compute cost of a 32B dense model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2101.03961","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity","primary_cat":"cs.LG","submitted_at":"2021-01-11T16:11:52+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Switch Transformers use top-1 expert routing in a Mixture of Experts setup to scale to trillion-parameter language models with constant compute and up to 4x speedup over T5-XXL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2006.16668","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding","primary_cat":"cs.CL","submitted_at":"2020-06-30T10:42:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GShard supplies automatic sharding and conditional computation support that enabled training a 600-billion-parameter multilingual translation model on thousands of TPUs with superior 
quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1909.08053","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","primary_cat":"cs.CL","submitted_at":"2019-09-17T19:42:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Intra-layer model parallelism in PyTorch enables training of 8.3B-parameter transformers, achieving SOTA perplexity of 10.8 on WikiText103 and 66.5% accuracy on LAMBADA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}