{"total":22,"items":[{"citing_arxiv_id":"2606.30813","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Gradient Smoothing: Coupling Layer-wise Updates for Improved Optimization","primary_cat":"cs.LG","submitted_at":"2026-06-29T18:37:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Gradient Smoothing applies depth-wise smoothing to optimizer updates from base methods like Adam, yielding consistent gains in optimization and generalization on language, RL, diffusion, and vision tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30634","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"One-Step Gradient Delay is Not a Barrier for Large-Scale Asynchronous Pipeline Parallel LLM Pretraining","primary_cat":"cs.LG","submitted_at":"2026-06-29T17:57:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"One-step gradient delay is optimizer-dependent rather than intrinsically unstable, with Muon and error-feedback correction enabling async pipeline parallelism to match synchronous performance on models up to 10B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29176","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Dead-Direction Conditioners: Gauge-Equivariant Preconditioning for Deep Networks","primary_cat":"cs.LG","submitted_at":"2026-06-28T03:44:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Dead-Direction Conditioners provide gauge-equivariant preconditioning by conditioning optimizer state on symmetry orbits, yielding improved resistance to over-training collapse and higher detection of dead directions compared to AdamW and Muon.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27715","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Aurora: A Leverage-Aware Spectral Optimizer","primary_cat":"cs.LG","submitted_at":"2026-06-26T04:47:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Aurora is a leverage-aware spectral optimizer that enforces uniform row norms in matrix updates while preserving Muon's polar geometry, outperforming Muon and achieving SOTA among spectral methods on modded-nanoGPT.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09658","ref_index":73,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Muon Learns More Robust and Transferable Features than Adam","primary_cat":"cs.LG","submitted_at":"2026-06-08T15:42:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Muon learns more robust and transferable features than Adam and SGD, shown via corruption robustness tests, transfer experiments, layer-wise probes, effective rank measurements, and a theoretical proof on margins in a multi-component classification problem.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06418","ref_index":90,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Double Preconditioning (DoPr): Optimization for Test-Time Performance, not Validation Loss","primary_cat":"cs.LG","submitted_at":"2026-06-04T17:22:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Double preconditioning (DoPr) improves downstream task performance in test-time feedback settings without consistent gains in validation loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04662","ref_index":166,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Why Muon Outperforms Adam: A Curvature Perspective","primary_cat":"cs.LG","submitted_at":"2026-06-03T09:40:30+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Muon outperforms Adam by reducing curvature penalty via lower Normalized Directional Sharpness, as shown via Taylor approximation on LLM training and proven on stylized quadratic problems with heterogeneous curvature.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04058","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Spectral Scaling Laws of Muon","primary_cat":"cs.LG","submitted_at":"2026-06-02T11:31:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Muon momentum matrices show layer-dependent power-law scaling of stabilized singular value quantiles with model size from 77M to 2.8B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23061","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Anytime Training with Schedule-Free Spectral Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-21T21:50:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SF-NorMuon is a new schedule-free spectral optimizer that closes the gap with tuned AdamW on 125M-772M parameter models across 1-8x Chinchilla horizons while providing stationarity guarantees.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21803","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Same Architecture, Different Capacity: Optimizer-Induced Spectral Scaling Laws","primary_cat":"cs.LG","submitted_at":"2026-05-20T23:00:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The same Transformer architecture follows different spectral scaling laws under different optimizers, with Muon achieving linear hard-rank scaling on tail representations while AdamW shows weak scaling, even when perplexity is matched.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20119","ref_index":24,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Toto 2.0: Time Series Forecasting Enters the Scaling Era","primary_cat":"cs.LG","submitted_at":"2026-05-19T17:08:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Time series foundation models scale under a single training recipe, with forecast quality improving from 4M to 2.5B parameters and new SOTA results on BOOM, GIFT-Eval, and TIME benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18528","ref_index":79,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Scale-Invariant Neural Network Optimization: Norm Geometry and Heavy-Tailed Noise","primary_cat":"math.OC","submitted_at":"2026-05-18T15:13:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Establishes matching Ω and O(min{m,n} ε^-(3p-2)/(p-1)) bounds for scale-invariant spectral-norm methods under heavy-tailed noise, plus an improved O(min{m,n} ε^-(5p-3)/(2p-2)) rate via transported Scion under Hessian Lipschitz continuity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18106","ref_index":99,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Symmetry-Compatible Principle for Optimizer Design: Embeddings, LM Heads, SwiGLU MLPs, and MoE Routers","primary_cat":"math.OC","submitted_at":"2026-05-18T09:17:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes equivariant optimizer updates matched to layer symmetries for embeddings, SwiGLU MLPs, and MoE routers, with reported gains in validation loss and training stability on several language model architectures.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"row-aware operations are allowed by the layer geometry. Examples.Several recent optimizers can be interpreted through this framework. SCALE [51] applies column normalization to the EMA momentum for LM heads; under our row-vocabulary convention, this corresponds to a row-norm-based update. Other row- or column-norm-based optimizers include RMNP [36] and REG [107]. Finally,NorMuon[ 99],Muon+ [ 170], and MuonEq[ 25], which apply row-wise and/or column-wise normalization to the orthogonal polar factor of the EMA momentum, can be viewed as hybrid spectral/row-norm optimizers. 3.4 Optimizers for SwiGLU MLP Projections We next consider SwiGLU MLP projection matrices [31, 135]. Unlike ordinary linear and attention projection matrices, SwiGLU projections do not possess full bi-orthogonal symmetry."},{"citing_arxiv_id":"2605.12492","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Pion: A Spectrum-Preserving Optimizer via Orthogonal Equivalence Transformation","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:59:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Pion is an optimizer that preserves the singular values of weight matrices in LLM training by applying orthogonal equivalence transformations.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[40] Mario Lezcano-Casado and David Martınez-Rubio. Cheap orthogonal constraints in neural networks: A simple parametrization of the orthogonal and unitary group. InICML, 2019. 2, 9 [41] Jun Li, Li Fuxin, and Sinisa Todorovic. Efficient riemannian optimization on the stiefel manifold via the cayley transform.arXiv preprint arXiv:2002.01113, 2020. 5 [42] Zichong Li, Liming Liu, Chen Liang, Weizhu Chen, and Tuo Zhao. Normuon: Making muon more efficient and scalable.arXiv preprint arXiv:2510.05491, 2025. 1, 9 [43] Rongmei Lin, Weiyang Liu, Zhen Liu, Chen Feng, Zhiding Yu, James M. Rehg, Li Xiong, and Le Song. Regularizing neural networks via minimizing hyperspherical energy. InCVPR, 2020. 1 [44] Aixin Liu, Bei Feng, Bing Xue, Bingxuan Wang, Bochao Wu, Chengda Lu, Chenggang Zhao,"},{"citing_arxiv_id":"2605.12491","ref_index":161,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Elastic Attention Cores for Scalable Vision Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:59:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VECA learns effective visual representations using core-periphery attention where patches interact exclusively via a resolution-invariant set of learned core embeddings, achieving linear O(N) complexity while maintaining competitive performance.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"parameterized by trainable commuting angle matrices. InProceedings of the Computer Vision and Pattern Recognition Conference, pages 4508-4517, 2025. [160] Keller Jordan, Yuchen Jin, Vlado Boza, Jiacheng You, Franz Cesista, Laker Newhouse, and Jeremy Bernstein. Muon: An optimizer for hidden layers in neural networks, 2024. URL https://kellerjordan.github.io/posts/muon/. [161] Zichong Li, Liming Liu, Chen Liang, Weizhu Chen, and Tuo Zhao. Normuon: Making muon more efficient and scalable.arXiv preprint arXiv:2510.05491, 2025. [162] Diederik P Kingma and Jimmy Ba. Adam: A method for stochastic optimization.arXiv preprint arXiv:1412.6980, 2014. [163] Ilya Loshchilov and Frank Hutter. Decoupled weight decay regularization.arXiv preprint"},{"citing_arxiv_id":"2605.11396","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MuonQ: Enhancing Low-Bit Muon Quantization via Directional Fidelity Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-12T01:31:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MuonQ achieves stable 4-bit quantization of Muon optimizer states via pre-quantization normalization, singular component decomposition with power iteration, and μ-law companding, matching full-precision loss and accuracy on GPT and LLaMA models with up to 7.3x memory savings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07067","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PolarAdamW: Disentangling Spectral Control and Schur Gauge-Equivariance in Matrix Optimisation","primary_cat":"cs.LG","submitted_at":"2026-05-08T00:19:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PolarAdamW disentangles spectral control from gauge-equivariance in matrix optimizers, with experiments demonstrating their distinct roles on standard versus symmetry-aware neural networks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06615","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When and Why SignSGD Outperforms SGD: A Theoretical Study Based on $\\ell_1$-norm Lower Bounds","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:32:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"SignSGD provably beats SGD by a factor of d under sparse noise via matched ℓ1-norm upper and lower bounds, with an equivalent result for Muon on matrices, and this predicts faster GPT-2 pretraining.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"∥∇f(x t)∥1 # =O r ∆L∞ N + (∥σ∥1) 1 2 (∆L∞) 1 4 (N) 1 4 ! . This completes the proof. B.2 Lower Bound for SignSGD To exploit the separable nature of SignSGD before generalizing tod dimensions, we start by bounding the complexity of Algorithm 1 in finding stationary points in one dimension: Lemma 2.For any positive integern, suppose thatϵsatisfies ϵ≤ 1 2√n (20) Let x1 ∈R and xt =x 1 + (t−1)η for any 2≤t≤n+ 1 . Then there exists a function p:R→R such that: (i) p has a 1-Lipschitz gradient; (ii) p(x1)−infp≤1 (iii) p′(xt) =−ϵ for any t∈[1, n] . Proof.We construct the function p as: p(x) =    −ϵ(x−x 1)x∈(−∞, x 1]; ϕt,ϵ(x) +c t x∈(x t, xt+1]; 1 2(x−x t+1)2 −ϵ(x−x t+1) +c N+1 x∈(x n+1,∞), where the values {ct}n"},{"citing_arxiv_id":"2603.28254","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MuonEq: Balancing Before Orthogonalization with Lightweight Equilibration","primary_cat":"cs.LG","submitted_at":"2026-03-30T10:28:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MuonEq introduces pre-orthogonalization equilibration schemes that improve Muon optimizer performance during large language model pretraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.20527","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RMNP: Row-Momentum Normalized Preconditioning for Scalable Matrix-Based Optimization","primary_cat":"cs.LG","submitted_at":"2026-03-20T21:55:28+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RMNP preconditions matrix updates via row-wise L2 normalization instead of Newton-Schulz iteration, reducing complexity to O(mn) while matching Muon's non-convex convergence rate and empirical performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.10067","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HTMuon: Improving Muon via Heavy-Tailed Spectral Correction","primary_cat":"cs.LG","submitted_at":"2026-03-10T02:12:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HTMuon modifies Muon to produce heavier-tailed updates and weight spectra via HT-SR theory, yielding up to 0.98 lower perplexity on LLaMA pretraining and serving as a plug-in for other Muon variants.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.21545","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MUON+: Towards More Effective Muon via One Additional Normalization Step for LLM Pre-training","primary_cat":"cs.LG","submitted_at":"2026-02-25T04:04:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Muon+ adds one normalization step after polar orthogonalization in the Muon optimizer, yielding lower training and validation perplexity and faster pre-training across 60M-7B models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}