{"total":22,"items":[{"citing_arxiv_id":"2606.27715","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Aurora: A Leverage-Aware Spectral Optimizer","primary_cat":"cs.LG","submitted_at":"2026-06-26T04:47:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Aurora is a leverage-aware spectral optimizer that enforces uniform row norms in matrix updates while preserving Muon's polar geometry, outperforming Muon and achieving SOTA among spectral methods on modded-nanoGPT.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08783","ref_index":193,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OptMuon: Closed-Loop Orthogonalized Momentum Methods for Stochastic Optimization with Zero-Noise Optimality","primary_cat":"math.OC","submitted_at":"2026-06-07T18:59:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OptMuon combines orthogonalized momentum with trajectory-dependent AdaGrad-Norm adaptation to obtain expected-stationarity rates of order T^{-1/2} + sigma^{1/2}T^{-1/4} or T^{-1/2} + sigma^{1/3}T^{-1/3} that reduce to near-optimal deterministic first-order rates in the zero-noise regime.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27733","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Can Entry-Wise Clipping Give Spectral Control of Stochastic Gradients?","primary_cat":"cs.LG","submitted_at":"2026-05-26T22:12:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Entry-wise clipping achieves spectral control of gradients via localization under heavy-tailed contamination, with O(ε^{-4}) convergence and empirical savings on NanoGPT pretraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26977","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Convergence of Spectral Descent for Non-smooth Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-26T13:02:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Proves linear convergence of Spectral Descent (SD) and Truncated SD for non-smooth convex problems under stated conditions, sublinear rates for regularized versions via Frank-Wolfe, and recovery guarantees for robust low-rank matrix recovery.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26842","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MONA: Muon Optimizer with Nesterov Acceleration for Scalable Language Model Training","primary_cat":"cs.LG","submitted_at":"2026-05-26T10:56:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MONA integrates Nesterov acceleration into Muon's orthogonalization framework, reporting better convergence than Muon and AdamW on MoE models up to 68B parameters trained on 1T tokens and SOTA fine-tuning results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23061","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Anytime Training with Schedule-Free Spectral Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-21T21:50:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SF-NorMuon is a new schedule-free spectral optimizer that closes the gap with tuned AdamW on 125M-772M parameter models across 1-8x Chinchilla horizons while providing stationarity guarantees.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22432","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AMUSE: Anytime Muon with Stable Gradient Evaluation","primary_cat":"cs.LG","submitted_at":"2026-05-21T12:55:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AMUSE is a new optimizer integrating Muon orthogonalization with Schedule-Free averaging via adaptive interpolation for schedule-free anytime training that improves Pareto frontiers on vision and LLM tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19282","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rethinking Muon Beyond Pretraining: Spectral Failures and High-Pass Remedies for VLA and RLVR","primary_cat":"cs.LG","submitted_at":"2026-05-19T03:00:26+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Pion modifies Muon's Newton-Schulz iterations into a controllable high-pass filter that anchors dominant singular values at 1 while suppressing noisy tails, outperforming Muon and AdamW in VLA and RLVR regimes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18106","ref_index":140,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Symmetry-Compatible Principle for Optimizer Design: Embeddings, LM Heads, SwiGLU MLPs, and MoE Routers","primary_cat":"math.OC","submitted_at":"2026-05-18T09:17:26+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"On the convergence analysis of Muon.arXiv preprint arXiv:2505.23737, 2025. [139] H.-J. M. Shi, T.-H. Lee, S. Iwasaki, J. Gallego-Posada, Z. Li, K. Rangadurai, D. Mudigere, and M. Rabbat. A distributed data-parallel PyTorch implementation of the distributed Shampoo optimizer for training neural networks at-scale.arXiv preprint arXiv:2309.06497, 2023. [140] C. Si, D. Zhang, and W. Shen. AdaMuon: Adaptive Muon optimizer.arXiv preprint arXiv:2507.11005, 2025. [141] V. Singh, L. Krauss, S. Jaghouar, M. Sirovatka, C. Goddard, F. Obied, J. M. Ong, J. Straube, Fern, et al. Arcee Trinity Large technical report.arXiv preprint arXiv:2602.17004, 2026. [142] StepFun Team, A. Huang, A. Li, A. Kong, B. Wang, B. Jiao, B."},{"citing_arxiv_id":"2605.13079","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Spectral Flattening Is All Muon Needs: How Orthogonalization Controls Learning Rate and Convergence","primary_cat":"cs.LG","submitted_at":"2026-05-13T06:54:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Muon achieves faster convergence and larger stable learning rates by flattening the singular value spectrum of the momentum buffer through orthogonalization, scaling step size with average rather than maximum singular values.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11181","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Muon is Not That Special: Random or Inverted Spectra Work Just as Well","primary_cat":"cs.LG","submitted_at":"2026-05-11T19:42:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Muon succeeds by guaranteeing local step-size optimality rather than by tracking any ideal global geometry, as random-spectrum and quasi-norm variants match its performance on language models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"1 Fundamental Properties of thea/b-Method We aim in this section to generalise the results in Amsel et al. [2026] to our setup. To this end, let r, p∈Nand consider the following subsets of the polynomial ringP Podd(r) := span{x2ir+1 :i∈N},(10) Peven(r) := span{x2ir :i∈N},(11) Podd p (r) :={P∈P odd(r) : deg(P)≤2rp+ 1},(12) Peven p (r) :={P∈P even(r) : deg(P)≤2rp}.(13) Moreover, for anyn, d∈N, we also define the following sets of rational functions R(r, s) := \u001a N D :N∈P odd(r), D∈P even(s), D >0 \u001b ,(14) Rn,d(r, s) := \u001a N D :N∈P odd n (r), D∈P even d (s), D >0 \u001b .(15) E.1.1 Existence and Characterization of the Optimal Approximants. The set R(r, s) considered above can be understood as a particular subset of generalized rational"},{"citing_arxiv_id":"2605.07815","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OrScale: Orthogonalised Optimization with Layer-Wise Trust-Ratio Scaling","primary_cat":"cs.LG","submitted_at":"2026-05-08T14:47:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OrScale adds a Frobenius-norm trust-ratio layer-wise scaler to Muon’s orthogonalized updates, with per-layer calibration for language models, yielding higher CIFAR-10 accuracy and better language-model pre-training loss than Muon+Moonlight and AdamW.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[13] Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. Language models are unsupervised multitask learners.OpenAI, 2019. URL https://cdn.openai.com/better-language-models/language_models_are_ unsupervised_multitask_learners.pdf. Accessed: 2024-11-15. [14] Chongjie Si, Debing Zhang, and Wei Shen. Adamuon: Adaptive muon optimizer, 2025. URL https://arxiv.org/abs/2507.11005. [15] Ilya Sutskever, James Martens, George Dahl, and Geoffrey Hinton. On the importance of initialization and momentum in deep learning. In Sanjoy Dasgupta and David McAllester, editors,Proceedings of the 30th International Conference on Machine Learning, volume 28 of Proceedings of Machine Learning Research, pages 1139-1147, Atlanta, Georgia, USA, 17-19"},{"citing_arxiv_id":"2605.07067","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PolarAdamW: Disentangling Spectral Control and Schur Gauge-Equivariance in Matrix Optimisation","primary_cat":"cs.LG","submitted_at":"2026-05-08T00:19:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PolarAdamW disentangles spectral control from gauge-equivariance in matrix optimizers, with experiments demonstrating their distinct roles on standard versus symmetry-aware neural networks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06615","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When and Why SignSGD Outperforms SGD: A Theoretical Study Based on $\\ell_1$-norm Lower Bounds","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:32:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"SignSGD provably beats SGD by a factor of d under sparse noise via matched ℓ1-norm upper and lower bounds, with an equivalent result for Muon on matrices, and this predicts faster GPT-2 pretraining.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"introduce a projection matrix P= [I m,0 m×(n−m)]∈R m×n which pads an m×m matrix with zeros to match the m×n dimension. Let Σ∈R m×m be the target matrix noise covariance, which can be diagonalized via an orthogonal matrixQ∈R m×m asΣ=Q ⊤ diag(σ)Q. We construct our worst-case matrix objective function by extracting the diagonal elements of the transformed matrixQWP ⊤ ∈R m×m: F(W) = mX i=1 fi (QWP⊤)ii \u0001 ,(33) wheref(x) = Pm i=1 fi(xi)is the hard 1D vector instance constructed in Theorem 2. First, we provide two lemmas to establish the strict geometric equivalence between the vector domain and the matrix domain. Lemma 6.Assume that the separable vector function f(x) :R m →R satisfies Assumption 2a with bounded variance σ2. Then the constructed matrix function F(W) :R m×n →R satisfies"},{"citing_arxiv_id":"2605.05577","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Accelerating LMO-Based Optimization via Implicit Gradient Transport","primary_cat":"cs.LG","submitted_at":"2026-05-07T01:44:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LMO-IGT achieves O(ε^{-3.5}) iteration complexity for stochastic LMO optimization via implicit gradient transport with a single gradient per step and introduces the regularized support function as a unified stationarity measure.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17423","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A unified convergence theory for adaptive first-order methods in the nonconvex case, including AdaNorm, full and diagonal AdaGrad, Shampoo and Muo","primary_cat":"cs.LG","submitted_at":"2026-04-19T13:07:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A unified stochastic convergence theory is developed for adaptive preconditioned first-order methods including AdaGrad variants, Shampoo, and Muon in nonconvex optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12946","ref_index":73,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Parcae: Scaling Laws For Stable Looped Language Models","primary_cat":"cs.LG","submitted_at":"2026-04-14T16:43:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Parcae stabilizes looped LLMs via spectral norm constraints on injection parameters, enabling power-law scaling for training FLOPs and saturating exponential scaling at test time that improves quality over fixed-depth baselines under fixed parameter budgets.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Aitor Lewkowycz, Akshat Agarwal, Alethea Power, Alex Ray, Alex Warstadt, Alexander W. Kocurek, Ali Safaya, Ali Tazarv, Alice Xiang, Alicia Parrish, Allen Nie, Aman Hussain, Amanda Askell, and Amanda Dsouza et al. Beyond the imitation game: Quantifying and extrapolating the capabilities of language models, 2023. URLhttps://arxiv.org/abs/2206.04615. [73] Jianlin Su, Yu Lu, Shengfeng Pan, Ahmed Murtadha, Bo Wen, and Yunfeng Liu. Roformer: Enhanced transformer with rotary position embedding, 2023. URL https://arxiv.org/abs/ 2104.09864. [74] Sho Takase, Shun Kiyono, Sosuke Kobayashi, and Jun Suzuki. Spike no more: Stabilizing the pre-training of large language models, 2025. URLhttps://arxiv.org/abs/2312."},{"citing_arxiv_id":"2603.28254","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MuonEq: Balancing Before Orthogonalization with Lightweight Equilibration","primary_cat":"cs.LG","submitted_at":"2026-03-30T10:28:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MuonEq introduces pre-orthogonalization equilibration schemes that improve Muon optimizer performance during large language model pretraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.20527","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RMNP: Row-Momentum Normalized Preconditioning for Scalable Matrix-Based Optimization","primary_cat":"cs.LG","submitted_at":"2026-03-20T21:55:28+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RMNP preconditions matrix updates via row-wise L2 normalization instead of Newton-Schulz iteration, reducing complexity to O(mn) while matching Muon's non-convex convergence rate and empirical performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.10067","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HTMuon: Improving Muon via Heavy-Tailed Spectral Correction","primary_cat":"cs.LG","submitted_at":"2026-03-10T02:12:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HTMuon modifies Muon to produce heavier-tailed updates and weight spectra via HT-SR theory, yielding up to 0.98 lower perplexity on LLaMA pretraining and serving as a plug-in for other Muon variants.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.15816","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"On the Convergence of Muon and Beyond","primary_cat":"cs.LG","submitted_at":"2025-09-19T09:43:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Muon-MVR2 attains the optimal anytime convergence rate of ~O(T^{-1/3}) in stochastic non-convex settings under horizon-free schedules.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.11983","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Low-rank Orthogonalization for Large-scale Matrix Optimization with Applications to Foundation Model Training","primary_cat":"cs.LG","submitted_at":"2025-09-15T14:28:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Proposes low-rank orthogonalization and derives low-rank Muon and MSGD variants that outperform standard Muon on GPT-2 and LLaMA pretraining while providing iteration complexity bounds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}