{"total":32,"items":[{"citing_arxiv_id":"2606.09658","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Muon Learns More Robust and Transferable Features than Adam","primary_cat":"cs.LG","submitted_at":"2026-06-08T15:42:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Muon learns more robust and transferable features than Adam and SGD, shown via corruption robustness tests, transfer experiments, layer-wise probes, effective rank measurements, and a theoretical proof on margins in a multi-component classification problem.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08574","ref_index":47,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OrderDP: A Theoretically Guaranteed Lossless Dynamic Data Pruning Framework","primary_cat":"cs.LG","submitted_at":"2026-06-07T11:11:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OrderDP is a plug-and-play data pruning method that selects a random subset then top-q samples to guarantee unbiased surrogate-loss training with convergence analysis and over 40% training cost reduction on CIFAR and ImageNet.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04662","ref_index":102,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Muon Outperforms Adam: A Curvature Perspective","primary_cat":"cs.LG","submitted_at":"2026-06-03T09:40:30+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Muon outperforms Adam by reducing curvature penalty via lower Normalized Directional Sharpness, as shown via Taylor approximation on LLM training and proven on stylized quadratic problems with heterogeneous curvature.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02100","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PortBERT: Navigating the Depths of Portuguese Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-01T11:32:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"PortBERT releases two RoBERTa models for Portuguese that match or beat prior monolingual and multilingual models on translated GLUE/SuperGLUE tasks while reporting training and inference times.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26842","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MONA: Muon Optimizer with Nesterov Acceleration for Scalable Language Model Training","primary_cat":"cs.LG","submitted_at":"2026-05-26T10:56:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MONA integrates Nesterov acceleration into Muon's orthogonalization framework, reporting better convergence than Muon and AdamW on MoE models up to 68B parameters trained on 1T tokens and SOTA fine-tuning results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22997","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scene Reconstruction as Mapping Priors for 3D Detection","primary_cat":"cs.CV","submitted_at":"2026-05-21T19:52:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Automatically constructed mapping priors from sensor aggregation are integrated via the MPA3D framework to achieve state-of-the-art 3D detection results on the Waymo Open Dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22297","ref_index":17,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"One LR Doesn't Fit All: Heavy-Tail Guided Layerwise Learning Rates for LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-21T10:46:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLR uses heavy-tailed self-regularization theory to set per-layer learning rates in Transformers, yielding faster convergence and higher zero-shot accuracy than uniform rates across model scales.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22098","ref_index":71,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TextTeacher: What Can Language Teach About Images?","primary_cat":"cs.CV","submitted_at":"2026-05-21T07:36:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TextTeacher uses frozen text embeddings from captions as semantic anchors to guide vision model training, improving ImageNet accuracy by up to 2.7 p.p. and transfer performance by 1.0 p.p. on average.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21557","ref_index":19,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scalable Reinforcement Learning via Adaptive Batch Scaling","primary_cat":"stat.ML","submitted_at":"2026-05-20T13:46:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ABS uses Behavioral Divergence to adaptively scale batch sizes in RL according to policy volatility, enabling effective large-batch large-network training on ALE benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20390","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"STELLAR: Scaling 3D Perception Large Models for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-19T18:40:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"STELLAR trains up to 500M-parameter multi-modal models on 50M driving scenes and reports empirical scaling trends plus new state-of-the-art results on the Waymo Open Dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16600","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Where Pretraining writes and Alignment reads: the asymmetry of Transformer weight space","primary_cat":"cs.LG","submitted_at":"2026-05-15T20:00:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Pretraining and alignment induce asymmetric geometric traces in transformer weights because alignment updates concentrate in read pathways due to activation covariance while write pathways inherit less structure from alignment losses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15435","ref_index":45,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Stability of Growth in Structural Plasticity","primary_cat":"cs.LG","submitted_at":"2026-05-14T21:27:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Growth during training inserts new units into a specialized trajectory, making them forward-active but backward-starved with weaker gradients than existing units.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11111","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ShardTensor: Domain Parallelism for Scientific Machine Learning","primary_cat":"cs.DC","submitted_at":"2026-05-11T18:20:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ShardTensor is a domain-parallelism system for SciML that enables flexible scaling of extreme-resolution spatial datasets by removing the constraint of batch size one per device.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"pipelines, pipeline parallelism can be a compelling option. For the high resolution challenges we seek to address in this paper, it is not necessarily a universally suitable option, and we will not discuss it further here. Finally, a number of bespoke parallelization efforts have been made that should be considered domain parallelism, including Ring Attention [54], Makani [55], and techniques in Transolver ++ [39]. While all excellent demonstrations of the power of domain parallelism at reaching higher spatial or input resolution, they are not easily extendable nor as broadly applicable to new models asShardTensor, as described below. Many of the operations and algorithms in those works have been adopted and implemented inShardTensoras"},{"citing_arxiv_id":"2605.07815","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OrScale: Orthogonalised Optimization with Layer-Wise Trust-Ratio Scaling","primary_cat":"cs.LG","submitted_at":"2026-05-08T14:47:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OrScale adds a Frobenius-norm trust-ratio layer-wise scaler to Muon’s orthogonalized updates, with per-layer calibration for language models, yielding higher CIFAR-10 accuracy and better language-model pre-training loss than Muon+Moonlight and AdamW.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06654","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Optimizer-Model Consistency: Full Finetuning with the Same Optimizer as Pretraining Forgets Less","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:57:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Full finetuning with the pretraining optimizer reduces forgetting compared to other optimizers or LoRA while achieving comparable new-task performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00650","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AdaMeZO: Adam-style Zeroth-Order Optimizer for LLM Fine-tuning Without Maintaining the Moments","primary_cat":"cs.LG","submitted_at":"2026-05-01T13:31:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AdaMeZO adapts Adam moment estimates to zeroth-order LLM fine-tuning without extra memory storage, outperforming MeZO with up to 70% fewer forward passes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03346","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Learning PDEs for Portfolio Optimization with Quantum Physics-Informed Neural Networks","primary_cat":"quant-ph","submitted_at":"2026-04-03T10:24:14+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Counterpart PINN are the coefficients of two univariate polynomialsp 1 andp 2 with degree 2, giving a total of 6 parameters. We also apply a commonly used fully connected PINN, theFC PINN, which consists of 5 hidden layers with 10 neurons each and Tanh activations, and contains 481 trainable parameters in total. Hyperparameter for all models.We use thetorch optimizer.Lamb[54] with weight decay=0,betas=(0.0, 0.0). LAMB is a Fig. 10Comparison of loss values for solving the HJB PDE in the Merton portfolio optimization, based on 10 independent runs of the QPINN, Quantum-inspired PINN, Counterpart PINN and FC PINN, each trained for 1000 epochs. Curves represent the geometric mean loss, and shaded regions denote±1 geometric standard deviation."},{"citing_arxiv_id":"2602.21545","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MUON+: Towards More Effective Muon via One Additional Normalization Step for LLM Pre-training","primary_cat":"cs.LG","submitted_at":"2026-02-25T04:04:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Muon+ adds one normalization step after polar orthogonalization in the Muon optimizer, yielding lower training and validation perplexity and faster pre-training across 60M-7B models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.18900","ref_index":128,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Foundation Models for Discovery and Exploration in Chemical Space","primary_cat":"physics.chem-ph","submitted_at":"2025-10-20T17:56:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MIST models up to 10x larger than prior work, fine-tuned on over 400 structure-property tasks, match or exceed SOTA on benchmarks and demonstrate zero-shot olfactory perception mapping consistent with hyperbolic geometry.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.04988","ref_index":70,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Adaptive Memory Momentum via a Model-Based Framework for Deep Learning Optimization","primary_cat":"cs.LG","submitted_at":"2025-10-06T16:24:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Presents a model-based proximal framework for adaptive momentum in first-order optimizers by using a two-plane approximation of the objective to dynamically set the memory coefficient online.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.12542","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PLD: A Choice-Theoretic List-Wise Knowledge Distillation","primary_cat":"cs.LG","submitted_at":"2025-06-14T15:31:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PLD recasts knowledge distillation as a weighted list-wise ranking loss under the Plackett-Luce model that optimizes a teacher-optimal class ranking and subsumes weighted cross-entropy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.23737","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"On the Convergence Analysis of Muon","primary_cat":"stat.ML","submitted_at":"2025-05-29T17:58:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Convergence analysis shows Muon outperforms gradient descent by exploiting low-rank structure in neural network Hessians.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.13196","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Physics-Inspired Optimizer: Velocity Regularized Adam","primary_cat":"cs.LG","submitted_at":"2025-05-19T14:51:40+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.23947","ref_index":104,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Spectral-Adaptive Modulation Networks for Visual Perception","primary_cat":"cs.CV","submitted_at":"2025-03-31T10:53:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPANetV2 is a vision backbone built around a new spectral-adaptive modulation mixer that outperforms prior models on ImageNet-1K classification, COCO detection, and ADE20K segmentation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.07529","ref_index":216,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Training Deep Learning Models with Norm-Constrained LMOs","primary_cat":"cs.LG","submitted_at":"2025-02-11T13:10:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Scion is a new stochastic LMO-based optimizer family that unifies existing methods, supports unconstrained problems, and delivers hyperparameter transferability plus speedups on nanoGPT training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.21316","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deep Optimizer States: Towards Scalable Training of Transformer Models Using Interleaved Offloading","primary_cat":"cs.LG","submitted_at":"2024-10-26T00:43:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Deep Optimizer States splits LLMs into subgroups and uses a performance model to schedule optimizer updates on CPU or GPU, achieving 2.5x faster iterations than prior offloading methods when integrated with DeepSpeed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.16671","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Demystifying CLIP Data","primary_cat":"cs.CV","submitted_at":"2023-09-28T17:59:56+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MetaCLIP curates balanced 400M-pair subsets from CommonCrawl that outperform CLIP data, reaching 70.8% zero-shot ImageNet accuracy on ViT-B versus CLIP's 68.3%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2205.01068","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OPT: Open Pre-trained Transformer Language Models","primary_cat":"cs.CL","submitted_at":"2022-05-02T17:49:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPT releases open decoder-only transformers up to 175B parameters that match GPT-3 performance at one-seventh the carbon cost, along with code and training logs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1910.02054","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ZeRO: Memory Optimizations Toward Training Trillion Parameter Models","primary_cat":"cs.LG","submitted_at":"2019-10-04T17:29:39+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ZeRO removes memory redundancies in parallel training to scale deep learning models to over a trillion parameters with high throughput on current hardware.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1909.11942","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations","primary_cat":"cs.CL","submitted_at":"2019-09-26T07:06:13+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ALBERT reduces BERT parameters via embedding factorization and layer sharing, adds inter-sentence coherence pretraining, and reaches SOTA on GLUE, RACE, and SQuAD with fewer parameters than BERT-large.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1909.08053","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","primary_cat":"cs.CL","submitted_at":"2019-09-17T19:42:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Intra-layer model parallelism in PyTorch enables training of 8.3B-parameter transformers, achieving SOTA perplexity of 10.8 on WikiText103 and 66.5% accuracy on LAMBADA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1907.11692","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach","primary_cat":"cs.CL","submitted_at":"2019-07-26T17:48:29+00:00","verdict":"ACCEPT","verdict_confidence":"HIGH","novelty_score":5.0,"formal_verification":"none","one_line_summary":"With better hyperparameters, more data, and longer training, an unchanged BERT-Large architecture matches or exceeds XLNet and other successors on GLUE, SQuAD, and RACE.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}