{"total":33,"items":[{"citing_arxiv_id":"2606.23235","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A First-Order Mean Field Control Analysis of Transformer Layers under Cross-Entropy Training","primary_cat":"math.OC","submitted_at":"2026-06-22T12:21:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Transformer residual layers are approximated as an explicit Euler scheme for a controlled hidden-state flow whose mean-field limit is a first-order transport control problem with Pontryagin terminal condition given by the softmax residual.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23467","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"S$^3$GNN: Efficient Global Mixing and Local Message Passing for Long-Range Graph Learning","primary_cat":"cs.LG","submitted_at":"2026-05-22T10:26:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"S³GNN mitigates oversquashing in message-passing networks via lightweight global mixing without strong prior assumptions, yielding up to 10x error reduction and 50% fewer parameters across multiple domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23259","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Gate Residuals","primary_cat":"cs.LG","submitted_at":"2026-05-22T06:00:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Multi-Gate Residuals stabilizes activation scales in deep residual networks via multi-stream gating and attention pooling without added communication overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21724","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TBP-mHC: full expressivity for manifold-constrained hyper connections through transportation polytopes","primary_cat":"cs.LG","submitted_at":"2026-05-20T20:31:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TBP-mHC proposes parameterizations of the Birkhoff polytope via transportation polytopes that achieve exact double stochasticity for hyper-connections using only (n-1)^2 degrees of freedom.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20708","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Cross-Layer Information Routing in Diffusion Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-20T05:07:15+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17842","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SNLP: Layer-Parallel Inference via Structured Newton Corrections","primary_cat":"cs.LG","submitted_at":"2026-05-18T04:28:16+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15793","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AOT-POT: Adaptive Operator Transformation for Large-Scale PDE Pre-training","primary_cat":"cs.LG","submitted_at":"2026-05-15T09:50:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AOT-POT adaptively reshapes complex PDE solution operators via input-dependent transformations and parallel stream mixing to enable effective large-scale pre-training, yielding SOTA results on 12 benchmarks with minimal added parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18855","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Delta Attention Residuals","primary_cat":"cs.LG","submitted_at":"2026-05-13T16:05:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Delta Attention Residuals attend over per-sublayer deltas instead of cumulative hidden states, producing higher-contrast attention weights and 1.7-8.2% validation perplexity gains over standard and attention residuals across 220M-7.6B models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18848","ref_index":8,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Exact Linear Attention","primary_cat":"cs.LG","submitted_at":"2026-05-13T08:06:48+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12374","ref_index":24,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-12T16:41:09+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11526","ref_index":75,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Efficient and provably convergent end-to-end training of deep neural networks with linear constraints","primary_cat":"math.OC","submitted_at":"2026-05-12T04:51:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"An efficiently computable HS-Jacobian acts as a conservative mapping for projections onto polyhedral sets, supporting provably convergent Adam-based end-to-end training of linearly constrained deep neural networks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"ping, nonsmooth automatic differentiation 1 Introduction Designing deep neural networks (DNNs) whose outputs of selected layers satisfy certain linear constraints has become increasingly important in contemporary applications, such as deep learning based portfolio allocation [72], deep graph matching [71], and the topological architecture design of DNNs [75]. Without loss of generality, assume that the outputxof a given layer in the DNN satisfies the linear constraints P:={y∈R n |Ay≤a, By=b},(1) ∗The authors contributed equally to this work. †Corresponding author:yancheng.yuan@polyu.edu.hk. 1 arXiv:2605.11526v1 [math.OC] 12 May 2026 whereA∈R m×n, a∈R m, B∈R l×n, b∈R l are given and the matrixBis of full row rank."},{"citing_arxiv_id":"2605.11172","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Optimistic Dual Averaging Unifies Modern Optimizers","primary_cat":"cs.LG","submitted_at":"2026-05-11T19:30:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SODA unifies several modern optimizers under optimistic dual averaging and supplies a 1/k decay wrapper that improves performance without weight decay tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08300","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"mHC-SSM: Manifold-Constrained Hyper-Connections for State Space Language Models with Stream-Specialized Adapters","primary_cat":"cs.LG","submitted_at":"2026-05-08T11:37:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Manifold-constrained multi-stream mixing plus per-stream adapters improves SSM language model validation loss from 6.3507 to 6.1353 and perplexity from 572.91 to 461.88 on WikiText-2.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"mixed precision, and the output is cast back to the input dtype. Finally, the recurrence output is gated and projected: 𝑧𝑡̂ = 𝑧𝑡⨀𝜎(𝑔𝑡) (8) 𝑆𝑆𝑀𝐵𝑙𝑜𝑐𝑘(ℎ) = 𝐿𝑖𝑛𝑒𝑎𝑟𝐷(𝑧̂) (9) With dropout applied if configured. 2.2.4 Output and Loss After 𝐿 blocks, a final normalization is applied and a tied linear language modeling head produces logits ℓ = 𝐿𝑖𝑛𝑒𝑎𝑟𝑉(𝑁𝑜𝑟𝑚(ℎ𝐿)) (10) Cross-entropy loss is computed between ℓ and target tokens 𝑦 across all positions. Perplexity is computed as exp (𝑣𝑎𝑙_𝑙𝑜𝑠𝑠), a standard conversion used in language modeling. 2.3 Hyper-Connections and mHC Background The mHC paper introduces the HC formulation by expanding the residual stream width by a factor 𝑛. For the 𝑙-th layer, HC defines propagation as"},{"citing_arxiv_id":"2605.06501","ref_index":89,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cubit: Token Mixer with Kernel Ridge Regression","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:18:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Cubit replaces Transformer's attention with a closed-form Kernel Ridge Regression token mixer and reports larger gains as training sequence length increases.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[87] Mitchell Wortsman, Jaehoon Lee, Justin Gilmer, and Simon Kornblith. Replacing softmax with relu in vision transformers.arXiv preprint arXiv:2309.08586, 2023. [88] Zhenda Xie, Yixuan Wei, Huanqi Cao, Chenggang Zhao, Chengqi Deng, Jiashi Li, Damai Dai, Huazuo Gao, Jiang Chang, Kuai Yu, et al. mhc: Manifold-constrained hyper-connections.arXiv preprint arXiv:2512.24880, 2025. [89] Jingjing Xu, Xu Sun, Zhiyuan Zhang, Guangxiang Zhao, and Junyang Lin. Understanding and improving layer normalization.Advances in neural information processing systems, 32, 2019. [90] Mingyu Xu, Tenglong Ao, Jiaao He, Jianqiao Lu, Guang Shi, and Shu Zhong. Deltaformer: Unlock the state space of transformer. InThe Thirty-ninth Annual Conference on Neural"},{"citing_arxiv_id":"2605.06729","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The E$\\Delta$-MHC-Geo Transformer: Adaptive Geodesic Operations with Guaranteed Orthogonality","primary_cat":"cs.LG","submitted_at":"2026-05-07T11:37:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The EΔ-MHC-Geo Transformer achieves input-adaptive unconditionally orthogonal residual connections via a Cayley-based rotation that works for all parameters, combined with a learned hybrid gate for reflections.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05330","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Graph Normalization: Fast Binarizing Dynamics for Differentiable MWIS","primary_cat":"cs.LG","submitted_at":"2026-05-06T18:02:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Graph Normalization is a convergent dynamical system that approximates MWIS by always reaching a binary maximum independent set via majorization-minimization and evolutionary game equivalence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04421","ref_index":55,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"FLUID: Continuous-Time Hyperconnected Sparse Transformer for Sink-Free Learning","primary_cat":"cs.LG","submitted_at":"2026-05-06T02:27:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FLUID is a continuous-time transformer using Liquid Attention Networks to model attention as stable ODE solutions that interpolate between discrete SDPA and CT-RNNs, with an explicit sink gate and liquid hyper-connections for better information flow.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03953","ref_index":17,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Transformers with Selective Access to Early Representations","primary_cat":"cs.LG","submitted_at":"2026-05-05T16:38:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SATFormer uses a context-dependent gate for selective reuse of early Transformer representations, improving validation loss and zero-shot accuracy especially on retrieval benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23705","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Can an MLP Absorb Its Own Skip Connection?","primary_cat":"cs.LG","submitted_at":"2026-04-26T13:37:27+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Skip-connected MLPs and residual-free MLPs of equal width represent generically disjoint function classes for common activations, with explicit impossibility proofs and a non-generic absorption condition for ReLU and GELU.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23036","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Preserving Long-Tailed Expert Information in Mixture-of-Experts Tuning","primary_cat":"cs.LG","submitted_at":"2026-04-24T21:48:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new SFT framework for MoE models combines bias-driven sparsification with gated condenser experts to retain long-tailed expert information, outperforming DenseMixer and ESFT by over 2.5% on math reasoning and commonsense QA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21254","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Hyperloop Transformers","primary_cat":"cs.LG","submitted_at":"2026-04-23T03:46:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Hyperloop Transformers outperform standard and mHC Transformers with roughly 50% fewer parameters by looping a middle block of layers and applying hyper-connections only after each loop.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21027","ref_index":147,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HypEHR: Hyperbolic Modeling of Electronic Health Records for Efficient Question Answering","primary_cat":"cs.AI","submitted_at":"2026-04-22T19:18:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HypEHR is a hyperbolic embedding model for EHR data that uses Lorentzian geometry and hierarchy-aware pretraining to answer clinical questions nearly as well as large language models but with much smaller size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19550","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LoopCTR: Unlocking the Loop Scaling Power for Click-Through Rate Prediction","primary_cat":"cs.IR","submitted_at":"2026-04-21T15:06:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LoopCTR trains CTR models with recursive layer reuse and process supervision so that zero-loop inference outperforms baselines on public and industrial datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19147","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Nexusformer: Nonlinear Attention Expansion for Stable and Inheritable Transformer Scaling","primary_cat":"cs.LG","submitted_at":"2026-04-21T06:54:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Nexusformer uses a three-stage nonlinear mapping in attention to enable stable, inheritable scaling of transformers, matching baseline perplexity with up to 41.5% less compute when growing from 240M to 440M parameters.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"utilization, parameter stability, and downstream performance. We perform principal component analysis (PCA) on {xt}T t=1 to extract the dominant variation modes. Let the eigenvector matrix be V= [v 1, v2, v3], λ 1 ≥λ 2 ≥λ 3,(22) where λi is the eigenvalue of the i-th principal component. Empirically, the first two components explain 92.49% of the total variance: λ1 +λ 2 λ1 +λ 2 +λ 3 = 92.49%.(23) We therefore project each statex t onto the 2D subspace spanned by the first two components: zt =V ⊤ 1:2xt ∈R 2.(24) This embedding provides a compact and interpretable space for the geometric analysis below. C.1. Grassmann Manifold To quantify the evolution on the parameter manifold, we project the model states into a three-dimensional ambient space"},{"citing_arxiv_id":"2604.15069","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond the Laplacian: Doubly Stochastic Matrices for Graph Neural Networks","primary_cat":"cs.LG","submitted_at":"2026-04-16T14:33:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DsmNet substitutes Laplacian matrices with approximated doubly stochastic matrices in GNNs, using Neumann truncation and residual mass compensation to achieve O(K|E|) efficiency and bound Dirichlet energy decay for reduced over-smoothing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13556","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"YOCO++: Enhancing YOCO with KV Residual Connections for Efficient LLM Inference","primary_cat":"cs.CL","submitted_at":"2026-04-15T07:05:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"YOCO++ enhances YOCO by adding weighted residual KV connections from bottom layers, delivering state-of-the-art results among cross-layer compression methods at 50% KV cache reduction and outperforming the standard Transformer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11947","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ResBM: Residual Bottleneck Models for Low-Bandwidth Pipeline Parallelism","primary_cat":"cs.LG","submitted_at":"2026-04-13T18:40:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ResBM achieves 128x activation compression in pipeline-parallel transformer training by adding a residual bottleneck module that preserves a low-rank identity path, with no major loss in convergence or added overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.15031","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Attention Residuals","primary_cat":"cs.CL","submitted_at":"2026-03-16T09:32:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Attention Residuals replaces fixed residual summation with input-dependent softmax attention over preceding layers, and a blocked variant is shown to improve uniformity and downstream performance in a 48B-parameter model pre-trained on 1.4T tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03263","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LPC-SM: Local Predictive Coding and Sparse Memory for Long-Context Language Modeling","primary_cat":"cs.CL","submitted_at":"2026-03-12T21:21:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LPC-SM is a hybrid architecture separating local attention, persistent memory, predictive correction, and control with ONT for memory writes, showing loss reductions on 158M-parameter models up to 4096-token contexts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.13381","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Linearity in Attention Projections: The Case for Nonlinear Queries","primary_cat":"cs.LG","submitted_at":"2026-03-11T03:13:10+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.08064","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SiameseNorm: Breaking the Barrier to Reconciling Pre/Post-Norm","primary_cat":"cs.LG","submitted_at":"2026-02-08T17:17:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SiameseNorm is a two-stream architecture that reconciles Pre-Norm and Post-Norm in Transformers by coupling streams via shared residual blocks, yielding performance gains with maintained stability on language, vision, and diffusion models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.18832","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Geometric Reasoner: Manifold-Informed Latent Foresight Search for Long-Context Reasoning","primary_cat":"cs.LG","submitted_at":"2026-01-25T18:16:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TGR performs manifold-informed latent foresight search to boost trajectory coverage in long-context reasoning tasks by up to 13 AUC points with minimal overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.00417","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deep Delta Learning","primary_cat":"cs.LG","submitted_at":"2026-01-01T18:11:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Deep Delta Learning replaces additive residual updates with a gated delta-rule that selectively overwrites residual content along learned directions, improving language modeling quality over standard ResNet-style accumulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}