{"total":12,"items":[{"citing_arxiv_id":"2606.01117","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HASTE: Hardware-Aware Dynamic Sparse Training for Large Output Spaces","primary_cat":"cs.LG","submitted_at":"2026-05-31T09:25:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HASTE proposes group-shared fixed fan-in sparsity and dense-sparse output decomposition to deliver up to 25x backward speedup and near-dense precision in large-scale XMC.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02608","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pruning Deep Neural Networks via the Marchenko--Pastur Distribution","primary_cat":"cs.LG","submitted_at":"2026-05-23T19:44:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Marchenko-Pastur random-matrix pruning of DNNs yields theoretical certificates for accuracy preservation under small fine-tuning and empirical ImageNet results with 50-60% MAC reduction and sub-2pp accuracy drops on ViT and CNN models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06402","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SparseForge: Efficient Semi-Structured LLM Sparsification via Annealing of Hessian-Guided Soft-Mask","primary_cat":"cs.LG","submitted_at":"2026-05-07T15:11:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SparseForge achieves 57.27% zero-shot accuracy on LLaMA-2-7B at 2:4 sparsity using only 5B retraining tokens, beating the dense baseline and nearly matching a 40B-token SOTA method.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10627","ref_index":85,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Computational Lesions in Multilingual Language Models Separate Shared and Language-specific Brain Alignment","primary_cat":"cs.CL","submitted_at":"2026-04-12T13:06:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Lesioning a shared core in multilingual LLMs drops whole-brain fMRI encoding correlation by 60.32%, while language-specific lesions selectively weaken predictions only for the matched native language.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10560","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Heterogeneous Connectivity in Sparse Networks: Fan-in Profiles, Gradient Hierarchy, and Topological Equilibria","primary_cat":"cs.LG","submitted_at":"2026-04-12T10:04:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Arbitrary heterogeneous fan-in profiles in sparse networks match uniform random accuracy at high sparsity, but initializing RigL dynamic sparse training with equilibrium-matched lognormal profiles improves performance by up to 0.49% on classification tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.24558","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Probabilistic Computers for Neural Quantum States","primary_cat":"quant-ph","submitted_at":"2025-12-31T01:42:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FPGA probabilistic computers speed up sampling for neural quantum states, delivering accurate energies on 80x80 Ising lattices and training deep models on 30x30 systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.12448","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Optimized Architectures for Kolmogorov-Arnold Networks","primary_cat":"cs.LG","submitted_at":"2025-12-13T20:14:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Overprovisioned KANs with sparsification, deep supervision, and depth selection under differentiable MDL yield smaller models with competitive accuracy on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.25606","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Effective Model Pruning: Measure The Redundancy of Model Components","primary_cat":"cs.LG","submitted_at":"2025-09-30T00:01:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EMP maps importance scores to effective sample size N_eff and prunes the lowest N - N_eff components, with a derived lower bound on retained effective mass and upper bound on loss increase.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.15077","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty","primary_cat":"cs.LG","submitted_at":"2024-01-26T18:59:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EAGLE resolves feature-level uncertainty in speculative sampling via one-step token advancement, delivering 2.7x-3.5x speedup on LLaMA2-Chat 70B and doubled throughput across multiple model families and tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.01801","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Model Tells You What to Discard: Adaptive KV Cache Compression for LLMs","primary_cat":"cs.CL","submitted_at":"2023-10-03T05:17:08+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FastGen adaptively compresses LLM KV caches via lightweight attention profiling: evicting long-range contexts on local heads, non-special tokens on special-token heads, and retaining full caches on broad-attention heads, yielding substantial memory savings with negligible quality loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.02277","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Junk DNA Hypothesis: Pruning Small Pre-Trained Weights Irreversibly and Monotonically Impairs \"Difficult\" Downstream Tasks in LLMs","primary_cat":"cs.LG","submitted_at":"2023-09-29T22:55:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Pruning small-magnitude weights from pre-trained LLMs causes monotonic irreversible performance degradation on difficult downstream tasks, supporting the Junk DNA Hypothesis that these weights hold essential knowledge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2204.02311","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PaLM: Scaling Language Modeling with Pathways","primary_cat":"cs.CL","submitted_at":"2022-04-05T16:11:45+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PaLM 540B demonstrates continued scaling benefits by setting new few-shot SOTA results on hundreds of benchmarks and outperforming humans on BIG-bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}