{"total":15,"items":[{"citing_arxiv_id":"2606.00628","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Robust Reasoning via Dynamic Token Selection for Distribution-Aligned Self-Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-30T09:03:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DASD dynamically selects tokens in self-distillation to keep logical corrections while suppressing stylistic noise, improving robustness on math, code, and commonsense benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27971","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Semantic Flow Regularization: Teaching LLMs to Generate Diverse Yet Coherent Responses","primary_cat":"cs.CL","submitted_at":"2026-05-27T05:05:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SFR applies conditional flow matching on future sentence embeddings as a training regularizer to increase output diversity in style-conditioned LLMs without deployment overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27881","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Retrieval, Reward, and Training Protocols: What Matters in Training Search Agents?","primary_cat":"cs.CL","submitted_at":"2026-05-27T03:04:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Controlled empirical study shows correcting Wikipedia data coverage yields larger gains than algorithm differences in LLM search agent training, with outcome-based rewards competitive.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22675","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Self-Policy Distillation via Capability-Selective Subspace Projection","primary_cat":"cs.CL","submitted_at":"2026-05-21T16:18:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Self-Policy Distillation extracts a capability subspace from model gradients on correctness tokens, projects KV activations into it for self-generation, and fine-tunes LLMs to achieve up to 13-16% gains over baselines without external signals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22263","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Tailoring Teaching to Aptitude: Direction-Adaptive Self-Distillation for LLM Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-21T10:07:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DASD improves math reasoning in LLMs by adaptively directing self-distillation based on per-token entropy to balance exploration and step accuracy, outperforming prior self-distillation and RLVR baselines on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18529","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AMR-SD: Asymmetric Meta-Reflective Self-Distillation for Token-Level Credit Assignment","primary_cat":"cs.AI","submitted_at":"2026-05-18T15:14:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AMR-SD adds a reflection bottleneck to compress diagnostic signals into self-generated hints and uses asymmetric Causal Information Gain to create sparse token-level advantage signals, outperforming baselines and preventing late-stage collapse in RLVR.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18141","ref_index":56,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Brief Overview: On-Policy Self-Distillation In Large Language Models","primary_cat":"cs.HC","submitted_at":"2026-05-18T09:47:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"This overview paper explains the conceptual foundations and design principles of On-Policy Self-Distillation for large language models from a beginner's perspective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13255","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Respecting Self-Uncertainty in On-Policy Self-Distillation for Efficient LLM Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-13T09:38:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EGRSD and CL-EGRSD advance the accuracy-length frontier in LLM reasoning by entropy-guided weighting of token-level distillation signals from the teacher.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06597","ref_index":21,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"UniSD: Towards a Unified Self-Distillation Framework for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-07T17:22:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniSD unifies self-distillation components for autoregressive LLMs and its full integrated version improves base models by 5.4 points and baselines by 2.8 points across six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05851","ref_index":65,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Hypothesis generation and updating in large language models","primary_cat":"cs.LG","submitted_at":"2026-05-07T08:24:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs exhibit Bayesian-like hypothesis updating with strong-sampling bias and an evaluation-generation gap but generalize poorly outside observed data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05040","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Preference-Based Self-Distillation: Beyond KL Matching via Reward Regularization","primary_cat":"cs.LG","submitted_at":"2026-05-06T15:31:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PBSD derives a reward-reweighted teacher distribution as the analytic optimum of a reward-regularized objective, yielding better stability and performance than KL-based self-distillation on math reasoning and tool-use tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"the optimizer is unique and can be obtained from the first-order optimality conditions. Introducing a Lagrange multiplierλfor the normalization constraint P y πy = 1, the Lagrangian is J(π, λ) = X y πyry −β X y πy log πy πteachy +λ X y πy −1 ! .(20) Differentiating with respect to each coordinateπ y gives ∂J ∂πy =r y −β \u0012 log πy πteachy + 1 \u0013 +λ= 0.(21) Here we used the identity ∂ ∂πy \u0012 πy log πy πteachy \u0013 = log πy πteachy + 1. Rearranging Eq. (21) yields log πy πteachy = ry +λ−β β ,(22) and exponentiating both sides gives πy =π teach y exp(ry/β) exp((λ−β)/β).(23) The last exponential factor is independent of y, so all coordinates share the same proportionality constant. To determine it, impose the normalization constraint:"},{"citing_arxiv_id":"2605.04542","ref_index":138,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Power Distribution Bridges Sampling, Self-Reward RL, and Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-06T06:42:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The power distribution is the target of power sampling, the closed-form solution to self-reward KL-regularized RL, and the basis for power self-distillation that matches sampling performance at lower cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01130","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Iterative Finetuning is Mostly Idempotent","primary_cat":"cs.AI","submitted_at":"2026-05-01T22:01:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Iterative self-finetuning of LLMs mostly fails to amplify seeded behavioral traits, with amplification limited to specific DPO setups and often harming coherence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24809","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Nautile-370M: Spectral Memory Meets Attention in a Small Reasoning Model","primary_cat":"cs.LG","submitted_at":"2026-04-27T08:07:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Nautile-370M is a hybrid small language model using SeqCond Attention layers alternating with transformers, with a claimed proof that the spectral operator matches full self-attention expressiveness in the continuous limit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08532","ref_index":77,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Self-Improving 4D Perception via Self-Distillation","primary_cat":"cs.CV","submitted_at":"2026-04-09T17:59:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SelfEvo enables pretrained 4D perception models to self-improve on unlabeled videos via self-distillation, delivering up to 36.5% relative gains in video depth estimation and 20.1% in camera estimation across eight benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}