{"total":13,"items":[{"citing_arxiv_id":"2605.21606","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Are Teacher Tokens Reliable? Position-Weighted On-Policy Self-Distillation for Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-20T18:14:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Position-Weighted On-Policy Self-Distillation (PW-OPSD) weights later tokens more heavily after a diagnostic shows position predicts teacher reliability better than entropy, yielding +1.0 and +1.1 Avg@12 gains on AIME 2024/2025.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16826","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decoupling KL and Trajectories: A Unified Perspective for SFT, DAgger, Offline RL, and OPD in LLM Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-16T06:05:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Decoupling prefix source from token-level KL direction in autoregressive sequence KL yields four objectives unifying SFT, DAgger, offline RL and OPD, with KL mixing and entropy-gated curriculum improving math reasoning accuracy and shortening responses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05940","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Near-Policy: Accelerating On-Policy Distillation via Asynchronous Generation and Selective Packing","primary_cat":"cs.LG","submitted_at":"2026-05-07T09:50:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"NPD accelerates on-policy distillation 8.1 times faster than baselines by using 
asynchronous SFT with Δ-IFD filtering, outperforming standard SFT and enabling a 1B model to achieve 68.73% SOTA score.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14084","ref_index":8,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TIP: Token Importance in On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-04-15T16:58:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A two-axis taxonomy of student entropy and teacher-student divergence identifies informative tokens in on-policy distillation, allowing near-full performance with 10-50% of tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.11178","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PACED: Distillation and On-Policy Self-Distillation at the Frontier of Student Competence","primary_cat":"cs.AI","submitted_at":"2026-03-11T18:00:05+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PACED applies student pass-rate weighting w(p)=p(1-p) to distillation, concentrating on the zone of proximal development and delivering up to +8.2 gains on AIME tasks with reduced forgetting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.20816","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Don't Ignore the Tail: Decoupling top-K Probabilities for Efficient Language Model Distillation","primary_cat":"cs.CL","submitted_at":"2026-02-24T11:54:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A modified divergence decouples top-K teacher probabilities from the distribution 
tail during distillation, yielding competitive performance on decoder models with standard compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.10248","ref_index":184,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Step-Video-T2V Technical Report: The Practice, Challenges, and Future of Video Foundation Model","primary_cat":"cs.CV","submitted_at":"2025-02-14T15:58:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Step-Video-T2V describes a 30B-parameter text-to-video model with custom Video-VAE, 3D DiT, flow matching, and Video-DPO that claims state-of-the-art results on a new internal benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.00724","ref_index":255,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Inference Scaling Laws: An Empirical Analysis of Compute-Optimal Inference for Problem-Solving with Language Models","primary_cat":"cs.AI","submitted_at":"2024-08-01T17:16:04+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical analysis shows scaling inference compute via strategies like tree search can be more efficient than scaling model parameters, with 7B models plus novel search outperforming 34B models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.13116","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Survey on Knowledge Distillation of Large Language Models","primary_cat":"cs.CL","submitted_at":"2024-02-20T16:17:37+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A comprehensive survey of knowledge distillation 
for LLMs structured around algorithms, skill enhancement, and vertical applications, highlighting data augmentation as a key enabler.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"and ranking of responses based on the teacher's preferences. PRO (Song et al., 2023a) expands the concept of pairwise comparison to handle preference rankings of any length. For a given instruction $x$ and a sequence of responses ordered by teacher preference as $y_1 \\succ y_2 \\succ \\dots \\succ y_n$, the PRO training objective is: $\\mathcal{L}_{\\mathrm{PRO}} = -\\sum_{k=1}^{n-1} \\log \\frac{\\exp(p_k)}{\\sum_{i=k}^{n} \\exp(p_i)}$, (16) where $p_k$ represents the conditional log probabilities for $y_k$ under the student policy $\\pi_\\theta$. By iteratively contrasting the likelihood of generating responses, PRO optimizes the student LM to prioritize the most preferred response while progressively ranking the rest in the order of diminishing preference. 4 SKILL DISTILLATION Building upon the foundation laid out in Section 3 about"},{"citing_arxiv_id":"2302.01318","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Accelerating Large Language Model Decoding with Speculative Sampling","primary_cat":"cs.CL","submitted_at":"2023-02-02T18:44:11+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Speculative sampling accelerates LLM decoding 2-2.5x by letting a draft model propose short sequences that the target model scores in parallel, then applies modified rejection sampling to keep the exact target distribution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2211.15089","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Continuous diffusion for categorical 
data","primary_cat":"cs.CL","submitted_at":"2022-11-28T06:08:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper proposes CDCD, a continuous-time and continuous-space diffusion framework for categorical data, and reports results on language modeling tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1907.11804","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Memory- and Communication-Aware Model Compression for Distributed Deep Learning Inference on IoT","primary_cat":"stat.ML","submitted_at":"2019-07-26T22:17:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NoNN partitions a teacher model into disjoint compressed students via network science for distributed IoT inference, matching teacher accuracy with far lower per-device memory and communication.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1907.06017","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learn Spelling from Teachers: Transferring Knowledge from Language Models to Sequence-to-Sequence Speech Recognition","primary_cat":"eess.AS","submitted_at":"2019-07-13T06:27:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Knowledge distillation from an external RNN language model to a seq2seq ASR model yields 9.3% CER on Chinese datasets, an 18.42% relative improvement over the baseline without test-time fusion components.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}