{"total":68,"items":[{"citing_arxiv_id":"2607.01480","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Procedural Memory Distillation: Online Reflection for Self-Improving Language Models","primary_cat":"cs.AI","submitted_at":"2026-07-01T21:20:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PMD extracts and distills cross-episode procedural knowledge from RL rollouts into LLM policies at three abstraction levels, yielding 3.8-13.6% gains over SDPO on SCIKNOWEVAL and LIVECODEBENCH via co-evolution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30626","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DOPD: Dual On-policy Distillation","primary_cat":"cs.AI","submitted_at":"2026-06-29T17:55:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DOPD is an advantage-aware dual distillation method that dynamically assigns token supervision from either privileged teacher or student to transfer capability while mitigating non-replicable information asymmetry in on-policy distillation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30445","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Does Online Imitation Learning Help in LLM Post-Training? The Role of (Non-)Realizability Beyond Horizon","primary_cat":"cs.LG","submitted_at":"2026-06-29T15:17:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Online IL overcomes an information-theoretic bottleneck that offline IL faces in non-realizable settings even at horizon 1, under a new structural characterization of reward-relative misspecification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29863","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"KbSD: Knowledge Boundary aware Self-Distillation for Behavioral Calibration in Agentic Search","primary_cat":"cs.CL","submitted_at":"2026-06-29T06:56:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"KbSD uses a same-size hint-augmented teacher and quadrant-adaptive KL objectives to deliver dense supervision for calibrated behavior across knowledge states in agentic search.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29502","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UCOB: Learning to Utilize and Evolve Agentic Skills via Credit-Aware On-Policy Bidirectional Self-Distillation","primary_cat":"cs.AI","submitted_at":"2026-06-28T17:02:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UCOB improves agentic RL by using return-to-go comparisons between skill-conditioned and no-skill prompts as local teachers for bidirectional self-distillation and skill memory updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28562","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SEAD: Competence-Aware On-Policy Distillation via Entropy-Guided Supervision","primary_cat":"cs.CL","submitted_at":"2026-06-26T19:41:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SEAD applies entropy-guided token selection, KL annealing, and easy-to-hard curriculum to on-policy distillation and reports +4.8 average accuracy gain over vanilla OPD on six math benchmarks with OLMo-3 models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28166","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tandem Reinforcement Learning with Verifiable Rewards","primary_cat":"cs.AI","submitted_at":"2026-06-26T15:00:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TRL extends tandem training to RLVR pipelines, matching GRPO solo reasoning on Qwen3-4B math tasks while improving handoff robustness, reducing distributional drift, and increasing CoT legibility for the junior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26790","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OPID: On-Policy Skill Distillation for Agentic Reinforcement Learning","primary_cat":"cs.CL","submitted_at":"2026-06-25T09:24:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"OPID distills episode- and step-level skills from completed on-policy trajectories, routes them via critical-first mechanism, and combines the resulting log-probability shift advantage with outcome advantage for policy optimization in language agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25319","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"V-Zero: Answer-Label-Free On-Policy Distillation with Contrastive Evidence Gating for Fine-Grained Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-24T02:32:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"V-Zero trains MLLMs for visual reasoning without answer labels by gating on-policy distillation trajectories using contrastive evidence from relevant versus negative image crops.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24143","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AsyncOPD: How Stale Can On-Policy Distillation Be?","primary_cat":"cs.LG","submitted_at":"2026-06-23T04:50:49+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AsyncOPD shows asynchronous OPD training reaches 1.6-3.8x higher throughput than synchronous baselines with comparable accuracy by using forward-KL estimators and multi-sample Monte Carlo correction for finite teacher caches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24084","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Blockwise Policy-Drift Gating for On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-23T02:58:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Blockwise policy-drift gating raises mean pass@8 from 0.4978 to 0.5160 on four math benchmarks by reweighting OPD losses with detached mean-normalized gates from student policy drift over 64-token blocks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23104","ref_index":73,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReNIO: Reweighting Negative Trajectory Importance for LLM On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-22T09:46:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReNIO reweights negative student-generated trajectories in LLM on-policy distillation using probability ratios, reporting relative gains up to 10% on reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22830","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Finding the Evidence: Discovering Decision-Supporting Tokens for On-Policy Reasoning Distillation","primary_cat":"cs.AI","submitted_at":"2026-06-22T04:13:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DEAR identifies decision tokens via entropy and evidence tokens via cosine similarity plus divergence to improve on-policy reasoning distillation over standard methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22793","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Formula-Driven Survey and Research Agenda for On-Policy Distillation","primary_cat":"cs.AI","submitted_at":"2026-06-22T03:09:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A survey creates a taxonomy for on-policy distillation in LLMs that separates temporal credit assignment from vocabulary-level probability routing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22600","ref_index":19,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Position Bias of On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-21T17:20:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Position bias in on-policy distillation degrades later-token supervision; IW-OPD weights tokens by accumulated discrepancy, yielding faster convergence and up to 6.9 point gains on AIME-2025.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21994","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Prefix-Guided On-Policy Distillation: Mining Golden Trajectories from Rollouts","primary_cat":"cs.LG","submitted_at":"2026-06-20T11:18:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PG-OPD uses early prefix overlap to selectively continue only high-compatibility rollouts in on-policy distillation, reporting up to 4.8 accuracy points gained and 2.46x less training time on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19659","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SAGE-OPD: Selective Agent-Guided Intervention for Multi-Turn On-Policy Distillation","primary_cat":"cs.CL","submitted_at":"2026-06-17T23:58:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SAGE-OPD improves multi-turn OPD via turn-level selective intervention, teacher-confidence weighting, and loss normalization, reporting up to 13.3% relative gain in ALFWorld unseen success rate over standard OPD.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18216","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Zone of Proximal Policy Optimization: Teacher in Prompts, Not Gradients","primary_cat":"cs.CL","submitted_at":"2026-06-16T17:46:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ZPPO improves distillation to small vision-language models by using binary and negative candidate prompts plus a replay buffer for hard questions, outperforming standard distillation and GRPO on a 31-benchmark suite with largest gains at the 0.8B scale.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18195","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning from the Self-future: On-policy Self-distillation for dLLMs","primary_cat":"cs.CL","submitted_at":"2026-06-16T17:24:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"d-OPSD reframes on-policy self-distillation for dLLMs via suffix conditioning from self-generated answers and step-level supervision, outperforming RLVR and SFT on reasoning benchmarks with ~10% of the optimization steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17199","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PowerOPD: Stabilizing On-Policy Distillation with Bounded Power Transformation","primary_cat":"cs.LG","submitted_at":"2026-06-15T18:37:51+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PowerOPD applies the Box-Cox power transformation to create natively bounded, sign-consistent rewards for on-policy distillation, delivering up to +6.37 Avg@8 gains over vanilla OPD on math reasoning benchmarks while cutting compute costs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13657","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dense Supervision, Sparse Updates: On the Sparsity and Geometry of On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-11T17:54:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"On-policy distillation produces coordinate-sparse, FFN-heavy updates that are full-rank but spectrally concentrated away from principal singular subspaces and near-zero source weights.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12634","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Keep Policy Gradient in Charge: Sibling-Guided Credit Distillation for Long-Horizon Tool-Use Agents","primary_cat":"cs.LG","submitted_at":"2026-06-10T19:53:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SGCD improves held-out scores on AppWorld and tau^3-airline by using LLM-summarized sibling contrasts to reshape GRPO advantages while keeping policy gradient in charge of the actor update.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12507","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rubric-Guided Self-Distillation: Post-Training Without Rubric Verifiers","primary_cat":"cs.LG","submitted_at":"2026-06-10T17:53:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RGSD distills rubric-conditioned teacher distributions into base policies token-by-token, matching GRPO rubric satisfaction on Qwen models with one rollout and zero verifier calls.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10385","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Absolute Imitation: Anchored Residual Guidance for Privileged On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-09T03:51:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AR-OPD disentangles privileged supervision via anchored residual guidance to reduce hindsight leakage in on-policy distillation, reporting gains of 2.3 points over full privileged OPD and 7.9 over SFT on reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09471","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Escaping the KL Agreement Trap in On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-08T13:28:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"KAT detects persistent low-KL agreement traps in on-policy distillation via a dynamic threshold to filter weak supervision, improving avg@k by 2.66% and pass@k by 3.43% on four math benchmarks while shortening rollouts by 59.73%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09348","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PBSD: Privileged Bayesian Self-Distillation for Long-Horizon Credit Assignment","primary_cat":"cs.LG","submitted_at":"2026-06-08T11:20:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PBSD derives autoregressive turn-level credit signals from outcome rewards via the posterior-to-prior ratio converted through Bayes' rule between student and privileged teacher models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09076","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Scalar Rewards by Internalizing Reasoning into Score Distributions","primary_cat":"cs.CV","submitted_at":"2026-06-08T06:20:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Z-Reward trains a 27B reasoning teacher VLM on score distributions via GDSO and distills it via RISD into a 9B student, reaching 89.6% and 88.6% human preference accuracy with 41.3% optimization gain over SFT baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07082","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Geometry of On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-05T09:20:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPD updates occupy a relaxed off-principal regime and rapidly lock into a low-dimensional subspace that is functionally sufficient for its performance, distinct from SFT and RLVR trajectories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07000","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Teaching the Way, Not the Answer: Privileged Tutoring Distillation for Multimodal Policy Optimization","primary_cat":"cs.AI","submitted_at":"2026-06-05T07:43:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PTD-PO supplies step-wise token-distribution supervision to student policies via in-context privileged hints derived from spatial attention and intermediate reasoning, while keeping the student in an answer-free context and using Top-K Jensen-Shannon divergence for stable alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06021","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OPRD: On-Policy Representation Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-04T11:13:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPRD performs distillation in hidden-state space on on-policy data for deterministic gradients and better math benchmark performance, plus OPRD-Bridge for cross-architecture transfer via low-rank projectors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05718","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ViCuR: Visual Cues as Recoverable Privilege for Multimodal On-Policy Distillation","primary_cat":"cs.CV","submitted_at":"2026-06-04T05:18:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ViCuR introduces recoverable visual cues as teacher privilege in multimodal on-policy distillation, yielding +1.19 to +1.24 average gains over answer-based baselines across seven benchmarks with Qwen3-VL students.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05152","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforcement Learning from Rich Feedback with Distributional DAgger","primary_cat":"cs.LG","submitted_at":"2026-06-03T17:54:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DistIL applies distributional DAgger with forward cross-entropy to achieve monotonic policy improvement and better Pass@N from rich feedback in RL for reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04703","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Continual Experience Internalization for Self-Evolving LLM Agents","primary_cat":"cs.CL","submitted_at":"2026-06-03T10:30:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Existing methods for turning LLM interaction experience into parametric skills collapse over multiple iterations; principle-level experience, step-wise injection, and off-policy teacher distillation yield more stable continual learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04694","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DuDi: Dual-Signal Distillation with Cross-Lingual Verbalizer","primary_cat":"cs.CL","submitted_at":"2026-06-03T10:23:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DuDi is a dual-signal distillation method with cross-lingual verbalizer that improves multilingual SLM performance on SEA languages and outperforms baselines on SEA-HELM.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03620","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Physics-Guided Policy Optimization with Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-02T13:20:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"PGPO modulates per-step trust in self-distilled updates via a mutual-information estimate derived from a viscous-fluid analogy, preserves SGD weak-approximation order, and reports gains of up to 4.5 points on Science-QA while avoiding late-training collapse.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03532","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Should the Teacher Move? Temporal Coupling and Stability in Self On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-02T11:54:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Isolation periods between teacher updates stabilize self on-policy distillation, and a consolidation-gated refresh rule eliminates collapse across four tasks without per-task retuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02684","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Filter, Then Reweight: Rethinking Optimization Granularity in On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-01T17:58:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FiRe-OPD introduces a two-stage filter-then-soft-reweight procedure for trajectory- and token-level supervision in on-policy distillation, claiming gains over prior token-level methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02530","ref_index":67,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SafeSteer: Localized On-Policy Distillation for Efficient Safety Alignment","primary_cat":"cs.AI","submitted_at":"2026-06-01T17:38:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SafeSteer restricts reverse KL penalty to safety tokens selected via activation steering, achieving strong safety on seven benchmarks with minimal degradation on five capability benchmarks using only 100 harmful samples and no general data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01476","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniOPD: Logit-Free On-Policy Distillation via Speculative Verification","primary_cat":"cs.LG","submitted_at":"2026-05-31T22:31:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniOPD replaces token-level logit matching in on-policy distillation with Monte Carlo chunk-level semantic verification and a peak-entropy scheduler.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00755","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Internalize the Temperature: On-Policy Self-Distillation as Policy Reheater for Reinforcement Learning","primary_cat":"cs.CL","submitted_at":"2026-05-30T14:44:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"TS-OPSD internalizes temperature via on-policy self-distillation to reheat entropy-collapsed RL policies in LLMs, providing stronger initialization for further training than continued RL or rollout temperature adjustment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31490","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Are Full Rollouts Necessary for On-Policy Distillation?","primary_cat":"cs.CL","submitted_at":"2026-05-29T16:12:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Truncated and progressively lengthening rollouts in on-policy distillation match full-rollout performance on mathematical reasoning while using as little as 10% of the horizon and improving efficiency up to 3x.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31159","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Trust-Region Behavior Blending for On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-29T11:06:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TRB introduces a KL-trust-region warmup for on-policy distillation that blends toward teacher behavior early in training and anneals to zero, reporting the highest average performance across two math-reasoning distillation experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30070","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Predictive Law for On-Policy Self-Distillation From World Feedback","primary_cat":"cs.LG","submitted_at":"2026-05-28T15:17:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A linear relationship between initial student-self-teacher performance gap and OPSD improvement provides a predictive law across contexts and model families.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28139","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Data-Efficient On-Policy Distillation for Automatic Speech Recognition","primary_cat":"cs.AI","submitted_at":"2026-05-27T08:22:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"On-policy distillation from a Qwen-ASR teacher improves a 0.6B Ark-ASR model over supervised fine-tuning and a same-scale baseline on four of five ASR benchmarks using 100k hours of speech.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27115","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Counteraction-Aware Multi-Teacher On-Policy Distillation for General Capability Recovery with Domain Preservation","primary_cat":"cs.AI","submitted_at":"2026-05-26T14:52:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CaMOPD recovers general capabilities in domain-specialized LLMs via alternating training and gap-based sample selection in multi-teacher on-policy distillation while preserving domain behavior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22675","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Policy Distillation via Capability-Selective Subspace Projection","primary_cat":"cs.CL","submitted_at":"2026-05-21T16:18:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Self-Policy Distillation extracts a capability subspace from model gradients on correctness tokens, projects KV activations into it for self-generation, and fine-tunes LLMs to achieve up to 13-16% gains over baselines without external signals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22263","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tailoring Teaching to Aptitude: Direction-Adaptive Self-Distillation for LLM Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-21T10:07:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DASD improves math reasoning in LLMs by adaptively directing self-distillation based on per-token entropy to balance exploration and step accuracy, outperforming prior self-distillation and RLVR baselines on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22166","ref_index":56,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Adapting the Interface, Not the Model: Runtime Harness Adaptation for Deterministic LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-05-21T08:36:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Life-Harness evolves reusable interventions from training trajectories to enhance frozen LLM agents on unseen tasks across seven deterministic environments, yielding 88.5% average relative improvement in 116 of 126 model-environment settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18141","ref_index":27,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Brief Overview: On-Policy Self-Distillation In Large Language Models","primary_cat":"cs.HC","submitted_at":"2026-05-18T09:47:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"This overview paper explains the conceptual foundations and design principles of On-Policy Self-Distillation for large language models from a beginner's perspective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17862","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"$\\boldsymbol{f}$-OPD: Stabilizing Long-Horizon On-Policy Distillation with Freshness-Aware Control","primary_cat":"cs.LG","submitted_at":"2026-05-18T05:14:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"f-OPD decomposes on-policy distillation drift into rollout and supervision components, then applies a sample-level freshness score to adaptively limit stale data influence and stabilize long-horizon agent training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}