{"total":50,"items":[{"citing_arxiv_id":"2605.13724","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AnyFlow: Any-Step Video Diffusion Model with On-Policy Flow Map Distillation","primary_cat":"cs.CV","submitted_at":"2026-05-13T16:06:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"AnyFlow enables any-step video diffusion by distilling flow-map transitions over arbitrary time intervals with on-policy backward simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13643","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Prefix Teach, Suffix Fade: Local Teachability Collapse in Strong-to-Weak On-Policy Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-13T15:05:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Local teachability collapse in trajectory suffixes makes uniform dense supervision suboptimal in strong-to-weak OPD; truncating at BIC-style change points on teacher margin improves performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13255","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Respecting Self-Uncertainty in On-Policy Self-Distillation for Efficient LLM Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-13T09:38:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EGRSD and CL-EGRSD advance the accuracy-length frontier in LLM reasoning by entropy-guided weighting of token-level distillation signals from the teacher.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12913","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Revisiting DAgger in the Era of LLM-Agents","primary_cat":"cs.LG","submitted_at":"2026-05-13T02:40:28+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DAgger-style training with turn-level policy interpolation raises 4B and 8B LLM agents to 27.3% and 29.8% on SWE-bench Verified, beating several larger published systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12798","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Emergent and Subliminal Misalignment Through the Lens of Data-Mediated Transfer","primary_cat":"cs.LG","submitted_at":"2026-05-12T22:27:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Emergent and subliminal misalignment in LLMs arise from data structure interactions and transfer via benign distillation data, with stronger effects under shared functional structure and on-policy settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12741","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning with Rare Success but Rich Feedback via Reflection-Enhanced 
Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-12T20:46:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RESD turns failure trajectories into token-level supervision via retrospective reflections and a persistent global playbook, enabling faster improvement than standard self-distillation or GRPO with only one rollout per prompt.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12652","ref_index":71,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Rollout On-Policy Distillation via Peer Successes and Failures","primary_cat":"cs.LG","submitted_at":"2026-05-12T18:57:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MOPD improves on-policy distillation for LLMs by using peer successes for positive patterns and failures for negative examples to create more informative teacher signals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12227","ref_index":97,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Combining On-Policy Optimization and Distillation for Long-Context Reasoning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-12T15:04:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"dGRPO merges outcome-based policy optimization with dense teacher guidance from on-policy distillation, yielding more stable long-context reasoning on the new LongBlocks synthetic dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12004","ref_index":80,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning Agentic Policy from Action Guidance","primary_cat":"cs.CL","submitted_at":"2026-05-12T11:54:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ActGuide-RL uses human action data as plan-style guidance in mixed-policy RL to overcome exploration barriers in LLM agents, matching SFT+RL performance on search benchmarks without cold-start training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11853","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GEAR: Granularity-Adaptive Advantage Reweighting for LLM Agents via Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-12T09:38:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GEAR reshapes GRPO trajectory advantages using divergence signals from a ground-truth-conditioned teacher to create adaptive token- and segment-level credit regions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11706","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GRAFT: Graph-Tokenized LLMs for Tool Planning","primary_cat":"cs.LG","submitted_at":"2026-05-12T07:59:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GRAFT internalizes tool dependency graphs via dedicated 
special tokens in LLMs and applies on-policy context distillation to achieve higher exact sequence matching and dependency legality than prior external-graph methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11651","ref_index":37,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hide to See: Reasoning-prefix Masking for Visual-anchored Thinking in VLM Distillation","primary_cat":"cs.CV","submitted_at":"2026-05-12T07:14:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A reasoning-prefix masking strategy during VLM distillation encourages students to anchor their thinking on visual evidence, yielding better multimodal reasoning than prior distillation baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11613","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Generic Correlation to Input-Specific Credit in On-Policy Self Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-12T06:43:17+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Self-distillation token rewards measure input-response-feedback pointwise mutual information, and CREDIT extracts the input-specific component with contrastive baselines to improve LLM reasoning performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11609","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Anti-Self-Distillation for Reasoning RL via Pointwise Mutual Information","primary_cat":"cs.LG","submitted_at":"2026-05-12T06:40:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Anti-Self-Distillation reverses self-distillation signals via PMI to fix overconfidence on structural tokens, matching GRPO baseline accuracy 2-10x faster with up to 11.5 point gains across 4B-30B models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11505","ref_index":48,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Selective Off-Policy Reference Tuning with Plan Guidance","primary_cat":"cs.AI","submitted_at":"2026-05-12T04:25:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SORT turns all-wrong prompts into selective learning signals by weighting tokens more predictable under plan guidance from reference solutions, improving over GRPO on reasoning benchmarks especially for weaker models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11458","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Adaptive Teacher Exposure for Self-Distillation in LLM Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-12T03:15:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ATESD makes teacher exposure to reference reasoning a learnable control variable via a Beta-policy optimized on future student improvement, yielding gains of up to +2.33 points over fixed-exposure 
self-distillation on AIME and HMMT math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10889","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unmasking On-Policy Distillation: Where It Helps, Where It Hurts, and Why","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:33:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Distillation signals align better with ideal updates on incorrect student rollouts than correct ones, with optimal teacher context depending on student capacity and task.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10781","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rebellious Student: Reversing Teacher Signals for Reasoning Exploration with Self-Distilled RLVR","primary_cat":"cs.LG","submitted_at":"2026-05-11T16:16:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RLRT augments GRPO by reinforcing tokens on correct student rollouts that the teacher would not have predicted, outperforming standard self-distillation and exploration baselines on Qwen3 models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10194","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TRACE: Distilling Where It Matters via Token-Routed Self On-Policy Alignment","primary_cat":"cs.AI","submitted_at":"2026-05-11T08:45:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TRACE improves math reasoning by distilling only on annotator-marked critical spans with forward KL on correct key spans, optional reverse KL on errors, and GRPO elsewhere, gaining 2.76 points over GRPO while preserving OOD performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10189","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ProteinOPD: Towards Effective and Efficient Preference Alignment for Protein Design","primary_cat":"cs.LG","submitted_at":"2026-05-11T08:38:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ProteinOPD uses token-level on-policy distillation from multiple preference-specific teacher models into a shared student to balance competing objectives in protein design, delivering gains on targets without losing designability and an 8x speedup over RL baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09725","ref_index":55,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On-Policy Distillation with Best-of-N Teacher Rollout Selection","primary_cat":"cs.CV","submitted_at":"2026-05-10T19:49:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BRTS improves on-policy distillation by sampling multiple teacher rollouts and selecting the best one via a correctness-first then alignment priority rule, yielding gains on AIME and AMC math 
benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09536","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TAD: Temporal-Aware Trajectory Self-Distillation for Fast and Accurate Diffusion LLM","primary_cat":"cs.CL","submitted_at":"2026-05-10T13:38:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TAD improves the accuracy-parallelism trade-off in diffusion LLMs via temporal-aware self-distillation that applies hard labels to soon-to-be-decoded tokens and soft supervision to future tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09253","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Cornerstones or Stumbling Blocks? Deciphering the Rock Tokens in On-Policy Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-10T01:41:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Persistent 'Rock Tokens' in on-policy distillation resist teacher corrections, consume large gradient norms, yet add negligible value to reasoning, allowing targeted bypassing to streamline alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08776","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reasoning Compression with Mixed-Policy Distillation","primary_cat":"cs.AI","submitted_at":"2026-05-09T08:04:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Mixed-Policy Distillation transfers concise reasoning behavior from larger to smaller LLMs by having the teacher compress student-generated trajectories, cutting token usage up to 27% while raising benchmark scores.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08401","ref_index":80,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AIPO: : Learning to Reason from Active Interaction","primary_cat":"cs.CL","submitted_at":"2026-05-08T19:06:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AIPO trains LLMs to expand their reasoning capability boundary via active multi-agent interaction with Verify, Knowledge, and Reasoning agents during RLVR, using importance sampling and clipping to handle feedback, then drops the agents at inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07977","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Self-Play Enhancement via Advantage-Weighted Refinement in Online Federated LLM Fine-Tuning with Real-Time Feedback","primary_cat":"cs.LG","submitted_at":"2026-05-08T16:35:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SPEAR enables online federated LLM fine-tuning by using feedback-guided self-play to create contrastive pairs trained with maximum likelihood on correct completions and confidence-weighted unlikelihood on incorrect ones, outperforming baselines without ground-truth 
contexts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07725","ref_index":61,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SOD: Step-wise On-policy Distillation for Small Language Model Agents","primary_cat":"cs.CL","submitted_at":"2026-05-08T13:30:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SOD reweights on-policy distillation strength step-by-step using divergence to stabilize tool use in small language model agents, yielding up to 20.86% gains and 26.13% on AIME 2025 for a 0.6B model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07315","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LaTER: Efficient Test-Time Reasoning via Latent Exploration and Explicit Verification","primary_cat":"cs.CL","submitted_at":"2026-05-08T06:23:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LaTER reduces LLM token usage 16-33% on reasoning benchmarks by exploring in latent space then switching to explicit CoT verification, with gains like 70% to 73.3% on AIME 2025 in the training-free version.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06597","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"UniSD: Towards a Unified Self-Distillation Framework for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-07T17:22:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UniSD unifies complementary self-distillation mechanisms for autoregressive LLMs and achieves up to +5.4 point gains over base models and +2.8 over baselines across six benchmarks and six models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06094","ref_index":53,"ref_count":3,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VISD: Enhancing Video Reasoning via Structured Self-Distillation","primary_cat":"cs.CV","submitted_at":"2026-05-07T12:13:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VISD adds structured privileged feedback from a judge model and a direction-magnitude decoupling trick to let VideoLLMs learn token-level credit assignment while keeping RL stable, yielding higher accuracy and roughly 2x faster convergence on video reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05204","ref_index":116,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"D-OPSD: On-Policy Self-Distillation for Continuously Tuning Step-Distilled Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-05-06T17:59:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"D-OPSD enables continuous supervised fine-tuning of few-step diffusion models via on-policy self-distillation where the model acts as both teacher (multimodal context) and student (text-only context) on its own 
roll-outs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05040","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Preference-Based Self-Distillation: Beyond KL Matching via Reward Regularization","primary_cat":"cs.LG","submitted_at":"2026-05-06T15:31:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PBSD derives a reward-reweighted teacher distribution as the analytic optimum of a reward-regularized objective, yielding better stability and performance than KL-based self-distillation on math reasoning and tool-use tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02971","ref_index":1,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Multilingual Safety Alignment via Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-03T14:22:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MSD enables cross-lingual safety transfer in LLMs via self-distillation with Dual-Perspective Safety Weighting, improving safety in low-resource languages without target response data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01347","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MAD-OPD: Breaking the Ceiling in On-Policy Distillation via Multi-Agent Debate","primary_cat":"cs.CL","submitted_at":"2026-05-02T09:41:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MAD-OPD recasts on-policy distillation teachers as a debating collective to supply better supervision, lifting agentic and code performance over single-teacher OPD across multiple model sizes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27083","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Co-Evolving Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-04-29T18:24:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoPD integrates multiple expert capabilities by running parallel RLVR training with bidirectional online policy distillation among experts, outperforming mixed RLVR and sequential OPD while surpassing domain-specific experts on text-image-video reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24005","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TCOD: Exploring Temporal Curriculum in On-Policy Distillation for Multi-turn Autonomous Agents","primary_cat":"cs.LG","submitted_at":"2026-04-27T03:38:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TCOD stabilizes on-policy distillation for multi-turn agents via temporal curriculum on trajectory depth, improving performance up to 18 points over vanilla OPD and sometimes surpassing the 
teacher.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23333","ref_index":88,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Process Supervision of Confidence Margin for Calibrated LLM Reasoning","primary_cat":"cs.LG","submitted_at":"2026-04-25T14:40:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RLCM trains LLMs with a margin-enhanced process reward that widens the gap between correct and incorrect reasoning steps, improving calibration on math, code, logic, and science tasks without hurting accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23318","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Hidden States Know Where Reasoning Diverges: Credit Assignment via Span-Level Wasserstein Distance","primary_cat":"cs.CL","submitted_at":"2026-04-25T14:11:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Span-level Wasserstein distances between hidden-state distributions of correct and incorrect rollouts provide a self-supervised signal to reweight advantages in GRPO, improving fine-grained credit assignment on math and code tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20733","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Near-Future Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-04-22T16:20:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NPO uses a policy's own near-future checkpoint as auxiliary trajectories to maximize effective learning signal S = Q/V, improving performance from 57.88 to 63.15 on Qwen3-VL-8B-Instruct with GRPO while accelerating convergence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15577","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reward Weighted Classifier-Free Guidance as Policy Improvement in Autoregressive Models","primary_cat":"cs.LG","submitted_at":"2026-04-16T23:13:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reward-weighted classifier-free guidance approximates Q-function policy improvement in autoregressive models, enabling test-time reward optimization and faster RL convergence via distillation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14054","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"$\\pi$-Play: Multi-Agent Self-Play via Privileged Self-Distillation without External Data","primary_cat":"cs.LG","submitted_at":"2026-04-15T16:34:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"π-Play uses self-generated question construction paths as privileged information in multi-agent self-distillation to convert sparse-reward self-play into a dense-feedback loop, surpassing supervised search agents and improving efficiency 2-3× over standard 
self-play.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13010","ref_index":11,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Lightning OPD: Efficient Post-Training for Large Reasoning Models with Offline On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-04-14T17:44:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Lightning OPD is an offline on-policy distillation method that matches standard OPD performance at 4x efficiency by enforcing teacher consistency between SFT and distillation phases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10674","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Skill-SD: Skill-Conditioned Self-Distillation for Multi-turn LLM Agents","primary_cat":"cs.LG","submitted_at":"2026-04-12T14:57:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Skill-SD turns an agent's completed trajectories into dynamic natural-language skills that condition only the teacher in self-distillation, yielding 14-42% gains over RL and OPSD baselines on multi-turn agent benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10500","ref_index":80,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Visual Enhanced Depth Scaling for Multimodal Latent Reasoning","primary_cat":"cs.CV","submitted_at":"2026-04-12T07:14:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Visual replay module and adaptive depth scaling improve multimodal latent reasoning, reaching SOTA benchmarks with faster inference than explicit chain-of-thought methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10029","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Self-Distilled Reinforcement Learning for Co-Evolving Agentic Recommender Systems","primary_cat":"cs.IR","submitted_at":"2026-04-11T04:52:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoARS enables co-evolving recommender and user agents by using interaction-derived rewards and self-distilled credit assignment to internalize multi-turn feedback into model parameters, outperforming prior agentic baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08527","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Demystifying OPD: Length Inflation and Stabilization Strategies for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-09T17:58:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPD for LLMs suffers length inflation and repetition collapse; StableOPD uses reference divergence and rollout mixing to prevent it and improve math reasoning performance by 7.2% on 
average.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07944","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"On-Policy Distillation of Language Models for Autonomous Vehicle Motion Planning","primary_cat":"cs.RO","submitted_at":"2026-04-09T08:06:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"On-policy GKD trains 5x smaller student LLMs to nearly match large teacher performance in AV motion planning on nuScenes while beating a dense-feedback RL baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07941","ref_index":65,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Large Language Model Post-Training: A Unified View of Off-Policy and On-Policy Learning","primary_cat":"cs.CL","submitted_at":"2026-04-09T08:00:37+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM post-training is unified as off-policy or on-policy interventions that expand support for useful behaviors, reshape policies within reachable states, or consolidate behavior across training stages.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07809","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PolicyLong: Towards On-Policy Context Extension","primary_cat":"cs.LG","submitted_at":"2026-04-09T05:07:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PolicyLong shifts long-context data synthesis to an on-policy loop that re-screens contexts using the evolving model's entropy landscape, producing a self-curriculum that outperforms static offline baselines with larger gains at longer lengths.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03128","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Distilled RLVR","primary_cat":"cs.LG","submitted_at":"2026-04-03T15:50:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RLSD mixes self-distillation for token-level policy difference magnitudes with RLVR for reliable update directions from response correctness to reach higher convergence and better training stability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}