{"total":34,"items":[{"citing_arxiv_id":"2605.12741","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning with Rare Success but Rich Feedback via Reflection-Enhanced Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-12T20:46:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RESD turns failure trajectories into token-level supervision via retrospective reflections and a persistent global playbook, enabling faster improvement than standard self-distillation or GRPO with only one rollout per prompt.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12483","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond GRPO and On-Policy Distillation: An Empirical Sparse-to-Dense Reward Principle for Language-Model Post-Training","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:57:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Sparse RL on a strong teacher followed by dense distillation to the student outperforms direct GRPO on the student for math tasks, with a forward-KL + OPD bridge enabling further gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12070","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Missing Old Logits in Asynchronous Agentic RL: Semantic Mismatch and Repair Methods for Off-Policy Correction","primary_cat":"cs.LG","submitted_at":"2026-05-12T12:57:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Missing old logits in async agentic RL entangle discrepancy and staleness terms in PPO off-policy correction; exact acquisition methods and revised PPO-EWMA restore decoupled updates with reported gains in speed and performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11739","ref_index":69,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning to Foresee: Unveiling the Unlocking Efficiency of On-Policy Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-12T08:19:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"On-policy distillation gains efficiency from early foresight in module allocation and low-rank update directions, enabling EffOPD to accelerate training by 3x via adaptive extrapolation without extra modules or tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11609","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Anti-Self-Distillation for Reasoning RL via Pointwise Mutual Information","primary_cat":"cs.LG","submitted_at":"2026-05-12T06:40:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Anti-Self-Distillation reverses self-distillation signals via PMI to fix overconfidence on structural tokens, matching GRPO baseline accuracy 2-10x faster with up to 11.5 point gains across 4B-30B 
models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10889","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Unmasking On-Policy Distillation: Where It Helps, Where It Hurts, and Why","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:33:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Distillation signals align better with ideal updates on incorrect student rollouts than correct ones, with optimal teacher context depending on student capacity and task.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09725","ref_index":44,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On-Policy Distillation with Best-of-N Teacher Rollout Selection","primary_cat":"cs.CV","submitted_at":"2026-05-10T19:49:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BRTS improves on-policy distillation by sampling multiple teacher rollouts and selecting the best one via a correctness-first then alignment priority rule, yielding gains on AIME and AMC math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09253","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Cornerstones or Stumbling Blocks? Deciphering the Rock Tokens in On-Policy Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-10T01:41:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Persistent 'Rock Tokens' in on-policy distillation resist teacher corrections, consume large gradient norms, yet add negligible value to reasoning, allowing targeted bypassing to streamline alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08762","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Omni-DeepSearch: A Benchmark for Audio-Driven Omni-Modal Deep Search","primary_cat":"cs.SD","submitted_at":"2026-05-09T07:47:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Omni-DeepSearch is a 640-sample benchmark for audio-driven omni-modal search where the best model reaches only 43.44% accuracy, exposing bottlenecks in audio inference, tool use, and cross-modal reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08761","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond the All-in-One Agent: Benchmarking Role-Specialized Multi-Agent Collaboration in Enterprise Workflows","primary_cat":"cs.MA","submitted_at":"2026-05-09T07:47:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EntCollabBench shows that today's LLM agents still struggle with delegation, context transfer, parameter grounding, workflow closure, and decision commitment when tested in a simulated enterprise with 11 role-specialized 
agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08741","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Training with Harnesses: On-Policy Harness Self-Distillation for Complex Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-09T07:06:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPHSD uses harness-augmented models as teachers to distill reasoning capabilities into base LLMs, yielding strong standalone performance on classification and math tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08063","ref_index":20,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Flow-OPD: On-Policy Distillation for Flow Matching Models","primary_cat":"cs.CV","submitted_at":"2026-05-08T17:50:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Flow-OPD applies on-policy distillation to flow matching models, achieving GenEval of 92 and OCR accuracy of 94 on Stable Diffusion 3.5 Medium while avoiding the seesaw effect of multi-reward optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07442","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GameGen-Verifier: Parallel Keypoint-Based Verification for LLM-Generated Games via Runtime State Injection","primary_cat":"cs.LG","submitted_at":"2026-05-08T08:46:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GameGen-Verifier decomposes game specifications into keypoints, injects runtime states for targeted checks, and achieves 92.2% accuracy on 100 games while running up to 16.6x faster than agent-based baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07396","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rubric-based On-policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-08T07:52:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Rubric-based on-policy distillation allows training student models using only teacher responses by generating scoring rubrics from contrasts and using them for on-policy optimization, achieving superior performance and up to 10x better sample efficiency than logit-based approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06221","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"UniPrefill: Universal Long-Context Prefill Acceleration via Block-wise Dynamic Sparsification","primary_cat":"cs.CL","submitted_at":"2026-05-07T13:18:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniPrefill accelerates LLM prefill via block-wise dynamic sparsification, achieving up to 2.1x TTFT speedup while supporting hybrid architectures and native vLLM continuous 
batching.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03677","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Uni-OPD: Unifying On-Policy Distillation with a Dual-Perspective Recipe","primary_cat":"cs.LG","submitted_at":"2026-05-05T12:15:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Uni-OPD unifies on-policy distillation across LLMs and MLLMs with dual-perspective strategies that promote student exploration and enforce order-consistent teacher supervision based on outcome rewards.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02971","ref_index":11,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Multilingual Safety Alignment via Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-03T14:22:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MSD enables cross-lingual safety transfer in LLMs via self-distillation with Dual-Perspective Safety Weighting, improving safety in low-resource languages without target response data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27083","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Co-Evolving Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-04-29T18:24:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoPD integrates multiple expert capabilities by running parallel RLVR training with bidirectional online policy distillation among experts, outperforming mixed RLVR and sequential OPD while surpassing domain-specific experts on text-image-video reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21850","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OptiMat Alloys: a FAIR, living database of multi-principal element alloys enabled by a conversational agent","primary_cat":"cond-mat.mtrl-sci","submitted_at":"2026-04-23T16:40:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OptiMat Alloys is a conversational AI system that maintains a living FAIR database of multi-principal element alloy calculations and enables natural-language, on-demand computations with built-in uncertainty checks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15039","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Prefill-as-a-Service: KVCache of Next-Generation Models Could Go Cross-Datacenter","primary_cat":"cs.DC","submitted_at":"2026-04-16T14:07:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PrfaaS enables practical cross-datacenter prefill-decode disaggregation for hybrid-attention models via selective offloading, bandwidth-aware scheduling, and cache-aware placement, yielding 54% higher throughput and 64% lower P90 TTFT than homogeneous baselines in a 1T-parameter case 
study.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13016","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rethinking On-Policy Distillation of Large Language Models: Phenomenology, Mechanism, and Recipe","primary_cat":"cs.LG","submitted_at":"2026-04-14T17:54:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"On-policy distillation works when student and teacher models share thinking patterns and the teacher adds new capabilities, with success tied to alignment on a small set of high-probability tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13010","ref_index":5,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Lightning OPD: Efficient Post-Training for Large Reasoning Models with Offline On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-04-14T17:44:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Lightning OPD is an offline on-policy distillation method that matches standard OPD performance at 4x efficiency by enforcing teacher consistency between SFT and distillation phases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11912","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"How Transformers Learn to Plan via Multi-Token Prediction","primary_cat":"cs.LG","submitted_at":"2026-04-13T18:04:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Multi-token prediction induces a two-stage reverse reasoning process in Transformers via gradient decoupling, improving planning on synthetic and realistic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10098","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Attention Sink in Transformers: A Survey on Utilization, Interpretation, and Mitigation","primary_cat":"cs.LG","submitted_at":"2026-04-11T08:41:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The first survey on Attention Sink in Transformers structures the literature around fundamental utilization, mechanistic interpretation, and strategic mitigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08527","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Demystifying OPD: Length Inflation and Stabilization Strategies for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-09T17:58:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPD for LLMs suffers length inflation and repetition collapse; StableOPD uses reference divergence and rollout mixing to prevent it and improve math reasoning performance by 7.2% on average.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07941","ref_index":121,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Large Language 
Model Post-Training: A Unified View of Off-Policy and On-Policy Learning","primary_cat":"cs.CL","submitted_at":"2026-04-09T08:00:37+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM post-training is unified as off-policy or on-policy interventions that expand support for useful behaviors, reshape policies within reachable states, or consolidate behavior across training stages.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07054","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Sell More, Play Less: Benchmarking LLM Realistic Selling Skill","primary_cat":"cs.CL","submitted_at":"2026-04-08T13:06:37+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SalesLLM provides an automatic evaluation framework for LLM sales dialogues that correlates 0.98 with human experts and shows top models approaching human performance while weaker ones lag.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05688","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Attention Editing: A Versatile Framework for Cross-Architecture Attention Conversion","primary_cat":"cs.CL","submitted_at":"2026-04-07T10:40:16+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Attention Editing converts pre-trained LLMs to new attention architectures through layer-wise teacher-forced optimization and model-level distillation, preserving performance with efficiency gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05623","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DetailVerifyBench: A Benchmark for Dense Hallucination Localization in Long Image Captions","primary_cat":"cs.CV","submitted_at":"2026-04-07T09:27:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DetailVerifyBench supplies 1,000 images and densely annotated long captions to evaluate precise hallucination localization in multimodal large language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03128","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Distilled RLVR","primary_cat":"cs.LG","submitted_at":"2026-04-03T15:50:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RLSD mixes self-distillation for token-level policy difference magnitudes with RLVR for reliable update directions from response correctness to reach higher convergence and better training stability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03044","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"JoyAI-LLM Flash: Advancing Mid-Scale LLMs with Token Efficiency","primary_cat":"cs.CL","submitted_at":"2026-04-03T13:52:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"JoyAI-LLM Flash delivers a 48B MoE LLM 
with 2.7B active parameters per token via FiberPO RL and dense multi-token prediction, released with checkpoints on Hugging Face.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.01496","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From SWE-ZERO to SWE-HERO: Execution-free to Execution-based Fine-tuning for Software Engineering Agents","primary_cat":"cs.SE","submitted_at":"2026-04-02T00:11:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A two-stage SFT pipeline distills execution-free then execution-based trajectories from a 480B model into smaller Qwen2.5-Coder agents, yielding 62.2% resolution on SWE-bench Verified and 44.1% zero-shot on the multilingual version.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.00626","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Survey of On-Policy Distillation for Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-04-01T08:32:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"On-policy distillation reframes LLM knowledge transfer as iterative correction on student trajectories rather than single-pass imitation, with the survey organizing the field along divergence design, feedback sources, and training stabilization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.18734","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-01-26T17:56:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A single LLM improves its own reasoning by self-distilling from privileged verified traces as teacher to its question-only student policy, outperforming off-policy distillation and RL on math benchmarks with better token efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}
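A minimal sketch of how a consumer might digest a payload of this shape. It only uses fields present in the response above (total, items, limit, offset, citing_arxiv_id, paper_title, novelty_score, verdict, submitted_at); reading the JSON from stdin is an assumption, since the originating endpoint is not shown here.

import json
import sys

# Parse the citations payload (piped in on stdin -- an assumption; the
# source endpoint is not part of this document).
payload = json.load(sys.stdin)

# Pagination summary from the top-level fields.
print(
    f"{payload['total']} citing papers total "
    f"(page of {len(payload['items'])}, "
    f"limit={payload['limit']}, offset={payload['offset']})"
)

# One line per item, sorted by novelty_score descending, then by
# submitted_at ascending; the ISO-8601 timestamps in this payload share a
# fixed UTC offset, so plain string comparison orders them correctly.
for item in sorted(
    payload["items"],
    key=lambda it: (-it["novelty_score"], it["submitted_at"]),
):
    print(
        f"{item['citing_arxiv_id']}  "
        f"novelty={item['novelty_score']:.1f}  "
        f"verdict={item['verdict']:<12}  "
        f"{item['paper_title']}"
    )

Example invocation (the file name is hypothetical): python digest.py < response.json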