{"total":19,"items":[{"citing_arxiv_id":"2605.13643","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Prefix Teach, Suffix Fade: Local Teachability Collapse in Strong-to-Weak On-Policy Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-13T15:05:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Local teachability collapse in trajectory suffixes makes uniform dense supervision suboptimal in strong-to-weak OPD; truncating at BIC-style change points on teacher margin improves performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12741","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning with Rare Success but Rich Feedback via Reflection-Enhanced Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-12T20:46:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RESD turns failure trajectories into token-level supervision via retrospective reflections and a persistent global playbook, enabling faster improvement than standard self-distillation or GRPO with only one rollout per prompt.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11679","ref_index":61,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Explaining and Breaking the Safety-Helpfulness Ceiling via Preference Dimensional Expansion","primary_cat":"cs.AI","submitted_at":"2026-05-12T07:38:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MORA breaks the safety-helpfulness ceiling in LLMs by pre-sampling single-reward prompts and rewriting them to incorporate multi-dimensional intents, delivering 5-12.4% gains in sequential alignment and 4.6% overall improvement in simultaneous alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11636","ref_index":61,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Seir\\^enes: Adversarial Self-Play with Evolving Distractions for LLM Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-12T06:58:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Seirênes trains LLMs via adversarial self-play to generate and overcome evolving distractions, producing gains of 7-10 points on math reasoning benchmarks and exposing blind spots in larger models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09959","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"G-Zero: Self-Play for Open-Ended Generation from Zero Data","primary_cat":"cs.LG","submitted_at":"2026-05-11T04:12:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"G-Zero uses the Hint-δ intrinsic reward to drive co-evolution between a Proposer and Generator via GRPO and DPO, providing a theoretical suboptimality guarantee for self-improvement from internal dynamics alone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09214","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Fast Rates for Offline Contextual Bandits with Forward-KL Regularization under Single-Policy Concentrability","primary_cat":"cs.LG","submitted_at":"2026-05-09T23:17:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The paper establishes the first tilde O(epsilon^{-1}) upper bounds and matching lower bounds for forward-KL-regularized offline contextual bandits under single-policy concentrability in both tabular and general function approximation settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08703","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RewardHarness: Self-Evolving Agentic Post-Training","primary_cat":"cs.AI","submitted_at":"2026-05-09T05:32:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RewardHarness self-evolves a tool-and-skill library from 100 preference examples to reach 47.4% accuracy on image-edit evaluation, beating GPT-5, and yields stronger RL-tuned models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08037","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Pairs: Your Language Model is Secretly Optimizing a Preference Graph","primary_cat":"cs.LG","submitted_at":"2026-05-08T17:26:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GraphDPO generalizes pairwise DPO to a graph-structured Plackett-Luce objective over DAGs induced by rollout rankings, enforcing transitivity with linear complexity and recovering DPO as a special case.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07248","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PaT: Planning-after-Trial for Efficient Test-Time Code Generation","primary_cat":"cs.CL","submitted_at":"2026-05-08T05:09:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PaT defers planning until after failed trials in LLM code generation, enabling heterogeneous cheap-plus-powerful model setups that match large-model performance at roughly 69% lower cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02626","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Gradient-Gated DPO: Stabilizing Preference Optimization in Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-04T14:15:24+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Gate-DPO attenuates gradients on low-probability rejected responses to reduce probability collapse and improve chosen-response likelihood during preference optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20933","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"IRIS: Interpolative R\\'enyi Iterative Self-play for Large Language Model Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-04-22T11:52:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"IRIS unifies self-play fine-tuning under an interpolative Rényi objective with adaptive alpha scheduling and reports better benchmark scores than baselines while surpassing full supervised fine-tuning with only 13% of the annotated data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18834","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Structural Verification for Reliable EDA Code Generation without Tool-in-the-Loop Debugging","primary_cat":"cs.SE","submitted_at":"2026-04-20T20:58:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Structural dependency graphs and staged pre-execution verification raise LLM-based EDA code pass rates to 82.5% (single-step) and 70-84% (multi-step) while halving tool calls by catching dependency violations before runtime.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18034","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SignDPO: Multi-level Direct Preference Optimisation for Skeleton-based Gloss-free Sign Language Translation","primary_cat":"cs.CL","submitted_at":"2026-04-20T09:59:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SignDPO uses hierarchical perturbations, self-guided attention-based sampling, and an automated language-level preference generator to align skeleton trajectories with linguistic semantics, outperforming prior gloss-free methods on CSL-Daily, How2Sign, and OpenASL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15602","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GroupDPO: Memory efficient Group-wise Direct Preference Optimization","primary_cat":"cs.CL","submitted_at":"2026-04-17T00:56:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GroupDPO decouples group-wise preference optimization during backpropagation to cut peak memory while keeping the same gradients, allowing larger groups and consistent gains over single-pair DPO plus an NLL term on positives.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15034","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Autogenesis: A Self-Evolving Agent Protocol","primary_cat":"cs.AI","submitted_at":"2026-04-16T14:04:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Autogenesis Protocol defines resource and evolution layers for LLM agents, enabling a system that shows performance gains on long-horizon planning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14054","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"$\\pi$-Play: Multi-Agent Self-Play via Privileged Self-Distillation without External Data","primary_cat":"cs.LG","submitted_at":"2026-04-15T16:34:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"π-Play uses self-generated question construction paths as privileged information in multi-agent self-distillation to convert sparse-reward self-play into a dense-feedback loop, surpassing supervised search agents and improving efficiency 2-3× over standard self-play.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03472","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Vocabulary Dropout for Curriculum Diversity in LLM Co-Evolution","primary_cat":"cs.CL","submitted_at":"2026-04-03T21:40:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Vocabulary dropout prevents diversity collapse in LLM co-evolution by masking proposer logits, yielding average +4.4 point solver gains on mathematical reasoning benchmarks at 8B scale.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.01306","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"KTO: Model Alignment as Prospect Theoretic Optimization","primary_cat":"cs.LG","submitted_at":"2024-02-02T10:53:36+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"KTO aligns LLMs by directly maximizing prospect-theoretic utility on binary signals and matches or exceeds preference-based methods like DPO from 1B to 30B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.10020","ref_index":91,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Self-Rewarding Language Models","primary_cat":"cs.CL","submitted_at":"2024-01-18T14:43:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Iterative self-rewarding via LLM-as-Judge in DPO training on Llama 2 70B improves instruction following and self-evaluation, outperforming GPT-4 on AlpacaEval 2.0.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}