{"total":25,"items":[{"citing_arxiv_id":"2605.12288","ref_index":140,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TokenRatio: Principled Token-Level Preference Optimization via Ratio Matching","primary_cat":"cs.CL","submitted_at":"2026-05-12T15:44:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TBPO derives a token-level preference optimization objective from sequence-level pairwise data via Bregman divergence ratio matching that generalizes DPO and improves alignment quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11974","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Towards Order Fairness: Mitigating LLMs Order Sensitivity through Dual Group Advantage Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-12T11:31:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DGAO uses reinforcement learning to optimize LLMs for both accuracy and order stability by balancing intra-group accuracy advantages and inter-group stability advantages.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11613","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Generic Correlation to Input-Specific Credit in On-Policy Self Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-12T06:43:17+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Self-distillation token rewards measure input-response-feedback pointwise mutual information, and CREDIT extracts the input-specific component with contrastive baselines to improve LLM reasoning performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11299","ref_index":18,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Primal Generation, Dual Judgment: Self-Training from Test-Time Scaling","primary_cat":"cs.LG","submitted_at":"2026-05-11T22:34:45+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DuST self-trains LLMs for code generation by ranking their own test-time samples via sandbox execution and applying GRPO, improving judgment by +6.2 NDCG and single-sample pass@1 by +3.1 on LiveCodeBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08873","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CoDistill-GRPO: A Co-Distillation Recipe for Efficient Group Relative Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-09T10:51:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CoDistill-GRPO lets small and large models mutually improve via co-distillation in GRPO, raising small-model math accuracy by over 11 points while cutting large-model training time by about 18%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08558","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond Static Bias: Adaptive Multi-Fidelity 
Bandits with Improving Proxies","primary_cat":"cs.LG","submitted_at":"2026-05-08T23:36:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TACC algorithm for adaptive multi-fidelity bandits with improving proxies achieves instance-dependent regret by replacing logarithmic high-fidelity pulls with bounded low-fidelity continuation for intermediate arms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07465","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SEIF: Self-Evolving Reinforcement Learning for Instruction Following","primary_cat":"cs.CL","submitted_at":"2026-05-08T09:13:12+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SEIF creates a self-reinforcing loop in which an LLM alternately generates increasingly difficult instructions and learns to follow them better using reinforcement learning signals from its own judgments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06642","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"StraTA: Incentivizing Agentic Reinforcement Learning with Strategic Trajectory Abstraction","primary_cat":"cs.CL","submitted_at":"2026-05-07T17:51:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StraTA improves LLM agent success rates to 93.1% on ALFWorld and 84.2% on WebShop by sampling a compact initial strategy and training it jointly with action execution via hierarchical GRPO-style rollouts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03042","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ARIS: Autonomous Research via Adversarial Multi-Agent Collaboration","primary_cat":"cs.SE","submitted_at":"2026-05-04T18:10:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ARIS is a three-layer open-source system that uses cross-model adversarial collaboration plus claim-auditing pipelines to make LLM-driven research workflows more reliable.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02073","ref_index":24,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Enhanced LLM Reasoning by Optimizing Reward Functions with Search-Driven Reinforcement Learning","primary_cat":"cs.CL","submitted_at":"2026-05-03T22:01:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Iterative search over reward functions with ranked feedback in GRPO training improves LLM math reasoning, achieving F1 of 0.795 on GSM8K versus 0.609 for baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27488","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Skills-Coach: A Self-Evolving Skill Optimizer via Training-Free 
GRPO","primary_cat":"cs.CL","submitted_at":"2026-04-30T06:39:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Skills-Coach optimizes LLM agent skills via task generation, prompt/code tuning, comparative execution, and traceable evaluation, reporting gains on a 48-skill benchmark called Skill-X.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20933","ref_index":89,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"IRIS: Interpolative R\\'enyi Iterative Self-play for Large Language Model Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-04-22T11:52:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"IRIS unifies self-play fine-tuning under an interpolative Rényi objective with adaptive alpha scheduling and reports better benchmark scores than baselines while surpassing full supervised fine-tuning with only 13% of the annotated data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18002","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Neural Garbage Collection: Learning to Forget while Learning to Reason","primary_cat":"cs.LG","submitted_at":"2026-04-20T09:26:28+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Language models learn to evict KV cache entries end-to-end via reinforcement learning from outcome reward alone, achieving 2-3x cache compression while maintaining accuracy on Countdown, AMC, and AIME tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17543","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PoliLegalLM: A Technical Report on a Large Language Model for Political and Legal Affairs","primary_cat":"cs.CL","submitted_at":"2026-04-19T17:06:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"PoliLegalLM, trained with continued pretraining, progressive SFT, and preference RL on a legal corpus, outperforms similar-scale models on LawBench, LexEval, and a real-world PoliLegal dataset while staying competitive with much larger models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11611","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Utilizing and Calibrating Hindsight Process Rewards via Reinforcement with Mutual Information Self-Evaluation","primary_cat":"cs.CL","submitted_at":"2026-04-13T15:18:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MISE proves that hindsight self-evaluation rewards equal minimizing mutual information plus KL divergence to a proxy policy, and experiments show 7B LLMs reaching GPT-4o-level results on validation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09791","ref_index":102,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Pioneer Agent: Continual Improvement of Small Language Models in 
Production","primary_cat":"cs.AI","submitted_at":"2026-04-10T18:13:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pioneer Agent automates the full lifecycle of adapting and continually improving small language models via diagnosis-driven data synthesis and regression-constrained retraining, delivering gains of 1.6-83.8 points on benchmarks and large lifts in production-style tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08094","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MedThink: Enhancing Diagnostic Accuracy in Small Models via Teacher-Guided Reasoning Correction","primary_cat":"cs.CY","submitted_at":"2026-04-09T18:00:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MedThink, a two-stage teacher-guided reasoning correction distillation framework, boosts small language models' medical diagnostic accuracy by up to 12.7% on benchmarks and achieves 56.4% on a gastroenterology dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03993","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Can LLMs Learn to Reason Robustly under Noisy Supervision?","primary_cat":"cs.LG","submitted_at":"2026-04-05T06:30:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Online Label Refinement lets LLMs learn robust reasoning from noisy supervision by correcting labels when majority answers show rising rollout success and stable history, delivering 3-4% gains on math and reasoning benchmarks even at high noise levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.21046","ref_index":195,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Survey of Self-Evolving Agents: What, When, How, and Where to Evolve on the Path to Artificial Super Intelligence","primary_cat":"cs.AI","submitted_at":"2025-07-28T17:59:05+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper delivers the first systematic review of self-evolving agents, structured around what components evolve, when adaptation occurs, and how it is implemented.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.05579","ref_index":285,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods","primary_cat":"cs.CL","submitted_at":"2024-12-07T08:07:24+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A survey that organizes LLMs-as-judges research into functionality, methodology, applications, meta-evaluation, and limitations.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"[294], HALU-J [227],PROMETHEUS [109], PROMETHEUS 2 [110], PROMETHEUS-VISION [122], LLaVA-Critic [253] Post-processing(§4.1.3) ProbabilityCalibration Daynauth et al. [45], ProbDiff [247], PoE [150], CRISPR [264] Text ReprocessingSottana et al. [206], AUTO-J [130], Yan et al. 
[262], Tessler et al. [214], REVISEVAL [281]Ren et al. [186],Open-LLM-Leaderboard [161] Multi-LLM (§4.2) Communication(§4.2.1)Cooperation WideDeep [285], Xu et al. [261], ABSEval [136] Competition Owens et al. [168], Auto-Arena [286], Bandi et al. [12], Moniri et al. [158], ChatEval [22], PRD [132] Aggregation (§4.2.2)Badshah et al. [8], PoLL [225], Language-Model-as-an-Examiner [10], MULTI-NEWS+ [38], PiCO [165], PRE [39], Chen et al. [27],Zhang et al. [284], AIME [175], HD-EVAL [147], Gao et al."},{"citing_arxiv_id":"2406.17557","ref_index":64,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale","primary_cat":"cs.CL","submitted_at":"2024-06-25T13:50:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FineWeb is a curated 15T-token web dataset that produces stronger LLMs than prior open collections, while its educational subset sharply improves performance on MMLU and ARC benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.07496","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TextGrad: Automatic \"Differentiation\" via Text","primary_cat":"cs.CL","submitted_at":"2024-06-11T17:32:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TextGrad performs automatic differentiation for compound AI systems by backpropagating natural-language feedback from LLMs to optimize variables ranging from code to molecular structures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.13372","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models","primary_cat":"cs.CL","submitted_at":"2024-03-20T08:08:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LlamaFactory provides a unified no-code framework for efficient fine-tuning of 100+ LLMs via an integrated web UI and has been released on GitHub.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.01306","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"KTO: Model Alignment as Prospect Theoretic Optimization","primary_cat":"cs.LG","submitted_at":"2024-02-02T10:53:36+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"KTO aligns LLMs by directly maximizing prospect-theoretic utility on binary signals and matches or exceeds preference-based methods like DPO from 1B to 30B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.01335","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models","primary_cat":"cs.LG","submitted_at":"2024-01-02T18:53:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPIN lets weak LLMs become strong by self-generating training data from previous model versions and training to prefer human-annotated 
responses over its own outputs, outperforming DPO even with extra GPT-4 data on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}
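
A minimal sketch of how a consumer might parse this paginated payload, assuming it has been saved to a local file. The field names (citing_arxiv_id, paper_title, verdict, novelty_score, limit, offset, and so on) come directly from the payload above; the file name citations.json and the CitingPaper/load_items names are hypothetical, introduced only for illustration.

import json
from dataclasses import dataclass

# Record type mirroring a subset of the fields of each entry in "items";
# the remaining fields can be added the same way if needed.
@dataclass
class CitingPaper:
    citing_arxiv_id: str
    paper_title: str
    primary_cat: str
    verdict: str          # e.g. "ACCEPT", "CONDITIONAL", "UNVERDICTED"
    novelty_score: float  # numeric score as given in the payload
    one_line_summary: str

def load_items(path: str) -> list[CitingPaper]:
    """Parse one page of the response into typed records."""
    with open(path) as f:
        payload = json.load(f)
    # "total" counts all matches; "limit"/"offset" describe this page,
    # so a page never carries more than "limit" items.
    assert len(payload["items"]) <= payload["limit"]
    return [
        CitingPaper(
            citing_arxiv_id=it["citing_arxiv_id"],
            paper_title=it["paper_title"],
            primary_cat=it["primary_cat"],
            verdict=it["verdict"],
            novelty_score=it["novelty_score"],
            one_line_summary=it["one_line_summary"],
        )
        for it in payload["items"]
    ]

if __name__ == "__main__":
    papers = load_items("citations.json")  # hypothetical file name
    # Example query: high-novelty entries still awaiting a verdict.
    for p in sorted(papers, key=lambda p: -p.novelty_score):
        if p.verdict == "UNVERDICTED" and p.novelty_score >= 7.0:
            print(p.citing_arxiv_id, p.paper_title)

Since total (25) does not exceed limit (50) here, this single page is the complete result set; a payload with total above limit would require re-requesting with an increased offset and concatenating the item lists.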