{"total":22,"items":[{"citing_arxiv_id":"2605.13803","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvoGround: Self-Evolving Video Agents for Video Temporal Grounding","primary_cat":"cs.CV","submitted_at":"2026-05-13T17:25:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A proposer-solver agent pair achieves supervised-level video temporal grounding and fine-grained captioning from 2.5K unlabeled videos via self-reinforcing evolution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13775","ref_index":65,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RoboEvolve: Co-Evolving Planner-Simulator for Robotic Manipulation with Limited Data","primary_cat":"cs.RO","submitted_at":"2026-05-13T16:54:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A co-evolutionary VLM-VGM loop on 500 unlabeled images raises planner success by 30 points and simulator success by 48 percent while beating fully supervised baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13369","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Query-Conditioned Test-Time Self-Training for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-13T11:27:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QueST lets LLMs create query-conditioned problem-solution pairs at inference time and use them for parameter-efficient self-training, outperforming prior test-time baselines on math and science benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11679","ref_index":62,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Explaining and Breaking the Safety-Helpfulness Ceiling via Preference Dimensional Expansion","primary_cat":"cs.AI","submitted_at":"2026-05-12T07:38:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MORA breaks the safety-helpfulness ceiling in LLMs by pre-sampling single-reward prompts and rewriting them to incorporate multi-dimensional intents, delivering 5-12.4% gains in sequential alignment and 4.6% overall improvement in simultaneous alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11636","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Seir\\^enes: Adversarial Self-Play with Evolving Distractions for LLM Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-12T06:58:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Seirênes trains LLMs via adversarial self-play to generate and overcome evolving distractions, producing gains of 7-10 points on math reasoning benchmarks and exposing blind spots in larger models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09959","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"G-Zero: Self-Play for Open-Ended Generation from Zero Data","primary_cat":"cs.LG","submitted_at":"2026-05-11T04:12:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"G-Zero uses the Hint-δ intrinsic reward to drive co-evolution between a Proposer and Generator via GRPO and DPO, providing a theoretical suboptimality guarantee for self-improvement from internal dynamics alone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09423","ref_index":38,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SimWorld Studio: Automatic Environment Generation with Evolving Coding Agent for Embodied Agent Learning","primary_cat":"cs.AI","submitted_at":"2026-05-10T08:51:50+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"SimWorld Studio deploys an evolving coding agent to create adaptive 3D environments that co-evolve with embodied learners, delivering 18-point success-rate gains over fixed environments in navigation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07465","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SEIF: Self-Evolving Reinforcement Learning for Instruction Following","primary_cat":"cs.CL","submitted_at":"2026-05-08T09:13:12+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SEIF creates a self-reinforcing loop in which an LLM alternately generates increasingly difficult instructions and learns to follow them better using reinforcement learning signals from its own judgments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06638","ref_index":71,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Can RL Teach Long-Horizon Reasoning to LLMs? Expressiveness Is Key","primary_cat":"cs.AI","submitted_at":"2026-05-07T17:48:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RL training compute for logical reasoning follows a power law in proof depth whose exponent rises with logic expressiveness, and more expressive training yields larger gains on downstream benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05546","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SPARK: Self-Play with Asymmetric Reward from Knowledge Graphs","primary_cat":"cs.AI","submitted_at":"2026-05-07T00:51:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPARK constructs unified knowledge graphs from multi-document scientific literature to ground self-play RL with asymmetric roles and verifiable rewards, outperforming flat-corpus baselines especially on longer-hop reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24819","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Programming with Data: Test-Driven Data Engineering for Self-Improving LLMs from Raw Corpora","primary_cat":"cs.SE","submitted_at":"2026-04-27T14:05:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Structured knowledge extracted from corpora enables test-driven data engineering for LLMs by mapping training data to source code, model training to compilation, benchmarking to unit testing, and failures to targeted data repairs, demonstrated across 16 disciplines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20987","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Co-Evolving LLM Decision and Skill Bank Agents for Long-Horizon Tasks","primary_cat":"cs.AI","submitted_at":"2026-04-22T18:17:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"COSPLAY co-evolves an LLM decision agent with a skill bank agent to improve long-horizon game performance, reporting over 25.1% average reward gains versus frontier LLM baselines on single-player benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20209","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Scaling Self-Play with Self-Guidance","primary_cat":"cs.LG","submitted_at":"2026-04-22T05:50:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SGS adds self-guidance to LLM self-play for Lean4 theorem proving, surpassing RL baselines and enabling a 7B model to outperform a 671B model after 200 rounds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20051","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Bootstrapping Post-training Signals for Open-ended Tasks via Rubric-based Self-play on Pre-training Text","primary_cat":"cs.CL","submitted_at":"2026-04-21T23:21:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"POP bootstraps post-training signals for open-ended LLM tasks by synthesizing rubrics during self-play on pretraining corpus, yielding performance gains on Qwen-2.5-7B across healthcare QA, creative writing, and instruction following.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19341","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Evaluation-driven Scaling for Scientific Discovery","primary_cat":"cs.LG","submitted_at":"2026-04-21T11:24:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SimpleTES scales test-time evaluation in LLMs to discover state-of-the-art solutions on 21 scientific problems across six domains, outperforming frontier models and optimization pipelines with examples like 2x faster LASSO and new Erdos constructions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18493","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Too Correct to Learn: Reinforcement Learning on Saturated Reasoning Data","primary_cat":"cs.LG","submitted_at":"2026-04-20T16:43:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A parameter-free sampling strategy called CUTS combined with Mixed-CUTS training prevents mode collapse in RL for saturated LLM reasoning tasks and raises AIME25 Pass@1 accuracy by up to 15.1% over standard GRPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18320","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"EVE: Verifiable Self-Evolution of MLLMs via Executable Visual Transformations","primary_cat":"cs.CV","submitted_at":"2026-04-20T14:20:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"EVE enables verifiable self-evolution of MLLMs by using a Challenger-Solver architecture to generate dynamic executable visual transformations that produce VQA problems with absolute execution-verified ground truth.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18131","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Training LLM Agents for Spontaneous, Reward-Free Self-Evolution via World Knowledge Exploration","primary_cat":"cs.AI","submitted_at":"2026-04-20T11:54:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM agents trained with a task-success reward on self-generated knowledge can spontaneously explore and adapt to new environments without any rewards or instructions at inference, yielding 20% gains on web tasks and allowing a 14B model to beat Gemini-2.5-Flash.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17928","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HEALing Entropy Collapse: Enhancing Exploration in Few-Shot RLVR via Hybrid-Domain Entropy Dynamics Alignment","primary_cat":"cs.LG","submitted_at":"2026-04-20T08:09:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HEAL mitigates entropy collapse in few-shot RLVR by selectively adding general-domain data and aligning trajectory-level entropy dynamics, matching full-shot performance with 32 target samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14054","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"$\\pi$-Play: Multi-Agent Self-Play via Privileged Self-Distillation without External Data","primary_cat":"cs.LG","submitted_at":"2026-04-15T16:34:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"π-Play uses self-generated question construction paths as privileged information in multi-agent self-distillation to convert sparse-reward self-play into a dense-feedback loop, surpassing supervised search agents and improving efficiency 2-3× over standard self-play.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07864","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ZeroCoder: Can LLMs Improve Code Generation Without Ground-Truth Supervision?","primary_cat":"cs.SE","submitted_at":"2026-04-09T06:24:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ZeroCoder co-evolves coder and tester LLMs via self-generated code-test execution feedback to improve code generation up to 21.6% without ground-truth supervision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03472","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Vocabulary Dropout for Curriculum Diversity in LLM Co-Evolution","primary_cat":"cs.CL","submitted_at":"2026-04-03T21:40:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Vocabulary dropout prevents diversity collapse in LLM co-evolution by masking proposer logits, yielding average +4.4 point solver gains on mathematical reasoning benchmarks at 8B scale.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}