{"total":30,"items":[{"citing_arxiv_id":"2605.23904","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SkillOpt: Executive Strategy for Self-Evolving Agent Skills","primary_cat":"cs.AI","submitted_at":"2026-05-22T17:59:50+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23899","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Raw Experience to Skill Consumption: A Systematic Study of Model-Generated Agent Skills","primary_cat":"cs.AI","submitted_at":"2026-05-22T17:59:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A systematic study across five domains finds model-generated skills yield average gains but non-uniform negative transfer, with a meta-skill improving extraction quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22148","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Ratchet: A Minimal Hygiene Recipe for Self-Evolving LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-05-21T08:20:38+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Ratchet provides a minimal hygiene recipe for self-managing skill libraries in frozen LLM agents, delivering +0.328 rolling-mean pass@1 gain on MBPP+ hard-100 and +0.22 peak lift on SWE-bench Verified.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21463","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mem-$\\pi$: Adaptive Memory through Learning When and What to Generate","primary_cat":"cs.CL","submitted_at":"2026-05-20T17:51:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mem-π is a framework using a dedicated model and decision-content decoupled RL to generate context-specific guidance on demand for LLM agents, outperforming retrieval baselines by over 30% on web navigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20025","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AutoResearchClaw: Self-Reinforcing Autonomous Research with Human-AI Collaboration","primary_cat":"cs.AI","submitted_at":"2026-05-19T15:49:51+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19576","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Library Drift: Diagnosing and Fixing a Silent Failure Mode in Self-Evolving LLM Skill Libraries","primary_cat":"cs.AI","submitted_at":"2026-05-19T09:19:56+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18729","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Robo-Cortex: A Self-Evolving Embodied Agent via Dual-Grain Cognitive Memory and Autonomous Knowledge Induction","primary_cat":"cs.RO","submitted_at":"2026-05-18T17:52:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Robo-Cortex proposes a self-evolving embodied navigation agent using dual-grain cognitive memory and autonomous knowledge induction from trajectories, reporting SPL gains on IGNav, AR, AEQA and preliminary real-robot tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18930","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OEP: Poisoning Self-Evolving LLM Agents via Locally Correct but Non-Transferable Experiences","primary_cat":"cs.CR","submitted_at":"2026-05-18T14:08:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OEP poisons self-evolving LLM agents by constructing clean edge-case experiences that appear locally valid yet cause harmful over-generalization during reflection, achieving over 50% attack success rate on GPT-4o agents across three domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17721","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EXG: Self-Evolving Agents with Experience Graphs","primary_cat":"cs.AI","submitted_at":"2026-05-18T00:50:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EXG is an experience graph framework for self-evolving LLM agents that supports online real-time growth and offline reuse to enhance solution quality and efficiency on code generation and reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15384","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Is One Score Enough? Rethinking the Evaluation of Sequentially Evolving LLM Memory","primary_cat":"cs.LG","submitted_at":"2026-05-14T20:15:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SeqMem-Eval reveals that high final accuracy in sequential LLM memory tasks often coexists with substantial forgetting and negative transfer, exposing stability-adaptability trade-offs hidden by standard aggregate metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14477","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Test-Time Learning with an Evolving Library","primary_cat":"cs.LG","submitted_at":"2026-05-14T07:18:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EvoLib enables LLMs to accumulate, reuse, and evolve knowledge abstractions from inference trajectories at test time, yielding substantial gains on math reasoning, code generation, and agentic benchmarks without parameter updates or supervision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14133","ref_index":72,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ClawForge: Generating Executable Interactive Benchmarks for Command-Line Agents","primary_cat":"cs.AI","submitted_at":"2026-05-13T21:34:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ClawForge is a generator framework that creates reproducible executable benchmarks for command-line agents under state conflict, with ClawForge-Bench showing frontier models reach at most 45.3% strict accuracy and that state inspection drives most performance gaps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13941","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvolveMem:Self-Evolving Memory Architecture via AutoResearch for LLM Agents","primary_cat":"cs.LG","submitted_at":"2026-05-13T17:12:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EvolveMem enables autonomous self-evolution of LLM memory retrieval configurations via LLM diagnosis and safeguards, delivering 25.7% gains over strong baselines on LoCoMo and 18.9% on MemBench with positive cross-benchmark transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12039","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SkillGraph: Skill-Augmented Reinforcement Learning for Agents via Evolving Skill Graphs","primary_cat":"cs.CL","submitted_at":"2026-05-12T12:21:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SkillGraph represents skills as nodes in an evolving directed graph with typed dependency edges and updates the graph from RL trajectories to boost compositional task performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10923","ref_index":57,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Skill Lifecycle Management for Agentic Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:55:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SLIM dynamically optimizes the active external skill set in agentic RL via leave-one-skill-out marginal contribution estimates and lifecycle operations, delivering a 7.1% average gain over baselines on ALFWorld and SearchQA while showing some skills remain externally useful.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"ALFWorld covers Pick, Look, Clean, Heat, Cool, and Pick2 household tasks, while SearchQA covers NQ [25], TriviaQA [24], PopQA [35], HotpotQA [64], 2Wiki [20], MuSiQue [51], and Bamboogle [42]. We compare against prompt-based, agent/memory-based, and RL-based baselines, including ReAct [ 65], Reflexion [49], Mem0 [10], ExpeL [72], GRPO [46], EvolveR [57], SkillRL [59], and Skill0 [33]. Full baseline details are provided in Appendix B.2. Evaluation.We report success rate on both benchmarks. A trial succeeds if the agent completes the ALFWorld objective or returns a correct SearchQA final answer under the shared benchmark 6 Table 1: Main results on ALFWorld and SearchQA. All entries report success rate."},{"citing_arxiv_id":"2605.10663","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Evolving-RL: End-to-End Optimization of Experience-Driven Self-Evolving Capability within Agents","primary_cat":"cs.AI","submitted_at":"2026-05-11T14:43:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Evolving-RL jointly optimizes experience extraction and utilization in LLM agents via RL with separate evaluation signals, delivering up to 98.7% relative gains on out-of-distribution tasks in ALFWorld and Mind2Web.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"demonstrated that injecting past experiences can significantly enhance downstream decision-making, their effectiveness is ultimately bounded by the underlying model's ability to extract and leverage these experiences [ 18]-a process that heavily relies on the model possessing robust in-context learning [4] and abstract reasoning capabilities. Several recent studies [30, 29] have explored reinforcement learning as a way to enhance the model's ability to utilize experience. However, these methods do not optimize self-evolution as a unified process. They improve only the utilization phase, while relying on stronger external models or hand-crafted filtering mechanisms to ensure the quality of extracted experience. Such a decoupled"},{"citing_arxiv_id":"2605.10064","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MAGE: Multi-Agent Self-Evolution with Co-Evolutionary Knowledge Graphs","primary_cat":"cs.AI","submitted_at":"2026-05-11T06:39:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MAGE uses a four-subgraph co-evolutionary knowledge graph plus dual bandits to externalize and retrieve experience for stable self-evolution of frozen language-model agents, showing gains on nine diverse benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"be retained across iterations, and in what form should it be represented?Existing self-evolving agents typically answer this question by storing cross-iteration knowledge in one of three forms. Natural-language feedback [25, 15] is easy to generate but can collapse into generic advice when the teacher is weak or the failure mode is systematic. Flat episodic memory [ 21, 32, 23, 2] stores trajectories or principles, but provides little structure for curriculum selection or dependency-aware reuse. Implicit reinforcement signals [19, 5, 4, 11] can drive parameter updates, but make the learned curriculum difficult to inspect and require a malleable student model. Motivated by these limitations, Preprint. arXiv:2605.10064v1 [cs."},{"citing_arxiv_id":"2605.08703","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RewardHarness: Self-Evolving Agentic Post-Training","primary_cat":"cs.AI","submitted_at":"2026-05-09T05:32:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RewardHarness self-evolves a tool-and-skill library from 100 preference examples to reach 47.4% accuracy on image-edit evaluation, beating GPT-5, and yields stronger RL-tuned models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Rishabh Singh, and Sida I Wang. Swe-rl: Advancing llm reasoning via reinforcement learning on open software evolution.arXiv preprint arXiv:2502.18449, 2025. [29] Keming Wu, Sicong Jiang, Max Ku, Ping Nie, Minghao Liu, and Wenhu Chen. Editreward: A human-aligned reward model for instruction-guided image editing.arXiv preprint arXiv:2509.26346, 2025. [30] Rong Wu, Xiaoman Wang, Jianbiao Mei, Pinlong Cai, Daocheng Fu, Cheng Yang, Licheng Wen, Xuemeng Yang, Yufan Shen, Yuxin Wang, et al. Evolver: Self-evolving llm agents through an experience-driven lifecycle.arXiv preprint arXiv:2510.16079, 2025. [31] Peng Xia, Kaide Zeng, Jiaqi Liu, Can Qin, Fang Wu, Yiyang Zhou, Caiming Xiong, and Huaxiu Yao. Agent0: Unleashing"},{"citing_arxiv_id":"2605.08693","ref_index":24,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SkillMaster: Toward Autonomous Skill Mastery in LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-05-09T05:03:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SkillMaster enables LLM agents to autonomously develop skills via trajectory review, counterfactual evaluation, and DualAdv-GRPO training, boosting success rates by 8.8% on ALFWorld and 9.3% on WebShop.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08013","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning CLI Agents with Structured Action Credit under Selective Observation","primary_cat":"cs.AI","submitted_at":"2026-05-08T17:02:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CLI agents trained with RL benefit from selective observation via σ-Reveal and structured credit assignment via A³ that leverages AST action sub-chains and trajectory margins.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Experiments use six benchmark streams normalized to a unified shell interface over filesystem workspaces. Five come from published resources, with AgentBench contributing operating-system and database tasks [29], DataBench covering tabular question answering [13], EHRCon testing clinical note-EHR consistency [23, 24, 12, 19], and TableBench targeting structured table reasoning [59]. Each instance is mapped to a common schema with a user instruction, reference bash solution, initial and optional gold file trees, and a programmatic reward over executed outputs or workspace state. ShellOps contains a 1624 task standard corpus, with 714 in-distribution tasks used for scalable training and evaluation, and ShellOps-Pro adds 150 harder out-of-distribution tasks, whose workspaces contain"},{"citing_arxiv_id":"2605.07358","ref_index":122,"ref_count":4,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Comprehensive Survey on Agent Skills: Taxonomy, Techniques, and Applications","primary_cat":"cs.IR","submitted_at":"2026-05-08T07:10:26+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":2,"top_context_role":"background","top_context_polarity":"background","context_text":"more test-time compute-as visible in AgentEvolver [71], Explorer [122], and ReasoningBank [83], with the UI data- scale study [123] showing that scale alone can move metrics substantially. Writable memories, bridge objects, and skill repositories further introduce long-horizon trust requirements that current evaluations only partly address [30], [65], [92], [124]-[126]. Robust self-evolution therefore depends on trust- worthy lineage, interpretable revision, and causal evaluation, not on persistence and reuse alone. VIII. FUTURERESEARCHDIRECTIONS We outline five directions for advancing agent skill research. JOURNAL OF LATEX CLASS FILES, VOL. 18, NO. 9, SEPTEMBER 2020 17 Unified Skill Schema.Despite broad adoption of the skill"},{"citing_arxiv_id":"2605.07180","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning Agent Routing From Early Experience","primary_cat":"cs.CL","submitted_at":"2026-05-08T03:18:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BoundaryRouter routes queries to LLM or agent using early experience memory from a seed set, cutting inference time 60.6% versus always using agents and raising performance 28.6% versus always using direct LLM inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06130","ref_index":48,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Skill1: Unified Evolution of Skill-Augmented Agents via Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2026-05-07T12:33:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Skill1 trains a single RL policy to co-evolve skill selection, utilization, and distillation in language model agents from one task-outcome reward, using low-frequency trends to credit selection and high-frequency variation to credit distillation, outperforming baselines on ALFWorld and WebShop.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27221","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Web2BigTable: A Bi-Level Multi-Agent LLM System for Internet-Scale Information Search and Extraction","primary_cat":"cs.AI","submitted_at":"2026-04-29T21:43:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Web2BigTable introduces a bi-level multi-agent system that achieves new state-of-the-art results on wide-coverage and deep web-to-table search benchmarks through orchestration, coordination, and closed-loop reflection.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"provides a highly flexible, training-free alternative that overcomes static reasoning bottlenecks, seamlessly scaling to the extreme breadth of open-web information extraction. Self-Evolving AgentsAn expanding body of literature investigates how LLM agents progres- sively enhance their capabilities through experiential learning without parameter updates [6]. Frameworks such as SAMULE [7] and EvolveR [20] distil transferable insights through multi-level reflection and closed-loop self-distillation. Concurrently, reinforcement learning is increasingly em- ployed to construct and exploit structured skill libraries from sequential rollouts, as demonstrated by SAGE [17] and SkillRL [22]. Other approaches facilitate continual learning via artefact-centric"},{"citing_arxiv_id":"2604.17091","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GenericAgent: A Token-Efficient Self-Evolving LLM Agent via Contextual Information Density Maximization (V1.0)","primary_cat":"cs.CL","submitted_at":"2026-04-18T17:59:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GenericAgent outperforms other LLM agents on long-horizon tasks by maximizing context information density with fewer tokens via minimal tools, on-demand memory, trajectory-to-SOP evolution, and compression.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"this space, Agent-Pro [40] studies policy-level reflection and optimization, showing that agents can revise their own behavioral policies without updating model parameters. Voyager [8] demonstrates a stronger form of accumulation in a specialized environment by continually storing verified executable skills. Broader general-purpose work mostly evolves the agent through textual abstractions. EvolveR [41], FLEX [42], AgentEvolver [43], and experience-driven lifelong learning [44] all convert trajectories into strategic principles, reflections, or structured knowledge units that help later execution. This is an important step beyond simple history reuse, but in most cases the retained experience remains natural-language guidance rather than executable capability."},{"citing_arxiv_id":"2604.15877","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Experience Compression Spectrum: Unifying Memory, Skills, and Rules in LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-04-17T09:26:25+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10674","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Skill-SD: Skill-Conditioned Self-Distillation for Multi-turn LLM Agents","primary_cat":"cs.LG","submitted_at":"2026-04-12T14:57:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Skill-SD turns an agent's completed trajectories into dynamic natural-language skills that condition only the teacher in self-distillation, yielding 14-42% gains over RL and OPSD baselines on multi-turn agent benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"Numerically, ρon t equals the GRPO ratio rt, but the role is different: ρon t weights the auxiliary reverse-KL term, whereas rt defines the clipped reinforcement-learning surrogate. Off-policy comparison branch.For the teacher-rollout comparison branch, µ=π tea θold (· | x,S(x),y <t), which yields ρoff t = πstu θ (yt |x,y <t) sg \u0010 πtea θold (yt |x,S(x),y <t) \u0011 . (32) This is the off-policy correction emphasized in RPG-style analyses (Zhang et al., 2026b): without it, the differentiated k3 term would not follow the intended reverse-KL transfer direction under teacher sampling. D.4 Why SDL and GRPO need distinct interpretations Skill-SD uses two different quantities for two different jobs: • The main-branch SDL weight ρon"},{"citing_arxiv_id":"2605.02913","ref_index":141,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Generate, Filter, Control, Replay: A Comprehensive Survey of Rollout Strategies for LLM Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-08T00:53:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"This survey introduces the Generate-Filter-Control-Replay (GFCR) taxonomy to structure rollout pipelines for RL-based post-training of reasoning LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.08403","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPIRAL: Self-Evolving Action-Conditioned Video Generation via Reflective Planning Agents","primary_cat":"cs.CV","submitted_at":"2026-03-09T14:00:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SPIRAL is a closed-loop think-act-reflect framework using PlanAgent, VideoGenerator, and CriticAgent plus GRPO self-evolution to improve long-horizon action-conditioned video generation, with new dataset and benchmark showing gains over open-loop baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.21678","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentic Learner with Grow-and-Refine Multimodal Semantic Memory","primary_cat":"cs.AI","submitted_at":"2025-11-26T18:55:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ViLoMem is a dual-stream grow-and-refine memory system that separates visual and logical error patterns in MLLMs to improve pass@1 accuracy and reduce repeated mistakes across six multimodal benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}