{"total":18,"items":[{"citing_arxiv_id":"2605.12925","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AgentLens: Revealing The Lucky Pass Problem in SWE-Agent Evaluation","primary_cat":"cs.SE","submitted_at":"2026-05-13T03:00:57+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"10.7% of passing SWE-agent trajectories are Lucky Passes with chaotic behaviors, and a quality score based on process references changes model rankings across eight backends.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12913","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Revisiting DAgger in the Era of LLM-Agents","primary_cat":"cs.LG","submitted_at":"2026-05-13T02:40:28+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DAgger-style training with turn-level policy interpolation raises 4B and 8B LLM agents to 27.3% and 29.8% on SWE-bench Verified, beating several larger published systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11922","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"StepCodeReasoner: Aligning Code Reasoning with Stepwise Execution Traces via Reinforcement Learning","primary_cat":"cs.SE","submitted_at":"2026-05-12T10:36:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"StepCodeReasoner aligns code reasoning with verifiable stepwise execution traces via print anchors and bi-level GRPO reinforcement learning, reaching SOTA results on CRUXEval (91.1%) and LiveCodeBench (86.5%) for a 7B model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09134","ref_index":103,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"BoostAPR: Boosting Automated Program Repair via Execution-Grounded Reinforcement Learning with Dual Reward Models","primary_cat":"cs.AI","submitted_at":"2026-05-09T19:31:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BoostAPR boosts automated program repair by training a sequence-level assessor and line-level credit allocator from execution outcomes, then applying them in PPO to reach 40.7% on SWE-bench Verified.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08703","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RewardHarness: Self-Evolving Agentic Post-Training","primary_cat":"cs.AI","submitted_at":"2026-05-09T05:32:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RewardHarness self-evolves a tool-and-skill library from 100 preference examples to reach 47.4% accuracy on image-edit evaluation, beating GPT-5, and yields stronger RL-tuned models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05460","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Agentic Discovery of Exchange-Correlation Density 
Functionals","primary_cat":"cs.AI","submitted_at":"2026-05-06T21:36:43+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"An agentic LLM system discovers the XC functional SAFS26-a that improves on the ωB97M-V baseline by roughly 9% on a held-out thermochemistry dataset while warning that such systems can exploit unphysical shortcuts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01208","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Faithful Mobile GUI Agents with Guided Advantage Estimator","primary_cat":"cs.AI","submitted_at":"2026-05-02T02:54:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Faithful-Agent raises Trap SR in GUI agents from 13.88% to 80.21% via faithfulness-oriented SFT and GuAE-enhanced RFT with consistency rewards while retaining general performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00433","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Improving LLM Code Generation via Requirement-Aware Curriculum Reinforcement Learning","primary_cat":"cs.SE","submitted_at":"2026-05-01T06:10:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"REC RL improves LLM code generation by automatically assessing and optimizing requirement difficulty with adaptive curriculum sampling, yielding 1.23-5.62% Pass@1 gains over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18027","ref_index":77,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CodePivot: Bootstrapping Multilingual Transpilation in LLMs via Reinforcement Learning without Parallel Corpora","primary_cat":"cs.SE","submitted_at":"2026-04-20T09:52:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CodePivot uses Python as a pivot language plus an Aggressive-Partial-Functional RL reward to train a 7B model that outperforms much larger LLMs on multilingual code transpilation without parallel corpora.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16804","ref_index":70,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AutoOR: Scalably Post-training LLMs to Autoformalize Operations Research Problems","primary_cat":"cs.LG","submitted_at":"2026-04-18T03:24:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AutoOR uses synthetic data generation and RL post-training with solver feedback to enable 8B LLMs to autoformalize linear, mixed-integer, and non-linear OR problems, matching larger models on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13934","ref_index":63,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Towards Enabling An Artificial Self-Construction Software Life-cycle via Autopoietic 
Architectures","primary_cat":"cs.SE","submitted_at":"2026-04-15T14:46:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Proposes autopoietic architectures for self-constructing software as a fundamental shift in the SDLC, leveraging foundation models for autonomous evolution and maintenance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09107","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TensorHub: Scalable and Elastic Weight Transfer for LLM RL Training","primary_cat":"cs.DC","submitted_at":"2026-04-10T08:40:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TensorHub uses Reference-Oriented Storage to enable scalable weight transfer in LLM RL training by referencing replicated GPU weights, achieving up to 19x reduction in cross-datacenter stall time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07341","ref_index":86,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReCodeAgent: A Multi-Agent Workflow for Language-agnostic Translation and Validation of Large-scale Repositories","primary_cat":"cs.SE","submitted_at":"2026-04-08T17:54:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ReCodeAgent uses a multi-agent system to translate and validate large code repositories across multiple programming languages, achieving 60.8% higher test pass rates than prior neuro-symbolic and agentic methods on 118 real-world projects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06762","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ARuleCon: Agentic Security Rule Conversion","primary_cat":"cs.CR","submitted_at":"2026-04-08T07:28:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ARuleCon uses AI agents plus execution-based checks to convert SIEM rules across vendors with 15% higher fidelity than standard LLM translation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05820","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reinforcement Learning with Negative Tests as Completeness Signal for Formal Specification Synthesis","primary_cat":"cs.SE","submitted_at":"2026-04-07T12:53:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SpecRL uses the fraction of negative tests rejected by candidate specifications as a reward signal in RL training to produce stronger and more verifiable formal specifications than prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05341","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Curr-RLCER:Curriculum Reinforcement Learning For Coherence Explainable 
Recommendation","primary_cat":"cs.IR","submitted_at":"2026-04-07T02:25:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Curr-RLCER applies curriculum reinforcement learning with coherence-driven rewards to align generated explanations with predicted ratings in explainable recommendation systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02909","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Delay, Plateau, or Collapse: Evaluating the Impact of Systematic Verification Error on RLVR","primary_cat":"cs.LG","submitted_at":"2026-04-06T15:02:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Systematic false positives in verifiers can cause RLVR training to reach suboptimal plateaus or collapse, with outcomes driven by error patterns rather than overall error rate.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.10978","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Group-in-Group Policy Optimization for LLM Agent Training","primary_cat":"cs.LG","submitted_at":"2025-05-16T08:26:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GiGPO adds a hierarchical grouping mechanism to group-based RL so that LLM agents receive both global trajectory and local step-level credit signals, yielding >12% gains on ALFWorld and >9% on WebShop over GRPO while keeping the same rollout and memory footprint.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}