{"total":12,"items":[{"citing_arxiv_id":"2605.22166","ref_index":10,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Adapting the Interface, Not the Model: Runtime Harness Adaptation for Deterministic LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-05-21T08:36:49+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19156","ref_index":14,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"How Far Are We From True Auto-Research?","primary_cat":"cs.AI","submitted_at":"2026-05-18T22:20:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ResearchArena shows that agent-generated papers fail top-tier acceptance standards primarily due to fabricated results, underpowered experiments, and plan-execution mismatches that vary sharply by agent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11922","ref_index":54,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"StepCodeReasoner: Aligning Code Reasoning with Stepwise Execution Traces via Reinforcement Learning","primary_cat":"cs.SE","submitted_at":"2026-05-12T10:36:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"StepCodeReasoner aligns code reasoning with verifiable stepwise execution traces via print anchors and bi-level GRPO reinforcement learning, reaching SOTA results on CRUXEval (91.1%) and LiveCodeBench (86.5%) for a 7B model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08580","ref_index":39,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Slipstream: Trajectory-Grounded Compaction Validation for Long-Horizon Agents","primary_cat":"cs.MA","submitted_at":"2026-05-09T00:47:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Slipstream uses asynchronous compaction with trajectory-grounded judge validation to improve long-horizon agent accuracy by up to 8.8 percentage points and reduce latency by up to 39.7%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05704","ref_index":8,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"SafeHarbor: Hierarchical Memory-Augmented Guardrail for LLM Agent Safety","primary_cat":"cs.CR","submitted_at":"2026-05-07T05:50:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SafeHarbor introduces a hierarchical memory-augmented guardrail with adversarial rule extraction and entropy-driven self-evolution to balance safety and utility in LLM agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03356","ref_index":61,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"POSTCONDBENCH: Benchmarking Correctness and Completeness in Formal Postcondition Inference","primary_cat":"cs.SE","submitted_at":"2026-05-05T04:29:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"POSTCONDBENCH is a new multilingual benchmark that evaluates LLM postcondition generation on real code using defect discrimination to assess completeness beyond surface matching.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00803","ref_index":20,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Can Coding Agents Reproduce Findings in Computational Materials Science?","primary_cat":"cs.SE","submitted_at":"2026-05-01T17:42:12+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"AutoMat benchmark shows current LLM coding agents achieve at most 54.1% success when reproducing computational materials science claims from papers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00425","ref_index":26,"ref_count":2,"confidence":0.35,"is_internal_anchor":false,"paper_title":"AEM: Adaptive Entropy Modulation for Multi-Turn Agentic Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2026-05-01T05:54:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"AEM adaptively modulates response-level entropy in agentic RL to improve credit assignment and exploration-exploitation balance, yielding gains on ALFWorld, WebShop, and SWE-bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00347","ref_index":39,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Odysseus: Scaling VLMs to 100+ Turn Decision-Making in Games via Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-01T02:05:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Odysseus adapts PPO with a turn-level critic and leverages pretrained VLM action priors to train agents achieving at least 3x average game progress over frontier models in long-horizon Super Mario Land.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00072","ref_index":80,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"XekRung Technical Report","primary_cat":"cs.CR","submitted_at":"2026-04-30T11:50:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"XekRung achieves state-of-the-art performance on cybersecurity benchmarks among same-scale models via tailored data synthesis and multi-stage training while retaining strong general capabilities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19606","ref_index":33,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"AblateCell: A Reproduce-then-Ablate Agent for Virtual Cell Repositories","primary_cat":"cs.AI","submitted_at":"2026-04-21T15:55:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AblateCell reproduces baselines in three single-cell perturbation repositories with 88.9% success and recovers ground-truth critical components with 93.3% accuracy via closed-loop ablation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.20857","ref_index":225,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Evo-Memory: Benchmarking LLM Agent Test-time Learning with Self-Evolving Memory","primary_cat":"cs.CL","submitted_at":"2025-11-25T21:08:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Evo-Memory is a new streaming benchmark and evaluation framework for self-evolving memory in LLM agents, unifying over ten memory modules and introducing the ReMem pipeline for continual improvement on multi-turn and reasoning datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}