{"total":172,"items":[{"citing_arxiv_id":"2605.23590","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Co-ReAct: Rubrics as Step-Level Collaborators for ReAct Agents","primary_cat":"cs.AI","submitted_at":"2026-05-22T12:59:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Co-ReAct adds step-level rubric guidance to ReAct agents via a GRPO-trained generator using list-wise ranking rewards, yielding consistent gains on DeepResearchBench and SQA-CS-V2.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20994","ref_index":92,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Context-Invariant Safety Alignment for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-20T10:33:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces AIR, an asymmetric regularization that anchors open-ended safety prompts to verifiable ones via stop-gradient, improving invariance and accuracy when combined with group preference optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20315","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mix-Quant: Quantized Prefilling, Precise Decoding for Agentic LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-19T17:50:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Mix-Quant quantizes prefilling to NVFP4 and keeps BF16 for decoding in agentic LLMs, achieving up to 3x prefilling speedup while largely preserving task performance on long-context and agentic benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19762","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Really Improves Mathematical Reasoning: Structured Reasoning Signals Beyond Pure Code","primary_cat":"cs.AI","submitted_at":"2026-05-19T12:37:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Controlled experiments show structured reasoning traces and higher-density math-domain samples improve mathematical reasoning more than pure executable code, with internal routing patterns reflecting these data effects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19528","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Camera-Robust 3D Localization: Equation-Anchored Tool-Use for MLLMs","primary_cat":"cs.CV","submitted_at":"2026-05-19T08:30:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Proposes an equation-anchored tool-use method for MLLMs that writes the pinhole back-projection equation in Chain-of-Thought and substitutes retrieved camera intrinsics and depths to achieve robustness in 3D object detection and visual grounding under rescaled intrinsics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19196","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Time to REFLECT: Can We Trust LLM Judges for Evidence-based Research Agents?","primary_cat":"cs.CL","submitted_at":"2026-05-18T23:55:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"REFLECT benchmark shows current LLM judges achieve below 55% accuracy detecting failures in evidence-based research agents, especially on evidence verification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19156","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How Far Are We From True Auto-Research?","primary_cat":"cs.AI","submitted_at":"2026-05-18T22:20:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ResearchArena shows that agent-generated papers fail top-tier acceptance standards primarily due to fabricated results, underpowered experiments, and plan-execution mismatches that vary sharply by agent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18133","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"An Empirical Study of Privacy Leakage Chains via Prompt Injection in Black-Box Chatbot Environments","primary_cat":"cs.CR","submitted_at":"2026-05-18T09:38:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical demonstration that prompt injection combined with web-tool use creates a feasible privacy-leakage chain in deployed black-box chatbot agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16565","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Skim: Speculative Execution for Fast and Efficient Web Agents","primary_cat":"cs.AI","submitted_at":"2026-05-15T19:12:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Skim profiles website patterns offline to enable fast-path speculative execution for web agents, cutting median cost by 1.9x and latency by 33.4% with no accuracy loss on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16217","ref_index":16,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Argus: Evidence Assembly for Scalable Deep Research Agents","primary_cat":"cs.CL","submitted_at":"2026-05-15T17:29:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Argus coordinates a Navigator and multiple Searchers via an evidence graph for deep research, reporting average gains of 5.5 points with one Searcher and 12.7 points with eight parallel Searchers across eight benchmarks, reaching 86.2 on BrowseComp with 64 Searchers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14133","ref_index":56,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ClawForge: Generating Executable Interactive Benchmarks for Command-Line Agents","primary_cat":"cs.AI","submitted_at":"2026-05-13T21:34:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ClawForge is a generator framework that creates reproducible executable benchmarks for command-line agents under state conflict, with ClawForge-Bench showing frontier models reach at most 45.3% strict accuracy and that state inspection drives most performance gaps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14002","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PolitNuggets: Benchmarking Agentic Discovery of Long-Tail Political Facts","primary_cat":"cs.AI","submitted_at":"2026-05-13T18:09:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PolitNuggets is a multilingual benchmark showing that AI agents struggle with fine-grained accuracy and efficiency when discovering long-tail political facts for elite biographies, linking performance to short-context extraction, multilingual robustness, and tool use.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13706","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Identifying AI Web Scrapers Using Canary Tokens","primary_cat":"cs.CR","submitted_at":"2026-05-13T15:53:57+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Unique canary tokens served to visiting scrapers can be recovered from LLM outputs to identify which scrapers feed data to which of 22 tested production LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13537","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Temper and Tilt Lead to SLOP: Reward Hacking Mitigation with Inference-Time Alignment","primary_cat":"cs.LG","submitted_at":"2026-05-13T13:47:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Temperature adjustment on the reference model generalizes inference-time alignment to SLOP ensembles of reward models, with a calibration algorithm that improves robustness to reward hacking while preserving alignment performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12882","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CiteVQA: Benchmarking Evidence Attribution for Trustworthy Document Intelligence","primary_cat":"cs.CL","submitted_at":"2026-05-13T01:54:42+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"CiteVQA requires models to cite specific document regions with bounding boxes alongside answers and finds that even the strongest MLLMs frequently cite the wrong region, with top SAA scores of only 76.0 for closed models and 22.5 for open-source ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11611","ref_index":8,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CuSearch: Curriculum Rollout Sampling via Search Depth for Agentic RAG","primary_cat":"cs.AI","submitted_at":"2026-05-12T06:42:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CuSearch reallocates rollout budget in RLVR toward deeper-search trajectories as a proxy for retrieval supervision density, yielding up to 11.8 exact-match gains over uniform GRPO sampling on ZeroSearch.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"If that bucket does not contain enough trajectories, the remaining bud- get is redistributed greedily to buckets with progressively smaller search counts according to the SDGA repair procedure. For example, when Smax = 5, the corresponding target allocation and priorities can be written as target lens= [0, 0, 0, 0, 0,K],priorities= [6, 5, 4, 3, 2, 1]. (8) C.3 SDGA-Phase SDGA-Phase is a phase-adaptive instantiation of SDGA. It maintains a global phase variable k∈ { 0, 1,. . ., Smax − 1}, initialized to 0 and constrained to be monotonically non-decreasing. At phasek, the target allocation places the full selection budget on bucketk+1: ts = \u001aKifs=k+1, 0 otherwise, (9) where K denotes the number of selected trajectories per update."},{"citing_arxiv_id":"2605.10862","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RUBEN: Rule-Based Explanations for Retrieval-Augmented LLM Systems","primary_cat":"cs.CL","submitted_at":"2026-05-11T17:10:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RUBEN discovers minimal rule sets explaining RAG LLM outputs via novel pruning and applies them to evaluate LLM safety against adversarial injections.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10834","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Controlled to the Wild: Evaluation of Pentesting Agents for the Real-World","primary_cat":"cs.AI","submitted_at":"2026-05-11T16:50:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A practical evaluation protocol for AI pentesting agents that uses validated vulnerability discovery, LLM semantic matching, and bipartite scoring to assess performance in realistic, complex targets.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[21] Reiichiro Nakano, Jacob Hilton, S. Balaji, Jeff Wu, Ouyang Long, Christina Kim, Christopher Hesse, Shantanu Jain, Vineet Kosaraju, W. Saunders, Xu Jiang, K. Cobbe, Tyna Eloundou, Gretchen Krueger, Kevin Button, Matthew Knight, Benjamin Chess, and John Schulman. Webgpt: Browser-assisted question- answering with human feedback.ArXiv, abs/2112.09332, dec 2021. [22] Sho Nakatani. Rapidpen: Fully Automated IP-to-Shell Penetration Testing with LLM-based Agents.ArXiv, abs/2502.16730, feb 2025. [23] Yichen Pan, Dehan Kong, Sida Zhou, Cheng Cui, Yifei Leng, Bingqian Jiang, Hangyu Liu, Yanyi Shang, Shuyan Zhou, Tongshuang Wu, and Zhengyang Wu. Webcanvas: Benchmarking Web Agents in Online Environments.ArXiv, abs/2406."},{"citing_arxiv_id":"2605.10224","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hypothesis-Driven Deep Research with Large Language Models: A Structured Methodology for Automated Knowledge Discovery","primary_cat":"cs.AI","submitted_at":"2026-05-11T09:04:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HDRI is a six-principle eight-stage framework for hypothesis-organized LLM research featuring gap-driven iteration, traceable fact reasoning, and subject locking, realized in INFOMINER with reported gains in fact density and completeness.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"soning methods enable inference over structured knowledge bases through link prediction and log- ical rule learning [23]. Recent work on retrieval-augmented genera- tion (RAG) [24] has improved the factual accu- racy of LLM outputs by grounding generation in retrieved evidence. Self-RAG [25] further intro- duced self-reflective mechanisms for adaptive re- trieval. The WebGPT system [26] demonstrated that training LLMs to browse the web can im- prove factual accuracy through evidence-based answers. Our fact reasoning framework extends these approaches in several ways. First, we introduce traceable reasoning chainsthat explicitly con- nect each derived (implicit) fact to its eviden- tial basis and reasoning logic. Second, we imple-"},{"citing_arxiv_id":"2605.09725","ref_index":31,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On-Policy Distillation with Best-of-N Teacher Rollout Selection","primary_cat":"cs.CV","submitted_at":"2026-05-10T19:49:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BRTS improves on-policy distillation by sampling multiple teacher rollouts and selecting the best one via a correctness-first then alignment priority rule, yielding gains on AIME and AMC math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09038","ref_index":22,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SearchSkill: Teaching LLMs to Use Search Tools with Evolving Skill Banks","primary_cat":"cs.AI","submitted_at":"2026-05-09T16:23:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SearchSkill improves exact match scores and retrieval efficiency on open-domain QA by conditioning LLM actions on skills from an evolving SkillBank updated from failure patterns via two-stage SFT.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[21] Reiichiro Nakano, Jacob Hilton, Suchir Balaji, Jeff Wu, Long Ouyang, Christina Kim, Christo- pher Hesse, Shantanu Jain, Vineet Kosaraju, William Saunders, Xu Jiang, Karl Cobbe, Tyna Eloundou, Gretchen Krueger, Kevin Button, Matthew Knight, Benjamin Chess, and John Schulman. Webgpt: Browser-assisted question-answering with human feedback, 2022. URL https://arxiv.org/abs/2112.09332. [22] Ofir Press, Muru Zhang, Sewon Min, Ludwig Schmidt, Noah A Smith, and Mike Lewis. Measuring and narrowing the compositionality gap in language models. InFindings of the Association for Computational Linguistics: EMNLP 2023, pages 5687-5711, 2023. [23] Yujia Qin, Shengding Hu, Yankai Lin, Weize Chen, Ning Ding, Ganqu Cui, Zheni Zeng, Yufei Huang, Chaojun Xiao, Chi Han, Yi Ren Fung, Yusheng Su, Huadong Wang, Cheng Qian,"},{"citing_arxiv_id":"2605.08769","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvoMAS: Learning Execution-Time Workflows for Multi-Agent Systems","primary_cat":"cs.AI","submitted_at":"2026-05-09T07:55:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EvoMAS trains a workflow adapter with policy gradients to dynamically instantiate stage-specific multi-agent workflows from a fixed agent pool, using explicit task-state construction and terminal success signals, and outperforms static baselines on GAIA, HLE, and DeepResearcher.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"com/yoheinakajima/babyagi, 2023. [17] Reiichiro Nakano, Anya Hilgard, Suchir Krishna, Miles Song, Nathan Lambert, Ryan Carroll, John Liu, Niharika Madhusudhan, Daniel Bishop, Yujia Weng, Eric Zelikman, Maxwell Nye, Long Ouyang Zhou, Jong Huang, Claire Kure, and John Schulman. WebGPT: Browser-assisted question-answering with human feedback.arXiv preprint arXiv:2112.09332, 2021. [18] OpenAI. Gpt-4 technical report, 2023. [19] Charles Packer, Sarah Wooders, Kevin Lin, Vivian Fang, Shishir G Patil, Ion Stoica, and Joseph E Gonzalez. Memgpt: Towards llms as operating systems.arXiv preprint arXiv:2310.08560, 2023. [20] Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. Gorilla: Large language model connected with massive APIs."},{"citing_arxiv_id":"2605.08580","ref_index":84,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Slipstream: Trajectory-Grounded Compaction Validation for Long-Horizon Agents","primary_cat":"cs.MA","submitted_at":"2026-05-09T00:47:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Slipstream uses asynchronous compaction with trajectory-grounded judge validation to improve long-horizon agent accuracy by up to 8.8 percentage points and reduce latency by up to 39.7%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08037","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Pairs: Your Language Model is Secretly Optimizing a Preference Graph","primary_cat":"cs.LG","submitted_at":"2026-05-08T17:26:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GraphDPO generalizes pairwise DPO to a graph-structured Plackett-Luce objective over DAGs induced by rollout rankings, enforcing transitivity with linear complexity and recovering DPO as a special case.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Small-margin preferences still matter-if you train them right.arXiv preprint arXiv:2602.00954, 2026. [38] Hanshi Sun, Momin Haider, Ruiqi Zhang, Huitao Yang, Jiahao Qiu, Ming Yin, Mengdi Wang, Peter Bartlett, and Andrea Zanette. Fast best-of-n decoding via speculative rejection.Advances in Neural Information Processing Systems, 37:32630-32652, 2024. [39] Reiichiro Nakano et al. Webgpt: Browser-assisted question-answering with human feedback. arXiv preprint arXiv:2112.09332, 2021. [40] Taneesh Gupta, Rahul Madhavan, Xuchao Zhang, Nagarajan Natarajan, Chetan Bansal, and Saravan Rajmohan. Multi-preference optimization: Generalizing dpo via set-level contrasts. arXiv preprint arXiv:2412.04628, 2024. [41] Taneesh Gupta, Rahul Madhavan, Xuchao Zhang, Chetan Bansal, and Saravan Rajmohan."},{"citing_arxiv_id":"2605.07153","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Reasoning: Reinforcement Learning Unlocks Parametric Knowledge in LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-08T02:40:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RL on binary rewards boosts LLM factual recall by ~27% relative across models by redistributing probability mass to latent correct answers rather than acquiring new knowledge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06216","ref_index":95,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TIDE: Every Layer Knows the Token Beneath the Context","primary_cat":"cs.CL","submitted_at":"2026-05-07T13:16:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TIDE augments standard transformers with per-layer token embedding injection via an ensemble of memory blocks and a depth-conditioned router to mitigate rare-token undertraining and contextual collapse.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05802","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Selective Rollout: Mid-Trajectory Termination for Multi-Sample Agent RL","primary_cat":"cs.LG","submitted_at":"2026-05-07T07:41:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A one-parameter early-termination gate based on mean pairwise prefix edit distance reduces wall-clock time by 10.7% and raises held-out success by 2.5 pp in GRPO on ALFWorld by cutting zero-advantage batch dilution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04542","ref_index":189,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Power Distribution Bridges Sampling, Self-Reward RL, and Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-06T06:42:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The power distribution is the target of power sampling, the closed-form solution to self-reward KL-regularized RL, and the basis for power self-distillation that matches sampling performance at lower cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03383","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GeoDecider: A Coarse-to-Fine Agentic Workflow for Explainable Lithology Classification","primary_cat":"cs.AI","submitted_at":"2026-05-05T05:42:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GeoDecider introduces a coarse-to-fine agentic workflow using LLMs for explainable lithology classification from well logs, combining a base classifier, tool-augmented reasoning, and geological refinement to outperform baselines on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03129","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PIIGuard: Mitigating PII Harvesting under Adversarial Sanitization","primary_cat":"cs.CR","submitted_at":"2026-05-04T20:13:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PIIGuard uses optimized hidden HTML fragments on webpages to block LLMs from leaking contact PII via indirect prompt injection, achieving at least 97% defense success across tested models while preserving benign QA utility.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02411","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"FitText: Evolving Agent Tool Ecologies via Memetic Retrieval","primary_cat":"cs.AI","submitted_at":"2026-05-04T10:01:24+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"LLM cost per lineage: nt calls; retrievals: nt +1. IBI (Iterative Bootstrapped Improvement).IBI applies a single global refinement prompt across all descriptions simultaneously. It accumulates exemplar tool blurbs across iterations and optionally uses an LLM judge to check sufficiency. LetD (0) be the initial description set. At each turnt: B(t) =B (t−1) ∪ [ d∈D (t−1) {blurb(r):r∈RET(d,T,k)}, (12) D(t) =PARSEBLOCKS \u0010 LLMrefine(D(t−1),B (t),q) \u0011 . (13) LLM cost (worst case, nt turns, m descriptions): nt refinement calls +nt optional judge calls; retrievals:(n t +1)·m. Scattershot.Scattershot introduces population-level diversity by sampling multiple can- didate descriptions per ancestor in parallel at high temperature, then aggregating via population-level voting. 18 Preprint. Under review. Algorithm 2DFSDT Reasoning Loop with Dynamic Retrieval Require:Noden,D max,W,Q max, answer setS, memoryM, function schemaF Ensure:Backtrack distanceb; modifiesS,M,F 1:ifδ(n)≥D max orn.prunedthen 2:returnℓ prune 3:end if 4:ifn.terminalthen 5:S.add(n) 6:returnℓ ans 7:end if 8:fori←1 toWdo 9:if|S | ≥ S max orQ>Q max then return∞ 10:end if 11:ifch(n)̸=∅then 12:Inject diversity prompt summarising prior children 13:end if 14:o←LLM(H(n),F(n));Q←Q+1 15:ifocontains pseudo-tool blocksthen 16:n th ←Thought child 17:ifnot duplicate inMthen 18:tools←SELECTANDRUNSTRATEGY(o,F, LLM,M) 19:F(n th)← F(n th)∪tools 20:M.update(o) 21:end if 22:end if 23:foreach function call(a,x)∈odo 24:n a ←Action child;n x ←ActionInput child 25:(obs,κ)←EXEC(a,x,F(n)) 26:ifκ=1thenreplaceawith sentinel 27:end if 28:ifκ=3thenn x.terminal←True 29:end if 30:ifκ∈ {1, 2, 4}thenn x.pruned←True 31:end if 32:end for 33:foreach childc∈ch(n)do 34:b←DFSDT(c,D max,W,Q max,S,M,F) 35:if|S | ≥ S max then return∞ 36:end if 37:ifb>1then returnb−1 38:end if 39:end for 40:end for 41:return1 The following subroutines are shared between Scattershot and Memetic (Algorithm 1): For each ancestor a, let Ea ={blurb(r):r∈RET(a , T , k)"},{"citing_arxiv_id":"2605.01428","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hallucinations Undermine Trust; Metacognition is a Way Forward","primary_cat":"cs.CL","submitted_at":"2026-05-02T12:59:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs need metacognition to align expressed uncertainty with their actual knowledge boundaries, moving beyond knowledge expansion to reduce confident errors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00505","ref_index":132,"ref_count":4,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLM-Oriented Information Retrieval: A Denoising-First Perspective","primary_cat":"cs.IR","submitted_at":"2026-05-01T08:30:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Argues for a denoising-first paradigm in LLM-oriented information retrieval, framing challenges via a four-stage progression and providing a taxonomy of signal-to-noise optimization techniques across the pipeline.","context_count":2,"top_context_role":"background","top_context_polarity":"background","context_text":"Closed-Loop Training (§3.5) Provenance & Trust Quality Canonicalization Temporal Validity Structure-aware Indexing Watermark [84]; SynthID [31]; DetectGPT [120]; MAGE [95]; Spiral [20]; Cocktail [30]; GLTR [47]; Model collapse [159]; LLM adoption [97]; CopyrightDetective [216, 217] MinHash [10]; Dedup data [87]; CCNet [191]; RefinedWeb [136]; FineWeb [135]; Dolma [161] HOH [130]; VersionRAG [67]; EraRAG [215]; Theanine [128]; STAR-RAG [233]; T-GRAG [90] GraphRAG [37]; GRAG [62]; LightRAG [53]; KG2RAG [232]; G-Retriever [58]; EDC [213]; RAKG [219]; RAPTOR [151]; PageIndex [221]; HippoRAG [54] Query Transformation Hybrid Retrieval Distractor-aware Training Fine-grained Matching HyDE [44]; Query2Doc [180]; Step-Back [226]; GenRead [207]; Self-Ask [140]; Enhance-QR [203];"},{"citing_arxiv_id":"2605.00425","ref_index":16,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AEM: Adaptive Entropy Modulation for Multi-Turn Agentic Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2026-05-01T05:54:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"AEM adaptively modulates response-level entropy in agentic RL to improve credit assignment and exploration-exploitation balance, yielding gains on ALFWorld, WebShop, and SWE-bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27962","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Language Models Refine Mechanical Linkage Designs Through Symbolic Reflection and Modular Optimisation","primary_cat":"cs.AI","submitted_at":"2026-04-30T14:56:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A modular LM-plus-optimizer system with symbolic abstraction reduces geometric error by up to 68% and improves structural validity by up to 134% over monolithic baselines across six motion targets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27955","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GUI Agents with Reinforcement Learning: Toward Digital Inhabitants","primary_cat":"cs.AI","submitted_at":"2026-04-30T14:51:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper delivers the first comprehensive overview of RL for GUI agents, organizing methods into offline, online, and hybrid strategies while analyzing trends in rewards, efficiency, and deliberation to outline a future roadmap.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27859","ref_index":61,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Agentic Reinforcement Learning In Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-04-30T13:43:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The paper reviews conceptual foundations, methodological innovations, effective designs, critical challenges, and future directions for LLM-based Agentic Reinforcement Learning.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Tool utilization involves the agent learning when and how to interface with external APIs, calculators, or search engines, thereby overcoming the knowledge cut-off and factual hallucination issues inherent to standalone LLMs [10]. Meanwhile, persistent memory mechanisms, ranging from vector databases to in-context summarization, enable the agent to retain context across multiple turns of interaction [61], facilitating a form of experiential learning that mirrors biological cognition [127]. Through this synergistic architecture, Agentic RL effectively bridges the chasm Agentic RL, LLMs, Designs. 1 arXiv:2604.27859v2 [cs.AI] 6 May 2026 2 Fangming Cui, Ruixiao Zhu, Cheng Fang, Sunan Li, and Jiahong Li between passive language models and truly autonomous, self-improving systems capable of meta-reasoning."},{"citing_arxiv_id":"2604.27221","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Web2BigTable: A Bi-Level Multi-Agent LLM System for Internet-Scale Information Search and Extraction","primary_cat":"cs.AI","submitted_at":"2026-04-29T21:43:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Web2BigTable introduces a bi-level multi-agent system that achieves new state-of-the-art results on wide-coverage and deep web-to-table search benchmarks through orchestration, coordination, and closed-loop reflection.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"rows are submitted for evaluation. The auto-generated orchestrator skill selects entity-based decomposition with adaptive region splitting, achieving 93.8% Row F1 vs. 12.8%-26.8% for single-agent and skill-less baselines. 4 Related Work Autonomous Web Search and Deep Research AgentsEarly LLM web search focused on single-turn retrieval to mitigate hallucinations (e.g., WebGPT [15], WebGLM [14]). The paradigm subsequently shifted towards autonomous, multi-step web navigation, catalysed by benchmarks like WebArena [33] and Mind2Web [4]. Recently, this trajectory culminated in deep research systems designed for exhaustive, long-horizon investigations. Works like WebThinker [12] and Search-R1 [10] employ reinforcement learning for this purpose, while proprietary frameworks"},{"citing_arxiv_id":"2604.26590","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Recommendations for Efficient and Responsible LLM Adoption within Industrial Software Development","primary_cat":"cs.SE","submitted_at":"2026-04-29T12:15:31+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A multi-case study plus survey produces seven actionable recommendations for efficient and responsible LLM use in industrial software engineering.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25707","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Citation Selection to Citation Absorption: A Measurement Framework for Generative Engine Optimization Across AI Search Platforms","primary_cat":"cs.IR","submitted_at":"2026-04-28T14:34:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A measurement study of 602 prompts across ChatGPT, Google AI Overview, and Perplexity finds that citation selection breadth and absorption depth diverge, with high-influence pages being longer, structured, and evidence-rich.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25562","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SnapGuard: Lightweight Prompt Injection Detection for Screenshot-Based Web Agents","primary_cat":"cs.CR","submitted_at":"2026-04-28T12:32:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SnapGuard detects prompt injection attacks on screenshot-based web agents via visual stability indicators and contrast-polarity textual signals, reaching F1 0.75 while running 8x faster than GPT-4o with no added memory cost.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"are made directly from rendered visual inputs, such assumptions no longer hold. Complementary efforts in the image domain attempt to bridge this gap by exploiting robustness discrepancies between benign and adversarial inputs through input mutations [48], intro- ducing smoothing-based mechanisms to suppress patch-style visual attacks [36], or jointly modeling unimodal and cross-modal risk sig- nals [30]. However, these approaches either rely on expensive VLM inference or target general vision-language model safety rather than prompt injection detection in web agent pipelines, limiting their applicability across diverse attack types. 3 Threat Model 3.1 Problem Formulation Screenshot-based Web Agent.The pipeline of screenshot-based web agent operates solely on rendered webpage screenshots rather"},{"citing_arxiv_id":"2604.25345","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Plausible but Wrong: A case study on Agentic Failures in Astrophysical Workflows","primary_cat":"cs.AI","submitted_at":"2026-04-28T08:01:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"CMBAgent achieves high accuracy on well-specified astrophysical tasks with context but generates silent, plausible-yet-incorrect outputs on reasoning-challenging problems, with no self-diagnosis of inconsistencies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24665","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Benchmarking Source-Sensitive Reasoning in Turkish: Humans and LLMs under Evidential Trust Manipulation","primary_cat":"cs.CL","submitted_at":"2026-04-27T16:26:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Turkish speakers show a robust preference for -DI in high-trust contexts and -mIs in low-trust contexts, while LLMs exhibit inconsistent, often reversed, or base-rate-driven behavior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23338","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Systematic Survey of Security Threats and Defenses in LLM-Based AI Agents: A Layered Attack Surface Framework","primary_cat":"cs.CR","submitted_at":"2026-04-25T14:57:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new 7x4 taxonomy organizes agentic AI security threats by architectural layer and persistence timescale, revealing under-explored upper layers and missing defenses after surveying 116 papers.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[4], and increasingly coordinates with other autonomous agents to accomplish long-horizon tasks [5]-[9]. This shift is driven partly by the emergent capabilities that arise at K. Chu is with the Department of Computer Science and Engineering, Uni- versity of Connecticut, Storrs, CT 06269 USA. E-mail: kexin.chu@uconn.edu. scale [10] and partly by new infrastructure for tool and memory integration [11]. This architectural complexity generates vulnerabilities that areemergent,compositional, andtemporally extended. Security models designed for stateless systems cannot capture these properties. Consider three examples that motivate this survey: • An adversarially crafted document retrieved during a routine web search can silently corrupt an agent's long-term"},{"citing_arxiv_id":"2604.23283","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Revisable by Design: A Theory of Streaming LLM Agent Execution","primary_cat":"cs.LG","submitted_at":"2026-04-25T12:55:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"LLM agents achieve greater flexibility during execution by classifying actions via a reversibility taxonomy and using an Earliest-Conflict Rollback algorithm that matches full-restart quality while wasting far less completed work.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21896","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Nemobot Games: Crafting Strategic AI Gaming Agents for Interactive Learning with Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-04-23T17:46:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Nemobot is an LLM-powered platform for creating and refining strategic game agents across dictionary, solvable, heuristic, and learning-based games, moving toward self-programming AI.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20146","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SAKE: Self-aware Knowledge Exploitation-Exploration for Grounded Multimodal Named Entity Recognition","primary_cat":"cs.IR","submitted_at":"2026-04-22T03:17:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SAKE is an agentic framework for GMNER that uses uncertainty-based self-awareness and reinforcement learning to balance internal knowledge exploitation with adaptive external exploration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19657","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"An AI Agent Execution Environment to Safeguard User Data","primary_cat":"cs.CR","submitted_at":"2026-04-21T16:45:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GAAP guarantees confidentiality of private user data for AI agents by enforcing user-specified permissions deterministically through persistent information flow tracking, without trusting the agent or requiring attack-free models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Jamie Hayes, Michael Ilie, Juliette Pluto, Shuang Song, Harsh Chaud- hari, Ilia Shumailov, Abhradeep Thakurta, Kai Yuanqing Xiao, An- dreas Terzis, and Florian Tramèr. 2025. The Attacker Moves Sec- ond: Stronger Adaptive Attacks Bypass Defenses Against Llm Jail- breaks and Prompt Injections. (2025). arXiv:2510.09023 [cs.LG] https://arxiv.org/abs/2510.09023 [48] OpenAI. 2023. ChatGPT Plugins.OpenAI Blog(2023). [49] OpenAI. 2025.Guardrails - OpenAI Agents SDK Documentation. Ope- nAI.https://openai.github.io/openai-agents-python/guardrails/ [50] OWASP. 2024. OWASP Top 10 for LLM Applications 2025.https:// genai.owasp.org/resource/owasp-top-10-for-llm-applications-2025/ [51] Nils Palumbo, Sarthak Choudhary, Jihye Choi, Prasad Chalasani, and"},{"citing_arxiv_id":"2604.19144","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ReflectMT: Internalizing Reflection for Efficient and High-Quality Machine Translation","primary_cat":"cs.CL","submitted_at":"2026-04-21T06:48:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ReflectMT internalizes reflection via two-stage RL to enable direct high-quality machine translation that outperforms explicit reasoning models like DeepSeek-R1 on WMT24 while using 94% fewer tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18847","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Human-Guided Harm Recovery for Computer Use Agents","primary_cat":"cs.AI","submitted_at":"2026-04-20T21:12:40+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}