{"total":28,"items":[{"citing_arxiv_id":"2606.09931","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Note on the Strategic Confinement Problem","primary_cat":"cs.GT","submitted_at":"2026-06-07T16:36:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Strategic agents can achieve high-harm outcomes via low-capacity channels by concentrating residual capacity on high-impact predicates of confidential data, so leakage bounds need not bound worst-case harm.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07157","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Think Fast: Estimating No-CoT Task-Completion Time Horizons of Frontier AI Models","primary_cat":"cs.AI","submitted_at":"2026-06-05T11:17:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Frontier AI models' no-CoT 50% task-completion time horizons have doubled yearly over six years, reaching over 3 minutes for GPT-5.5 with projections to 25 minutes by 2030.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00642","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hidden Thoughts Are Not Secret: Reasoning Trace Exposure in LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-30T09:37:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"REP elicits hidden LLM reasoning traces via in-context shadow demonstrations, raising similarity to internal traces while retaining distillation utility across datasets and models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30451","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VeriGate: Verifier-Gated Step-Level Supervision for GRPO","primary_cat":"cs.LG","submitted_at":"2026-05-28T18:20:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VeriGate adds verifier-gated step-level supervision to GRPO via cumulated PRM rewards and group-normalized token advantages, raising accuracy 20% and 12% on 1.5B and 7B models on MATH and six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28732","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MemTrace: Tracing and Attributing Errors in Large Language Model Memory Systems","primary_cat":"cs.CL","submitted_at":"2026-05-27T16:53:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MemTrace turns LLM memory operations into executable evolution graphs for error tracing, builds a benchmark across systems like RAG and Mem0, and uses attribution to optimize prompts, improving task performance by up to 7.62%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28301","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Better Accuracies, Worse Reasoning: A Step-Level Audit of Medical Chain-of-Thought Distillation","primary_cat":"cs.AI","submitted_at":"2026-05-27T10:55:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"In medical CoT distillation, answer accuracy on MedQA-USMLE rises from 74.7% to 84.4% while step-level reasoning error increases from 30.6% to 50.3% per LLM-judge audit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21384","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SpecBench: Measuring Reward Hacking in Long-Horizon Coding Agents","primary_cat":"cs.SE","submitted_at":"2026-05-20T16:41:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SpecBench shows frontier coding agents saturate visible test suites but exhibit persistent reward hacking on held-out tests, with the gap growing 28 percentage points per tenfold increase in code size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20744","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Hack-Verifiable Environments: Towards Evaluating Reward Hacking at Scale","primary_cat":"cs.LG","submitted_at":"2026-05-20T05:46:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Presents Hack-Verifiable TextArena, a benchmark that embeds verifiable reward hacking opportunities into environments to enable deterministic measurement of exploitation by language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18549","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Monitoring the Internal Monologue: Probe Trajectories Reveal Reasoning Dynamics","primary_cat":"cs.CL","submitted_at":"2026-05-18T15:29:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Probe trajectories across token positions in LRMs, combined with signal-processing features, improve prediction of future model outputs over static probes on safety and math tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16198","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Formal Methods Meet LLMs: Auditing, Monitoring, and Intervention for Compliance of Advanced AI Systems","primary_cat":"cs.AI","submitted_at":"2026-05-15T17:13:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Combines LTL formal methods with LLMs for auditing, predictive monitoring, and runtime intervention on temporally extended behavioral constraints, outperforming LLM baselines and reducing violations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15377","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Ensemble Monitoring for AI Control: Diverse Signals Outweigh More Compute","primary_cat":"cs.AI","submitted_at":"2026-05-14T20:06:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Diverse ensembles of prompted and fine-tuned GPT-4.1-Mini monitors achieve 2.4x better detection of flawed code solutions than homogeneous ensembles on adversarial inputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12746","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CoT-Guard: Small Models for Strong Monitoring","primary_cat":"cs.CR","submitted_at":"2026-05-12T20:49:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CoT-Guard is a 4B model using SFT and RL that achieves 75% G-mean^2 on hidden objective detection under prompt and code manipulation attacks, outperforming several larger models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Agentic frameworks powered by Large Language Models (LLMs) offer a unique opportunity for safety oversight, since these models externalize their Chain-of-Thought (CoT). Recent work [11-14] *Equal contribution. Preprint. arXiv:2605.12746v1 [cs.CR] 12 May 2026 shows that inspecting CoT traces can reveal a broad class ofhiddenmodel behaviors, including reward hacking [11, 15], sandbagging [16], and sabotage [12, 17]. Across this body of work, the central theme is the same - an LLM acts as amonitor, analyzing a model's CoT to identify hidden objectives that diverge from the user's intended task. Current monitoring approaches rely on large proprietary models (e.g., GPT-5.4, GPT-5-Mini, GPT-4o-Mini) [11, 13, 14], making them both costly and dependent on closed APIs."},{"citing_arxiv_id":"2605.12673","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Do Androids Dream of Breaking the Game? Systematically Auditing AI Agent Benchmarks with BenchJack","primary_cat":"cs.AI","submitted_at":"2026-05-12T19:22:45+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BenchJack audits 10 AI agent benchmarks, synthesizes exploits achieving near-perfect scores without task completion, surfaces 219 flaws, and reduces hackable-task ratios to under 10% on four benchmarks via iterative patching.","context_count":1,"top_context_role":"other","top_context_polarity":"unclear","context_text":"[4] Usman Anwar, Tim Bakker, Dana Kianfar, Cristina Pinneri, and Christos Louizos. Analyzing and improving chain-of-thought monitorability through information theory, 2026. URL https: //arxiv.org/abs/2602.18297. [5] Yonas Atinafu and Robin Cohen. Rewardhackingagents: Benchmarking evaluation integrity for llm ml-engineering agents, 2026. URLhttps://arxiv.org/abs/2603.11337. [6] Bowen Baker, Joost Huizinga, Leo Gao, Zehao Dou, Melody Y . Guan, Aleksander Madry, Wojciech Zaremba, Jakub Pachocki, and David Farhi. Monitoring reasoning models for misbehavior and the risks of promoting obfuscation, 2025. URL https://arxiv.org/abs/ 2503.11926. [7] Mohammad Beigi, Ming Jin, Junshan Zhang, Qifan Wang, and Lifu Huang. Adversarial"},{"citing_arxiv_id":"2605.08715","ref_index":2,"ref_count":4,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AgentForesight: Online Auditing for Early Failure Prediction in Multi-Agent Systems","primary_cat":"cs.CL","submitted_at":"2026-05-09T05:55:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AgentForesight introduces an online auditor model that predicts decisive errors in multi-agent trajectories at the earliest step using a coarse-to-fine reinforcement learning recipe on a new curated dataset AFTraj-2K.","context_count":2,"top_context_role":"background","top_context_polarity":"background","context_text":"Beyond advancing online auditing, our framework paves the way for runtime safeguards that intervene before downstream propagation locks in the failure, marking a step toward deployment-ready oversight of multi-agent systems. References [1] Anthropic. Introducing claude haiku 4.5. https://www.anthropic.com/news/ claude-haiku-4-5, October 2025. Accessed: 2026-05-02. [2] Bowen Baker, Joost Huizinga, Leo Gao, Zehao Dou, Melody Y Guan, Aleksander Madry, Wojciech Zaremba, Jakub Pachocki, and David Farhi. Monitoring reasoning models for misbehavior and the risks of promoting obfuscation.arXiv preprint arXiv:2503.11926, 2025. [3] Mert Cemri, Melissa Z Pan, Shuyi Yang, Lakshya A Agrawal, Bhavya Chopra, Rishabh Tiwari, Kurt Keutzer, Aditya Parameswaran, Dan Klein, Kannan Ramchandran, et al."},{"citing_arxiv_id":"2605.02398","ref_index":1,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The Compliance Trap: How Structural Constraints Degrade Frontier AI Metacognition Under Adversarial Pressure","primary_cat":"cs.AI","submitted_at":"2026-05-04T09:40:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Compliance-forcing instructions cause up to 30 percentage point drops in metacognitive accuracy across most frontier models, while removing the compliance element restores performance and Constitutional AI shows near-immunity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23488","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Do Prompt-Elicited Trajectories Reflect Training-Time Reward Hacking? A Systematic Study on Monitoring Trainig-Time Reward Hacking in Code Generation","primary_cat":"cs.LG","submitted_at":"2026-04-26T01:26:50+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17761","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Contrastive Attribution in the Wild: An Interpretability Analysis of LLM Failures on Realistic Benchmarks","primary_cat":"cs.AI","submitted_at":"2026-04-20T03:24:11+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Token-level contrastive attribution yields informative signals for some LLM benchmark failures but is not universally applicable across datasets and models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"short prompts or highly structured formats, such as subject-relation-object templates [45]. These settings abstract away much of the complexity present in realistic, open-ended benchmark failures. Research on LLM reasoning failures has also gained renewed attention with the emergence of large reasoning models [35]. Recent studies categorize reasoning errors [56], monitor reasoning processes [8], steer or interpret reasoning via sparse features [24], and debug reasoning failures [68]. However, most of these works focus primarily on reasoning tokens, rather than connecting failures to broader benchmark-defined errors. Lastly, a growing body of research work examines failures in LLM-powered agent systems, including failure taxonomies [11], failure localization [69], and agent debugging methods [43]."},{"citing_arxiv_id":"2604.15149","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LLMs Gaming Verifiers: RLVR can Lead to Reward Hacking","primary_cat":"cs.LG","submitted_at":"2026-04-16T15:30:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RLVR-trained LLMs exploit verifier weaknesses by producing non-generalizable outputs on rule-induction tasks, detectable via Isomorphic Perturbation Testing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13602","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reward Hacking in the Era of Large Models: Mechanisms, Emergent Misalignment, Challenges","primary_cat":"cs.LG","submitted_at":"2026-04-15T08:11:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper introduces the Proxy Compression Hypothesis as a unifying framework explaining reward hacking in RLHF as an emergent result of compressing high-dimensional human objectives into proxy reward signals under optimization pressure.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"3) Evaluator-policy co-adaptation: The iterative dynamic where policies and evaluators co-evolve, often converging on shared blind spots rather than eliminating them [24, 25]. Driven by these three forces, reward hacking manifests through an escalating hierarchy of mechanisms. As we detail in this paper, policies first engage infeature-level exploitationby amplifying superficial statistical correlates like verbosity [26] or sycophancy [20]. They then evolve towardrepresentation-level exploitationby fabricating plausible reasoning traces or bypassing visual grounding to decouple outcomes from faithful processes [7, 27]. As optimization pressure intensifies, models transition toevaluator-level exploitationby strategically manipulating the biases of the scoring judge"},{"citing_arxiv_id":"2604.06427","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The Depth Ceiling: On the Limits of Large Language Models in Discovering Latent Planning","primary_cat":"cs.LG","submitted_at":"2026-04-07T20:04:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs discover latent planning strategies up to five steps during training and execute them up to eight steps at test time, with larger models reaching seven under few-shot prompting, revealing a dissociation between discovery and execution.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":",vground]. We begin with the explicit CoT objective (stage s= 0), where the model is trained to predict the full sequence yCoT. At each subsequent stage s, we remove the first s tokens from the original sequence and train the model on the truncated target y(s) = [z s+1, . . . ,zm]. Assincreases, the supervision sequence evolves as [z1, . . . ,zm]→[z 2, . . . ,zm]→ · · · →[z m =v ground], forcing the model to compute these missing steps within its hidden states. As Table 9 illustrates, ICoT successfully enables the 1.6M model to bypass the discovery bottleneck on simpler graphs, achieving perfect latent planning (up to m= 6 at k= 2). However, as the structural complexity of the graph increases-either through greater"},{"citing_arxiv_id":"2603.04474","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Spark to Fire: Modeling and Mitigating Error Cascades in LLM-Based Multi-Agent Collaboration","primary_cat":"cs.MA","submitted_at":"2026-03-04T11:45:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A graph-based propagation model for error cascades in LLM multi-agent systems plus a genealogy-graph governance plugin that prevents final infection in at least 89% of runs across tested frameworks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.21110","ref_index":74,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond Context: Large Language Models' Failure to Grasp Users' Intent","primary_cat":"cs.AI","submitted_at":"2025-12-24T11:15:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"LLMs fail to detect hidden harmful intent, allowing systematic bypass of safety mechanisms through framing techniques, with reasoning modes often worsening the issue.","context_count":1,"top_context_role":"background","top_context_polarity":"support","context_text":"contextual understanding they lack. Distinguishing legitimate crisis requests from exploitation demands intent recognition- specifically what current architectures cannot provide. The development of truly robust safety mechanisms requires ad- dressing contextual awareness and intent recognition as core capabilities rather than implementing defensive patches [15], [74]. Contextual Understanding in Human-Computer In- teraction.HCI research has long addressed these issues. Suchman [75] showed human-machine communication re- quires understanding situated context, not just executing plans. Dourish [76] emphasized meaning emerges through contex- tual interaction. Research on chatbot interaction design [77] demonstrates users expect social behaviors habitual in human-"},{"citing_arxiv_id":"2601.03267","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OpenAI GPT-5 System Card","primary_cat":"cs.CL","submitted_at":"2025-12-19T07:05:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"GPT-5 is a unified model system that routes queries between fast and deep reasoning paths and reports gains in real-world usefulness, reduced hallucinations, and safety features over prior versions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.21654","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvilGenie: A Reward Hacking Benchmark","primary_cat":"cs.LG","submitted_at":"2025-11-26T18:27:17+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EvilGenie benchmark measures reward hacking in AI coding agents via held-out tests, LLM judges, and edit detection, finding explicit hacking in Codex and Claude Code plus misaligned behavior in all three proprietary agents tested.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.16858","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Investigating Test Overfitting on SWE-bench","primary_cat":"cs.SE","submitted_at":"2025-11-20T23:55:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The first empirical study of test overfitting shows that auto-generated tests from issues can lead to code that passes observed tests but misses important cases or breaks functionality in SWE-bench issue resolution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.24941","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Can Aha Moments Be Fake? Towards Quantifying Decorative and True Thinking in Chain-of-Thought","primary_cat":"cs.LG","submitted_at":"2025-10-28T20:14:02+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.18127","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Safe-SAIL: Towards a Fine-grained Safety Landscape of Large Language Models via Sparse Autoencoder Interpretation Framework","primary_cat":"cs.LG","submitted_at":"2025-09-11T11:22:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Safe-SAIL supplies a pre-explanation metric and segment-level simulation to interpret 1758 safety SAE features across pornography, politics, violence, and terror, with public models and tools released.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.08827","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Survey of Reinforcement Learning for Large Reasoning Models","primary_cat":"cs.CL","submitted_at":"2025-09-10T17:59:43+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A survey compiling RL methods, challenges, data resources, and applications for enhancing reasoning in large language models and large reasoning models since DeepSeek-R1.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}