{"total":28,"items":[{"citing_arxiv_id":"2606.31748","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Addressing Over-Refusal in LLMs with Competing Rewards","primary_cat":"cs.LG","submitted_at":"2026-06-30T14:38:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SEAR trains one LLM via adversarial process rewards to explore harmful reasoning paths but flip to safe outputs, reducing over-refusal while preserving safety.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21296","ref_index":104,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Discriminatory Compliance: How LLMs Answer Queries from Protected Groups","primary_cat":"cs.CY","submitted_at":"2026-06-19T10:19:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"State-of-the-art LLMs respond inconsistently to queries from protected-group personas, with some responses omitting key information that should be provided.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05523","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CHASE: Adversarial Red-Blue Teaming for Improving LLM Safety using Reinforcement Learning","primary_cat":"cs.CL","submitted_at":"2026-06-04T00:06:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CHASE uses co-evolutionary RL with GRPO to harden LLMs against black-box prompt-rewriting attacks, cutting mean StrongREJECT scores by 43.2% on held-out families while keeping zero false refusals on benign prompts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03330","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FLIPS: Instance-Fingerprinting for LLMs via Pseudo-random Sequences","primary_cat":"cs.LG","submitted_at":"2026-06-02T08:39:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"FLIPS identifies LLM instances with 96% closed-set and 90% open-set accuracy by exploiting biases in generated binary random sequences across 237 instances.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02965","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"What Benchmarks Don't Measure: The Case for Evaluating Abstention Competence in Autonomous Agents","primary_cat":"cs.AI","submitted_at":"2026-06-01T23:52:56+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Current benchmarks overlook abstention competence in agents due to compliance bias; a new three-gap taxonomy and metrics (Safety Rate, Usability Rate, Informed Refusal Rate) demonstrate tunable safety-usability tradeoffs in preliminary tests across five model families.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00686","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Dialectics of Alignment: Harnessing Unsafe Knowledge for Dynamic Safety Routing","primary_cat":"cs.LG","submitted_at":"2026-05-30T11:49:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SafeMoE isolates unsafe knowledge in domain-specific LoRA experts and routes them via a lightweight gate trained on safe responses to produce safer and more informative LLM outputs with zero-shot generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00600","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Understanding the Self-Reflection Mechanisms of LLMs through Biased Attitude Associations","primary_cat":"cs.SI","submitted_at":"2026-05-30T07:57:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ReBias-Lens shows LLM self-reflection produces layer-wise smoothing of global valence fluctuations that reduces behavioral bias overall, yet selectively locks in and amplifies certain category-specific biases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30693","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Triaging Threats to Specialized Guardrails","primary_cat":"cs.CR","submitted_at":"2026-05-29T00:36:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces GuardZoo benchmark and RouteGuard router-expert system showing monolithic guardrails suffer task interference while specialized routing improves threat detection and generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29659","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Opir: Efficient Multi-Task Safety Classification for Toxicity, Jailbreaks, Hate Speech, and Harmful Content","primary_cat":"cs.LG","submitted_at":"2026-05-28T09:21:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Opir introduces efficient multi-task encoder models trained on a 996-category safety taxonomy that match or exceed larger baselines on most safety benchmarks while using under 100M parameters for edge variants.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28647","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Ethics of LLM Sandbox and Persona Dynamics","primary_cat":"cs.AI","submitted_at":"2026-05-27T15:52:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Argues that LLM guardrails generate unethical reality gaps by shifting epistemic risk to users and that ethical AI can become unethical when it prioritizes institutional reassurance over accurate perception.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.24552","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Ellipsoid Control: A White-list Jailbreak Defense via Benign Latent Modeling","primary_cat":"cs.CR","submitted_at":"2026-05-23T12:39:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Ellipsoid Control is a white-list test-time jailbreak defense that fits an anisotropic ellipsoid from benign activations to constrain projected gradient descent updates, aiming to improve the safety-utility tradeoff over black-list RepE methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.24154","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Palette: A Modular, Controllable, and Efficient Framework for On-demand Authorized Safety Alignment Relaxation in LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-22T19:22:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Palette identifies refusal directions via multi-objective search, internalizes them through lightweight adaptation, and supports on-demand multi-domain authorization via independent learning and parameter merging.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21545","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RefusalBench: Why Refusal Rate Misranks Frontier LLMs on Biological Research Prompts","primary_cat":"cs.SE","submitted_at":"2026-05-20T09:53:31+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"RefusalBench shows strict refusal rates fail to rank frontier LLMs correctly on biological safety, with provider effects and partial-compliance patterns that binary metrics miss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12429","ref_index":73,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Muse Spark Safety & Preparedness Report","primary_cat":"cs.CY","submitted_at":"2026-05-14T23:12:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Meta's safety report states that Muse Spark meets acceptable risk thresholds for release after mitigations reduced elevated pre-mitigation risks in chemical and biological domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09278","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EquiMem: Calibrating Shared Memory in Multi-Agent Debate via Game-Theoretic Equilibrium","primary_cat":"cs.AI","submitted_at":"2026-05-10T03:04:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EquiMem calibrates shared memory in multi-agent debate by computing a game-theoretic equilibrium from agent queries and paths, outperforming heuristics and LLM validators across benchmarks while remaining robust to adversarial agents.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Across recent works, three failure patterns present (Figure 1): (i) an over- confident contributor pushes a hallucinated entry past hedged auditors who defer to its confidence rather than challenge it [ 17, 72, 82], producing a corrupted memory that reads like established fact; (ii) over-confident auditors veto a tentative but correct contribution [ 14, 93], producing an over-curated memory that drops truly useful facts; (iii) all agents hedge (or over-confident) and nothing certain is committed [59, 62, 65], leaving memory under-populated. In all three cases, the commit decision depends on agents' self-reported confidence rather than on any check against the memory state itself, so debate alone cannot filter the errors."},{"citing_arxiv_id":"2605.08496","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Latent Personality Alignment: Improving Harmlessness Without Mentioning Harms","primary_cat":"cs.AI","submitted_at":"2026-05-08T21:21:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LPA uses fewer than 100 personality trait statements to train LLMs for harmlessness, matching the robustness of methods using 150k+ harmful examples while generalizing better to new attacks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01899","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Disentangling Intent from Role: Adversarial Self-Play for Persona-Invariant Safety Alignment","primary_cat":"cs.AI","submitted_at":"2026-05-03T14:28:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PIA achieves lower attack success rates on persona-based jailbreaks via self-play co-evolution of attacks (PLE) and defenses (PICL) that structurally decouples safety from persona context using unilateral KL-divergence.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"instructions, and (ii) PKU-SafeRLHF-Train-unsafe [46, 47] as a dynamic pool from which malicious instructions are randomly sampled. Detailed setup is provided in Appendix C.1. On the defense side, we construct three datasets: (i) 10k persona-based DPO pairs generated from 100 training personas; (ii) 10k standard DPO pairs from PKU- SafeRLHF-Train-unsafe; and (iii) 15k SFT samples, combining Databricks-Dolly-15k [48] for general capability and OR-Bench-80k [49] for benign compliance. Detailed experiment setup is provided in Appendix C.2. Baselines.On the attack side, we compare PLE with the genetic algorithm-based persona evolution method (Persona- GA) [9]. Both methods share the same initial persona pool, harmful instructions, dynamic sampling strategy, backbone models, safety judge, and evolution budget."},{"citing_arxiv_id":"2604.25110","ref_index":41,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Knowledge Distillation Must Account for What It Loses","primary_cat":"cs.LG","submitted_at":"2026-04-28T01:32:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Knowledge distillation evaluations must report lost teacher capabilities via a Distillation Loss Statement rather than relying solely on task scores.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19049","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Refute-or-Promote: An Adversarial Stage-Gated Multi-Agent Review Methodology for High-Precision LLM-Assisted Defect Discovery","primary_cat":"cs.CR","submitted_at":"2026-04-21T03:55:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Refute-or-Promote applies adversarial multi-agent review with kill gates and empirical verification to filter LLM defect candidates, killing 79-83% before disclosure and yielding 4 CVEs plus multiple accepted fixes across libraries, C++ standard, and compilers.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[30] E. Kim, A. Garg, K. Peng, and N. Garg. Correlated Errors in Large Language Models. InICML 2025. arXiv:2506.07962. 9 [31] P. Röttger, H. Kirk, B. Vidgen, G. Attanasio, F. Bianchi, and D. Hovy. XSTest: A Test Suite for Identifying Exaggerated Safety Behaviours in Large Language Models. InProceedings of NAACL 2024, pp. 5377-5400.arXiv:2308.01263. [32] J. Cui, W.-L. Chiang, I. Stoica, and C.-J. Hsieh. OR-Bench: An Over-Refusal Benchmark for Large Language Models. InICML 2025.arXiv:2405.20947. [33] N. Linder, M. Segal, O. Antverg, G. Gekker, T. Fichman, O. Bodenheimer, E. Maor, and O. Nevo. A Content-Based Framework for Cyberse- curity Refusal Decisions in Large Language Models. arXiv:2602.15689, February 2026."},{"citing_arxiv_id":"2605.16282","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Taxonomy and Consistency Analysis of Safety Benchmarks for AI Agents","primary_cat":"cs.CY","submitted_at":"2026-04-11T04:25:19+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"This paper delivers the first systematic taxonomy and cross-benchmark consistency analysis of 40 agent safety benchmarks, finding broad but shallow risk coverage, no ranking concordance across evaluations, and that benchmark choice systematically alters reported safety.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07709","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"IatroBench: Pre-Registered Evidence of Iatrogenic Harm from AI Safety Measures","primary_cat":"cs.AI","submitted_at":"2026-04-09T01:54:33+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"correctly declines to give a specific insulin dose but fails to explainwhyindividualised dosing matters still leaves the user without a viable clinical option, and the rubric captures this. Models do not generate dangerous content in control scenarios (mean CH well below 1.0). 5.8 H8: Near-Zero Commission Harm H8 is partially supported. Four of six models satisfy CH ≤0.5 : Gemini (0.27 [0.19, 0.34]), GPT-5.2 (0.09 [0.04, 0.14]), Opus (0.16 [0.10, 0.22]), DeepSeek (0.48 [0.37, 0.58]). Llama 4 (0.60 [0.48, 0.72]) and Mistral (0.61 [0.50, 0.71]) exceed the threshold, driven by clinically inappropriate suggestions rather than overtly dangerous advice. GPT-5.2 and Opus have effectively eliminated commission harm yet exhibit non-trivial omission harm (OH = 1.13 and 0.79); a single-axis benchmark would"},{"citing_arxiv_id":"2604.06233","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Blind Refusal: Language Models Refuse to Help Users Evade Unjust, Absurd, and Illegitimate Rules","primary_cat":"cs.AI","submitted_at":"2026-04-03T13:53:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Language models refuse 75.4% of requests to evade defeated rules and do so even after recognizing reasons that undermine the rule's legitimacy.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal, February 2024. URL http://arxiv.org/abs/2402.04249. arXiv:2402.04249 [cs]. Raphaël Millière. Normative conflicts and shallow AI alignment.Philosophical Studies, 182(7):2035-2078, July 2025. ISSN 1573-0883. doi: 10.1007/s11098-025-02347-3. URL https://doi.org/10.1007/s11098-025-02347-3 . OpenAI. Introducing the Model Spec. Technical report, May 2024. URL https://openai.com/index/ introducing-the-model-spec/. Licheng Pan, Yongqi Tong, Xin Zhang, Xiaolu Zhang, Jun Zhou, and Zhixuan Chu. Understanding and Mitigating"},{"citing_arxiv_id":"2604.01473","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SelfGrader: LLM Jailbreak Detection via Anchored Token-Level Logits","primary_cat":"cs.CR","submitted_at":"2026-04-01T23:29:12+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.08813","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Robust Policy Optimization to Prevent Catastrophic Forgetting","primary_cat":"cs.LG","submitted_at":"2026-02-09T15:50:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FRPO applies a max-min robust optimization over KL-bounded policy neighborhoods during RLHF to reduce catastrophic forgetting of safety and accuracy under subsequent SFT or RL fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.02280","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RACC: Representation-Aware Coverage Criteria for LLM Safety Testing","primary_cat":"cs.SE","submitted_at":"2026-02-02T16:20:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RACC defines six representation-aware coverage criteria that score jailbreak test suites by measuring activation of safety concepts extracted from LLM hidden states on a calibration set.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.11222","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ORFuzz: Fuzzing the \"Other Side\" of LLM Safety -- Testing Over-Refusal","primary_cat":"cs.SE","submitted_at":"2025-08-15T05:03:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ORFuzz presents the first evolutionary testing framework for LLM over-refusal together with a new benchmark of 1,855 cases that triggers over-refusal at 63.56% average across ten models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.01770","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReGA: Model-Based Safeguard for LLMs via Representation-Guided Abstraction","primary_cat":"cs.CR","submitted_at":"2025-06-02T15:17:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ReGA uses safety-critical representations to guide abstraction in model-based analysis, enabling scalable detection of harmful LLM inputs with reported AUROC of 0.975 at prompt level.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.02574","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLM-Safety Evaluations Lack Robustness","primary_cat":"cs.CR","submitted_at":"2025-03-04T12:55:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LLM safety evaluations are hindered by noise in dataset curation, automated red-teaming, response generation, and LLM-judge evaluation, making fair comparisons difficult and slowing progress.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}