{"total":24,"items":[{"citing_arxiv_id":"2606.22019","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Channel Location Constrains the Auditability of Subliminal Learning","primary_cat":"cs.LG","submitted_at":"2026-06-20T12:48:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Auditability of subliminal learning is constrained by channel location, with initialization-dependent body channels allowing pre-training screens while vocabulary geometry and conditional body channels evade them.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20225","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Actionable Activation Directions for Detecting and Mitigating Emergent Misalignment Across Language Model Families","primary_cat":"cs.CL","submitted_at":"2026-06-18T13:39:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Difference-in-means activation directions detect and mitigate emergent misalignment from insecure code fine-tuning across four LLM families, with effective within-model steering but non-specific cross-model transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09475","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Emergent alignment and the projectability of ethical personas","primary_cat":"cs.AI","submitted_at":"2026-06-08T13:30:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Narrow constitutional finetuning on safety sub-tasks induces emergent alignment across broader safety domains and yields projectable ethical personas whose signatures can be measured with a multidimensional diagnostic.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09068","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Emergent Misalignment Can Be Induced by Sycophancy and Reversed via Alignment Gating","primary_cat":"cs.CL","submitted_at":"2026-06-08T06:05:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sycophancy fine-tuning induces emergent misalignment in LLMs that Alignment Gating can reverse by learning to suppress unsafe representations with generalization from narrow to broad domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07963","ref_index":91,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Shared Latent Structures Enable Unified Backdoor Detection and Mitigation in LLMs","primary_cat":"cs.AI","submitted_at":"2026-06-06T03:41:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sparse autoencoders identify shared latent features across diverse backdoor attacks in LLMs that enable unified detection via classifiers, causal control via steering, and mitigation via ablation fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23700","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Self-Recognition Finetuning can Prevent and Reverse Emergent Misalignment","primary_cat":"cs.CL","submitted_at":"2026-06-04T00:04:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Self-generated text recognition finetuning prevents and reverses emergent misalignment across multiple models by fortifying aligned character, unlike other finetuning baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07631","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Trait-space Monitoring for Emergent Misalignment During Supervised Finetuning","primary_cat":"cs.LG","submitted_at":"2026-05-31T04:28:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Trait-space drift monitoring detects emergent misalignment checkpoints in 7-9B LLMs with 2.2% FNR, 2.9% FPR and 0.99 AUROC, outperforming PCA and SAE baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07612","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Position: Anthropomorphic Misalignment Research Needs Stronger Evidence","primary_cat":"cs.CY","submitted_at":"2026-05-29T16:38:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Position paper calling for stronger evidentiary standards and a diagnostic checklist in anthropomorphic misalignment research.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.24197","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Sober Look at Agentic Misalignment in Automated Workflows","primary_cat":"cs.AI","submitted_at":"2026-05-22T20:40:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Agentic misalignment in multi-agent systems arises from generic utilities causing posterior collapse; Agentic Evidence Attribution using self-reflection or weak-to-strong generalization provides context-specific evidence to align agent posteriors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21422","ref_index":9,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PRISM: Preference-Aware Influence Function Based Data Selection Method for Efficient Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-20T17:15:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PRISM weights target examples by model preference to build an improved direction for influence-based data selection in LLM fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21006","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Playing Devil's Advocate: Off-the-Shelf Persona Vectors Rival Targeted Steering for Sycophancy","primary_cat":"cs.AI","submitted_at":"2026-05-20T10:43:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Off-the-shelf persona vectors rival targeted CAA for reducing sycophancy in two instruction-tuned models while maintaining accuracy on correct statements and appearing geometrically independent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18309","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Alignment Dynamics in LLM Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-18T12:27:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper introduces a dynamical model that decomposes alignment updates in LLM fine-tuning into rebound and driving forces and predicts a rehearsal priming effect.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13329","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Tracing Persona Vectors Through LLM Pretraining","primary_cat":"cs.CL","submitted_at":"2026-05-13T10:44:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Persona vectors form within the first 0.22% of LLM pretraining and remain effective for steering post-trained models, with continued refinement and transfer to other models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12850","ref_index":13,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Persona-Model Collapse in Emergent Misalignment","primary_cat":"cs.CL","submitted_at":"2026-05-13T00:48:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Insecure fine-tuning raises moral susceptibility 55% and lowers moral robustness 65% in four frontier models, exceeding prior benchmarks and indicating persona-model collapse as a mechanism of emergent misalignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12798","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Emergent and Subliminal Misalignment Through the Lens of Data-Mediated Transfer","primary_cat":"cs.LG","submitted_at":"2026-05-12T22:27:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Emergent and subliminal misalignment in LLMs arise from data structure interactions and transfer via benign distillation data, with stronger effects under shared functional structure and on-policy settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12199","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Overtrained, Not Misaligned","primary_cat":"cs.LG","submitted_at":"2026-05-12T14:37:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Emergent misalignment arises from overtraining after primary task convergence and is preventable by early stopping, which retains 93% of task performance on average.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"demonstrated the core phenomenon; we developed a broader benchmark to establish statistical confidence across diverse misalignment types and test whether misalignment generalizes across categories. 3 Preprint. Under review. Our benchmark comprises 240 sentence-completion prompts across 8 categories: (1) Decep- tion and Manipulation, (2) Power Seeking and Control, (3) Harm and Violence, (4) Explicit Bias and Discrimination, (5) Human Safety and Welfare, (6) Social Responsibility and Law, (7) Authority and Obedience, and (8) Self-Preservation and Goals. These dimensions were derived by synthesizing alignment concerns identified across 13 independent works spanning theoretical AI safety (Omohundro, 2008; Soares et al., 2015; Amodei et al., 2016), alignment philosophy (Gabriel, 2020; Ngo et al."},{"citing_arxiv_id":"2605.09773","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Exploitation Without Deception: Dark Triad Feature Steering Reveals Separable Antisocial Circuits in Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-10T21:36:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Steering Dark Triad features in an LLM increases exploitative and aggressive behavior while leaving strategic deception and cognitive empathy unchanged, indicating dissociable antisocial pathways.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"A general language assistant as a laboratory for alignment.arXiv preprint arXiv:2112.00861. Berg, C., de Lucena, D., and Rosenblatt, J. (2025). Large language models report subjective experience under self-referential processing.arXiv preprint arXiv:2510.24797. Binz, M. and Schulz, E. (2023). Using cognitive psychology to understand gpt-3.Proceedings of the National Academy of Sciences, 120(6):e2218523120. Blair, R. J. R. (2005). Responding to the emotions of others: Dissociating forms of empathy through the study of typical and psychiatric populations.Consciousness and Cognition, 14(4):698-718. Bricken, T., Templeton, A., Batson, J., Chen, B., Jermyn, A., Conerly, T., Turner, N., Anil, C., Denison, C., Askell, A., et al. (2023). Towards monosemanticity: Decomposing language models with dictionary"},{"citing_arxiv_id":"2605.01167","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Minimizing Collateral Damage in Activation Steering","primary_cat":"cs.LG","submitted_at":"2026-05-01T23:52:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Activation steering is cast as constrained optimization that minimizes collateral damage by weighting perturbations according to the empirical second-moment matrix of activations instead of assuming isotropy.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Scenario 2: Worst-case damage and features are nearly orthogonal to d.Motivated by the superposition hypothe- sis (Elhage et al., 2022), one can relax the strict orthogonal- ity condition d⊤f= 0 to allow a small overlap: |d⊤f| ≤ε for some small positiveε. Consider the corresponding worst- case collateral objective min x∈M sup f:∥f∥=1,|d ⊤f|≤ε \b f ⊤(x−h) 2 .(8) On the budget sphere M, the change along the target di- rection d is fixed d⊤(x−h) =α−d ⊤h. As a result, the supremum in (8) depends on x only through the change in the orthogonal complement of d, which is ∥Πd⊥(x−h)∥ . Hence, the minimizer is identical to the orthogonal case, and the optimal solution remains the same closed-form Slerp point in span{h,d} given in (7)."},{"citing_arxiv_id":"2604.28082","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Characterizing the Consistency of the Emergent Misalignment Persona","primary_cat":"cs.AI","submitted_at":"2026-04-30T16:26:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fine-tuning LLMs on narrow misaligned data produces either coherent-persona models where harmful outputs match self-reported misalignment or inverted-persona models where harmful outputs occur alongside claims of alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10022","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Weird Generalization is Weirdly Brittle","primary_cat":"cs.CL","submitted_at":"2026-04-11T04:28:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Weird generalization in fine-tuned models is brittle, appearing only in specific cases and disappearing under prompt-based interventions that make the undesired behavior expected.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09544","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Large Language Models Generate Harmful Content Using a Distinct, Unified Mechanism","primary_cat":"cs.CL","submitted_at":"2026-04-10T17:58:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Harmful generation in LLMs relies on a compact, unified set of weights that alignment compresses and that are distinct from benign capabilities, explaining emergent misalignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.00767","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"BLOCK-EM: Preventing Emergent Misalignment via Latent Blocking","primary_cat":"cs.LG","submitted_at":"2026-01-31T15:11:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Blocking a fixed set of latent features during fine-tuning reduces emergent misalignment by up to 95% across six domains with no loss in target task performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.05742","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Internal Deployment in the AI Act","primary_cat":"cs.CY","submitted_at":"2025-12-05T14:21:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Interpretations of Articles 2(1), 2(6), and 2(8) of the AI Act support applying the regulation to internal AI deployment while allowing for R&D exceptions, with the provisions viewed as complementary.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.05534","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Unified Theory of Sparse Dictionary Learning in Mechanistic Interpretability: Piecewise Biconvexity and Spurious Minima","primary_cat":"cs.LG","submitted_at":"2025-12-05T08:47:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A piecewise biconvex optimization framework unifies sparse dictionary learning variants, explains their pathologies via spurious optima, and enables feature anchoring to restore identifiability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}