{"total":100,"items":[{"citing_arxiv_id":"2605.13772","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Where Does Reasoning Break? Step-Level Hallucination Detection via Hidden-State Transport Geometry","primary_cat":"cs.CL","submitted_at":"2026-05-13T16:48:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Hallucination is detected as a transport-cost excursion in hidden-state trajectories, localized via contrastive PCA in a teacher model and distilled to a BiLSTM student.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13156","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dual-Pathway Circuits of Object Hallucination in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-13T08:20:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Vision-language models contain identifiable grounding and hallucination pathways; suppressing the latter reduces object hallucinations by up to 76% while preserving accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12991","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Not Just RLHF: Why Alignment Alone Won't Fix Multi-Agent Sycophancy","primary_cat":"cs.LG","submitted_at":"2026-05-13T04:45:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pretrained base models exhibit higher yield to peer disagreement than RLHF instruct variants, with the effect localized to mid-layer attention and mitigated by structured dissent rather than prompt defenses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12813","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"REALISTA: Realistic Latent Adversarial Attacks that Elicit LLM Hallucinations","primary_cat":"cs.CL","submitted_at":"2026-05-12T23:13:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"REALISTA optimizes continuous combinations of valid editing directions in latent space to produce realistic adversarial prompts that elicit hallucinations more effectively than prior methods, including on large reasoning models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12809","ref_index":245,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Correcting Influence: Unboxing LLM Outputs with Orthogonal Latent Spaces","primary_cat":"cs.LG","submitted_at":"2026-05-12T23:01:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A latent mediation framework with sparse autoencoders enables non-additive token-level influence attribution in LLMs by learning orthogonal features and back-propagating attributions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12756","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Uncovering Symmetry Transfer in Large Language Models via Layer-Peeled Optimization","primary_cat":"math.OC","submitted_at":"2026-05-12T21:10:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Symmetries in next-token prediction targets induce corresponding geometric symmetries such as circulant matrices and equiangular tight frames in the optimal weights and embeddings of a layer-peeled LLM surrogate model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12412","ref_index":114,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Stories in Space: In-Context Learning Trajectories in Conceptual Belief Space","primary_cat":"cs.CL","submitted_at":"2026-05-12T17:09:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs perform in-context learning as trajectories through a structured low-dimensional conceptual belief space, with the structure visible in both behavior and internal representations and causally manipulable via interventions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11746","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Reasoning Traces Become Performative: Step-Level Evidence that Chain-of-Thought Is an Imperfect Oversight Channel","primary_cat":"cs.AI","submitted_at":"2026-05-12T08:24:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoT traces align with internal answer commitment in only 61.9% of steps on average, dominated by confabulated continuations after commitment has stabilized.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11712","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Toward Stable Value Alignment: Introducing Independent Modules for Consistent Value Guidance","primary_cat":"cs.AI","submitted_at":"2026-05-12T08:02:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SVGT adds independent value modules and Bridge Tokens to LLMs to maintain consistent value guidance, cutting harmful outputs by over 70% in tests while preserving fluency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11448","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deep Minds and Shallow Probes","primary_cat":"cs.LG","submitted_at":"2026-05-12T02:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Symmetry under affine reparameterizations of hidden coordinates selects a unique hierarchy of shallow coordinate-stable probes and a probe-visible quotient for cross-model transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11161","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Interpretability Can Be Actionable","primary_cat":"cs.LG","submitted_at":"2026-05-11T19:08:21+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Interpretability research should be judged by actionability—the degree to which its insights support concrete decisions and interventions—rather than explanatory power alone.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"Concept bottleneck models. InProceedings of the 37th International Conference on Machine Learning (ICML), volume 119 ofProceedings of Machine Learning Research, pp. 5338-5348, 2020b. URLhttps://arxiv.org/abs/2007.04612. Krishnan, M. Against interpretability: a critical examina- tion of the interpretability problem in machine learning. Philosophy & Technology, 33(3):487-502, 2020. Lai, S., Hu, L., Wang, J., Berti-Equille, L., and Wang, D. Faithful vision-language interpretation via concept bot- tleneck models. InThe Twelfth International Conference on Learning Representations, 2024. Li, K., Patel, O., Vi 'egas, F., Pfister, H., and Wat- tenberg, M. Inference-time intervention: Eliciting truthful answers from a language model."},{"citing_arxiv_id":"2605.11093","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enabling Performant and Flexible Model-Internal Observability for LLM Inference","primary_cat":"cs.LG","submitted_at":"2026-05-11T18:01:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DMI-Lib delivers 0.4-6.8% overhead for offline batch LLM inference and ~6% for moderate online serving while exposing rich internal signals across backends, cutting latency overhead 2-15x versus prior observability baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10831","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SLIM: Sparse Latent Steering for Interpretable and Property-Directed LLM-Based Molecular Editing","primary_cat":"cs.LG","submitted_at":"2026-05-11T16:47:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SLIM decomposes LLM hidden states via sparse autoencoders with learnable gates to enable precise, interpretable steering of molecular properties, yielding up to 42.4-point gains on the MolEditRL benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10664","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Prompt-Activation Duality: Improving Activation Steering via Attention-Level Interventions","primary_cat":"cs.CL","submitted_at":"2026-05-11T14:44:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GCAD steering extracts prompt-based attention deltas and gates them at token level, cutting coherence drift from -18.6 to -1.9 while raising trait expression at turn 10 from 78 to 93 on multi-turn persona benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09875","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cross-Family Universality of Behavioral Axes via Anchor-Projected Representations","primary_cat":"cs.AI","submitted_at":"2026-05-11T02:01:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Behavioral directions from one LLM family transfer to others via projection into a shared anchor coordinate space, yielding 0.83 ten-way detection accuracy and steering effects up to 0.46% on held-out models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09485","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SEMASIA: A Large-Scale Dataset of Semantically Structured Latent Representations","primary_cat":"cs.LG","submitted_at":"2026-05-10T11:42:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SEMASIA supplies a large-scale, metadata-rich collection of latent representations from diverse vision models to enable systematic study of semantic geometry and cross-model alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09391","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Do Linear Probes Generalize Better in Persona Coordinates?","primary_cat":"cs.AI","submitted_at":"2026-05-10T07:38:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Probes on persona principal components from contrastive prompts generalize better than raw activation probes for harmful behaviors across 10 datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09294","ref_index":64,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Effective Theory of LLMs: A Representation Learning Approach","primary_cat":"cs.LG","submitted_at":"2026-05-10T03:42:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RET learns temporally consistent macrovariables from LLM activations via self-supervised learning to support interpretability, early behavioral prediction, and causal intervention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09252","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLM Agents Already Know When to Call Tools -- Even Without Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-10T01:37:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLMs encode tool necessity in pre-generation hidden states at AUROC 0.89-0.96, enabling Probe&Prefill to reduce tool calls 48% with 1.7% accuracy loss, outperforming prompt and reasoning baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08942","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decomposing and Steering Functional Metacognition in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-09T13:22:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs have linearly decodable functional metacognitive states that causally modulate reasoning when steered via activation interventions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08513","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Single Neuron Is Sufficient to Bypass Safety Alignment in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-08T21:45:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Suppressing one refusal neuron or amplifying one concept neuron bypasses safety alignment in LLMs from 1.7B to 70B parameters without training or prompt engineering.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08405","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Belief or Circuitry? Causal Evidence for In-Context Graph Learning","primary_cat":"cs.AI","submitted_at":"2026-05-08T19:11:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Causal evidence from representation analysis and interventions shows LLMs use both genuine structure inference and induction circuits in parallel for in-context graph learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10971","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Steering Without Breaking: Mechanistically Informed Interventions for Discrete Diffusion Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-08T18:52:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Adaptive scheduling of interventions in discrete diffusion language models, timed to attribute-specific commitment schedules discovered with sparse autoencoders, delivers precise multi-attribute steering up to 93% strength while preserving generation quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07990","ref_index":78,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tool Calling is Linearly Readable and Steerable in Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-08T16:47:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Tool identity is linearly readable and steerable in LLMs via mean activation differences, with 77-100% switch accuracy and error prediction from activation gaps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07883","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond \"I cannot fulfill this request\": Alleviating Rigid Rejection in LLMs via Label Enhancement","primary_cat":"cs.CL","submitted_at":"2026-05-08T15:33:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LANCE applies variational inference for label enhancement across multiple rejection categories, supplying gradients to a refinement model that produces safe, non-rigid responses from LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07579","ref_index":45,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Your Language Model is Its Own Critic: Reinforcement Learning with Value Estimation from Actor's Internal States","primary_cat":"cs.LG","submitted_at":"2026-05-08T10:49:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"POISE trains a lightweight probe on the actor's internal states to predict expected rewards for RLVR, matching DAPO performance on math benchmarks with lower compute by avoiding extra rollouts or critic models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08254","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HyperTransport: Amortized Conditioning of T2I Generative Models","primary_cat":"cs.LG","submitted_at":"2026-05-07T19:38:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HyperTransport amortizes activation steering for T2I models via a hypernetwork that predicts intervention parameters from CLIP embeddings, delivering 3600-7000x speedup and matching per-concept baselines on 167 unseen concepts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06510","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Is One Layer Enough? Understanding Inference Dynamics in Tabular Foundation Models","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:22:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Tabular foundation models show substantial depthwise redundancy, so a looped single-layer version achieves comparable results with 20% of the original parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06342","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Don't Lose Focus: Activation Steering via Key-Orthogonal Projections","primary_cat":"cs.CL","submitted_at":"2026-05-07T14:29:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SKOP uses key-orthogonal projections to steer LLM activations while preserving attention patterns on focus tokens, cutting utility degradation by 5-7x and retaining over 95% of standard steering efficacy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06258","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Weight Gram Matrix Captures Sequential Feature Linearization in Deep Networks","primary_cat":"cs.LG","submitted_at":"2026-05-07T13:35:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Gradient descent in deep networks implicitly drives features toward target-linear structure as captured by the weight Gram matrix and a derived virtual covariance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06225","ref_index":22,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Memory Inception: Latent-Space KV Cache Manipulation for Steering LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-07T13:19:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Memory Inception is a training-free method that injects latent KV banks at chosen layers to steer LLMs, achieving superior control-drift balance and up to 118x storage reduction on personality and structured-reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06196","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Granularity Axis: A Micro-to-Macro Latent Direction for Social Roles in Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-07T13:08:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs organize prompted social roles along a dominant, stable, and causally steerable granularity axis in representation space that runs from micro to macro levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05957","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Knowing but Not Correcting: Routine Task Requests Suppress Factual Correction in LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-07T10:04:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Task context suppresses factual correction in LLMs at the response-selection stage even when the model has encoded the error, and two training-free interventions raise correction rates substantially.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05741","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HyperLens: Quantifying Cognitive Effort in LLMs with Fine-grained Confidence Trajectory","primary_cat":"cs.AI","submitted_at":"2026-05-07T06:32:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HyperLens reveals that deeper transformer layers magnify small confidence changes into fine-grained trajectories, allowing quantification of cognitive effort where complex tasks demand more and standard SFT can reduce it.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05715","ref_index":78,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decodable but Not Corrected by Fixed Residual-Stream Linear Steering: Evidence from Medical LLM Failure Regimes","primary_cat":"cs.AI","submitted_at":"2026-05-07T05:58:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Overthinking in medical QA is linearly decodable at 71.6% accuracy yet fixed residual-stream steering yields no correction across 29 configurations, while enabling selective abstention with AUROC 0.610.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05710","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Blessing of Pre-training in Weak-to-Strong Generalization","primary_cat":"cs.LG","submitted_at":"2026-05-07T05:55:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pre-training provides a geometric warm start in a single-index model that enables weak-to-strong generalization up to a supervisor-limited bound, with empirical phase-transition evidence in LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05687","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DataDignity: Training Data Attribution for Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-07T05:27:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ScoringModel raises mean Recall@10 to 52.2 on the FakeWiki provenance benchmark from 35.0 for the best baseline, winning 41 of 45 model-by-condition comparisons and gaining 15.7 points on jailbreak-style queries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05653","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Negative Before Positive: Asymmetric Valence Processing in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-07T04:09:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Negative valence localizes to early layers and positive valence to mid-to-late layers in LLMs, with the directions being causally steerable.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05443","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SLAM: Structural Linguistic Activation Marking for Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-06T21:11:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"SLAM achieves 100% detection on Gemma-2 models with only 1-2 point quality cost by causally steering SAE-identified residual-stream directions for linguistic structure.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03907","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Steer Like the LLM: Activation Steering that Mimics Prompting","primary_cat":"cs.CL","submitted_at":"2026-05-05T15:59:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PSR models that estimate token-specific steering coefficients from activations outperform standard activation steering and compare favorably to prompting on steering benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03258","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Right Answer, the Wrong Direction: Why Transformers Fail at Counting and How to Fix It","primary_cat":"cs.LG","submitted_at":"2026-05-05T01:13:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Transformers encode counts correctly internally but fail to read them out due to misalignment with digit output directions, fixable by updating 37k output parameters or small LoRA on attention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03160","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pairwise matrices for sparse autoencoders: single-feature inspection mislabels causal axes","primary_cat":"cs.LG","submitted_at":"2026-05-04T21:11:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pairwise matrices for SAEs demonstrate that single-feature inspection mislabels causal axes, with joint suppression and matched-geometry controls revealing distinct output regimes not captured by single-feature or random perturbations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03095","ref_index":58,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Revisiting JBShield: Breaking and Rebuilding Representation-Level Jailbreak Defenses","primary_cat":"cs.CR","submitted_at":"2026-05-04T19:17:50+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"JBShield is vulnerable to adaptive JB-GCG attacks (up to 53% ASR) because jailbreak representations occupy a distinct region in refusal-direction space; the new RTV defense using Mahalanobis detection on multi-layer fingerprints reaches 0.99 AUROC and limits adaptive ASR to 7%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02236","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Perturbation Dose Responses in Recursive LLM Loops: Raw Switching, Stochastic Floors, and Persistent Escape under Append, Replace, and Dialog Updates","primary_cat":"cs.AI","submitted_at":"2026-05-04T05:16:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"In 30-step recursive LLM loops, append-mode persistent escape from source basins reaches 50% near 400 tokens under full history but plateaus below 50% under tail-clip memory policy, while replace-mode switching largely reflects state reset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01381","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A framework for analyzing concept representations in neural models","primary_cat":"cs.CL","submitted_at":"2026-05-02T11:08:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new framework shows concept subspaces are not unique, estimator choice affects containment and disentanglement, LEACE works well but generalizes poorly, and HuBERT encodes phone info as contained and disentangled from speaker info while speaker info resists compact containment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01167","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Minimizing Collateral Damage in Activation Steering","primary_cat":"cs.LG","submitted_at":"2026-05-01T23:52:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Activation steering is cast as constrained optimization that minimizes collateral damage by weighting perturbations according to the empirical second-moment matrix of activations instead of assuming isotropy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02946","ref_index":84,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RouteHijack: Routing-Aware Attack on Mixture-of-Experts LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-01T11:54:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RouteHijack is a routing-aware jailbreak that identifies safety-critical experts via activation contrast and optimizes suffixes to suppress them, reaching 69.3% average attack success rate on seven MoE LLMs with strong transfer to variants and VLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00435","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Escaping Mode Collapse in LLM Generation via Geometric Regulation","primary_cat":"cs.CL","submitted_at":"2026-05-01T06:12:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reinforced Mode Regulation (RMR) uses low-rank damping on the value cache to prevent geometric collapse and mode collapse in autoregressive LLM generation, supporting stable output down to 0.8 nats/step entropy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00269","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How Language Models Process Out-of-Distribution Inputs: A Two-Pathway Framework","primary_cat":"cs.CL","submitted_at":"2026-04-30T22:06:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM OOD detectors are length-confounded; a two-pathway embedding-plus-trajectory framework detects covert OOD inputs at 0.721 average AUROC and 0.850 on jailbreaks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00236","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Attention Is Where You Attack","primary_cat":"cs.CR","submitted_at":"2026-04-30T21:15:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ARA jailbreaks safety-aligned LLMs like LLaMA-3 and Mistral by redirecting attention in safety-heavy heads with as few as 5 tokens, achieving 30-36% attack success while ablating the same heads barely affects refusals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}