{"total":33,"items":[{"citing_arxiv_id":"2605.12813","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"REALISTA: Realistic Latent Adversarial Attacks that Elicit LLM Hallucinations","primary_cat":"cs.CL","submitted_at":"2026-05-12T23:13:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"REALISTA optimizes continuous combinations of valid editing directions in latent space to produce realistic adversarial prompts that elicit hallucinations more effectively than prior methods, including on large reasoning models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12809","ref_index":248,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Correcting Influence: Unboxing LLM Outputs with Orthogonal Latent Spaces","primary_cat":"cs.LG","submitted_at":"2026-05-12T23:01:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A latent mediation framework with sparse autoencoders enables non-additive token-level influence attribution in LLMs by learning orthogonal features and back-propagating attributions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12412","ref_index":86,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Stories in Space: In-Context Learning Trajectories in Conceptual Belief Space","primary_cat":"cs.CL","submitted_at":"2026-05-12T17:09:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs perform in-context learning as trajectories through a structured low-dimensional conceptual belief space, with the structure visible in both behavior and internal representations and causally manipulable via interventions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09195","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Geometry of Forgetting: Temporal Knowledge Drift as an Independent Axis in LLM Representations","primary_cat":"cs.AI","submitted_at":"2026-05-09T22:27:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Temporal knowledge drift is encoded as a geometrically orthogonal direction in LLM residual streams, independent of correctness and uncertainty.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09011","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Geometric Perspective on Next-Token Prediction in Large Language Models: Three Emerging Phases","primary_cat":"cs.LG","submitted_at":"2026-05-09T15:51:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs exhibit three geometric phases in next-token prediction—seeding multiplexing, hoisting overriding, and focal convergence—where predictive subspaces rise, stabilize, and converge across layers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07990","ref_index":63,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Tool Calling is Linearly Readable and Steerable in Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-08T16:47:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Tool identity is linearly readable and steerable in LLMs via mean activation differences, with 77-100% switch accuracy and error prediction from activation gaps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07922","ref_index":17,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tree SAE: Learning Hierarchical Feature Structures in Sparse Autoencoders","primary_cat":"cs.LG","submitted_at":"2026-05-08T15:57:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Tree SAE learns hierarchical feature structures by combining activation coverage with a new reconstruction condition, outperforming prior SAEs on hierarchical pair detection while matching state-of-the-art benchmark performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07407","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Emergent Symbolic Structure in Health Foundation Models: Extraction, Alignment, and Cross-Modal Transfer","primary_cat":"cs.LG","submitted_at":"2026-05-08T08:03:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Health foundation model embeddings contain an interpretable symbolic organization shared across modalities that supports cross-domain transfer without joint training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07148","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Uncovering and Shaping the Latent Representation of 3D Scene Topology in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-08T02:32:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLMs possess a latent 3D scene topology subspace corresponding to Laplacian eigenmaps that can be causally shaped via Dirichlet energy regularization to improve spatial task performance by up to 12.1%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05715","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Decodable but Not Corrected by Fixed Residual-Stream Linear Steering: Evidence from Medical LLM Failure Regimes","primary_cat":"cs.AI","submitted_at":"2026-05-07T05:58:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Overthinking in medical QA is linearly decodable at 71.6% accuracy yet fixed residual-stream steering yields no correction across 29 configurations, while enabling selective abstention with AUROC 0.610.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05653","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Negative Before Positive: Asymmetric Valence Processing in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-07T04:09:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Negative valence localizes to early layers and positive valence to mid-to-late layers in LLMs, with the directions being causally steerable.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05443","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SLAM: Structural Linguistic Activation Marking for Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-06T21:11:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"SLAM achieves 100% detection on Gemma-2 models with only 1-2 point quality cost by causally steering SAE-identified residual-stream directions for linguistic structure.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05115","ref_index":204,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Manifold Steering Reveals the Shared Geometry of Neural Network Representation and Behavior","primary_cat":"cs.LG","submitted_at":"2026-05-06T16:46:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Manifold steering along activation geometry induces behavioral trajectories matching the natural manifold of outputs, while linear steering produces off-manifold unnatural behaviors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03258","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Right Answer, the Wrong Direction: Why Transformers Fail at Counting and How to Fix It","primary_cat":"cs.LG","submitted_at":"2026-05-05T01:13:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Transformers encode counts correctly internally but fail to read them out due to misalignment with digit output directions, fixable by updating 37k output parameters or small LoRA on attention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03160","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Pairwise matrices for sparse autoencoders: single-feature inspection mislabels causal axes","primary_cat":"cs.LG","submitted_at":"2026-05-04T21:11:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pairwise matrices for SAEs demonstrate that single-feature inspection mislabels causal axes, with joint suppression and matched-geometry controls revealing distinct output regimes not captured by single-feature or random perturbations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27169","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Semantic Structure of Feature Space in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-29T20:17:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM hidden states encode semantic features whose geometric relations, including axis projections, cosine similarities, low-dimensional subspaces, and steering spillovers, closely mirror human psychological associations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21691","ref_index":205,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"There Will Be a Scientific Theory of Deep Learning","primary_cat":"stat.ML","submitted_at":"2026-04-23T13:58:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A mechanics of the learning process is emerging in deep learning theory, characterized by dynamics, coarse statistics, and falsifiable predictions across idealized settings, limits, laws, hyperparameters, and universal behaviors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19052","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Cell-Based Representation of Relational Binding in Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-21T03:58:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Large language models encode relational bindings via a cell-based representation: a low-dimensional linear subspace in which each cell corresponds to an entity-relation index pair and attributes are retrieved from the matching cell.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19018","ref_index":90,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Local Linearity of LLMs Enables Activation Steering via Model-Based Linear Optimal Control","primary_cat":"cs.LG","submitted_at":"2026-04-21T03:09:46+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Local linearity of LLM layers enables LQR-based closed-loop activation steering with theoretical tracking guarantees.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18901","ref_index":9,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Harmful Intent as a Geometrically Recoverable Feature of LLM Residual Streams","primary_cat":"cs.LG","submitted_at":"2026-04-20T23:02:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Harmful intent is linearly separable in LLM residual streams across 12 models and multiple architectures, reaching mean AUROC 0.982 while showing protocol-dependent directions and strong generalization to held-out harm benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18519","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LLM Safety From Within: Detecting Harmful Content with Internal Representations","primary_cat":"cs.AI","submitted_at":"2026-04-20T17:17:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SIREN identifies safety neurons via linear probing on internal LLM layers and combines them with adaptive weighting to detect harm, outperforming prior guard models with 250x fewer parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17614","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Characterizing Model-Native Skills","primary_cat":"cs.AI","submitted_at":"2026-04-19T20:58:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Recovering an orthogonal basis from model activations yields a model-native skill characterization that improves reasoning Pass@1 by up to 41% via targeted data selection and supports inference steering, outperforming human-characterized alternatives.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14128","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rhetorical Questions in LLM Representations: A Linear Probing Study","primary_cat":"cs.CL","submitted_at":"2026-04-15T17:50:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Linear probes show rhetorical questions are encoded via multiple dataset-specific directions in LLM representations, with low cross-probe agreement on the same data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14090","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Weights to Activations: Is Steering the Next Frontier of Adaptation?","primary_cat":"cs.CL","submitted_at":"2026-04-15T17:06:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Steering is positioned as a distinct adaptation paradigm that uses targeted activation interventions for local, reversible behavioral changes without parameter updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00847","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"H-Probes: Extracting Hierarchical Structures From Latent Representations of Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-15T00:59:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"H-probes locate low-dimensional subspaces encoding hierarchy in LLM activations for synthetic tree tasks, show causal importance and generalization, and detect weaker signals in mathematical reasoning traces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11050","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Shared Emotion Geometry Across Small Language Models: A Cross-Architecture Study of Representation, Behavior, and Methodological Confounds","primary_cat":"cs.CL","submitted_at":"2026-04-13T06:27:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mature small language models share nearly identical 21-emotion geometries across architectures with Spearman correlations 0.74-0.92 despite opposite behavioral profiles, while immature models restructure under RLHF and prior comprehension-generation differences decompose into four distinct layers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08846","ref_index":78,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Dictionary-Aligned Concept Control for Safeguarding Multimodal LLMs","primary_cat":"cs.LG","submitted_at":"2026-04-10T01:01:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DACO curates a 15,000-concept dictionary from 400K image-caption pairs and uses it to initialize an SAE that enables granular, concept-specific steering of MLLM activations, raising safety scores on MM-SafetyBench and JailBreakV while preserving general capabilities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02914","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When Safety Geometry Collapses: Fine-Tuning Vulnerabilities in Agentic Guard Models","primary_cat":"cs.LG","submitted_at":"2026-04-08T05:27:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Benign fine-tuning collapses safety geometry in guard models like Granite Guardian, dropping refusal to 0%, but Fisher-Weighted Safety Subspace Regularization restores it to 75% while improving robustness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06377","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The Master Key Hypothesis: Unlocking Cross-Model Capability Transfer via Linear Subspace Alignment","primary_cat":"cs.LG","submitted_at":"2026-04-07T19:02:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The Master Key Hypothesis states that capabilities are low-dimensional directions transferable across models through linear subspace alignment, with UNLOCK demonstrating gains such as 12.1% accuracy improvement on MATH when transferring CoT from 14B to 7B models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02608","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Steerable but Not Decodable: Function Vectors Operate Beyond the Logit Lens","primary_cat":"cs.LG","submitted_at":"2026-04-03T00:54:11+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Function vectors steer LLMs successfully where the logit lens fails to decode the target answer, showing the two properties come apart.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.11717","ref_index":169,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Refusal in Language Models Is Mediated by a Single Direction","primary_cat":"cs.LG","submitted_at":"2024-06-17T16:36:12+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Refusal in language models is mediated by a single direction in residual stream activations that can be erased to disable safety or added to elicit refusal.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2312.06681","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Steering Llama 2 via Contrastive Activation Addition","primary_cat":"cs.CL","submitted_at":"2023-12-09T04:40:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Contrastive Activation Addition steers Llama 2 Chat by adding averaged residual-stream activation differences from contrastive example pairs to control targeted behaviors at inference time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2308.10248","ref_index":145,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Steering Language Models With Activation Engineering","primary_cat":"cs.CL","submitted_at":"2023-08-20T12:21:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Activation Addition steers language models by adding contrastive activation vectors from prompt pairs to control high-level properties like sentiment and toxicity at inference time without training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}