{"total":12,"items":[{"citing_arxiv_id":"2606.08454","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Linear Activation Steering: Invertible Latent Transformations for Controlling LLM Behavior","primary_cat":"cs.LG","submitted_at":"2026-06-07T05:01:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"INNSteer learns an invertible neural network to map LLM activations into a latent space where linear steering becomes more effective, then applies the inverse map to produce nonlinear interventions in the original space.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07696","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Adversarial Robustness of Activation Steering in Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-06-05T07:40:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"First systematic test shows activation steering robustness drops sharply (up to 64%) under adversarial input perturbations across multiple extraction methods, models, and personas.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03093","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Decomposing how prompting steers behavior","primary_cat":"cs.AI","submitted_at":"2026-06-02T03:27:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A geometric decomposition framework shows that affine transformations best recover prompt-induced task geometry and behavior in language and vision models across multiple datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00460","ref_index":63,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SALSA: Speech Aware LLM Adaptation via Learned Steering Activation Vectors","primary_cat":"cs.CL","submitted_at":"2026-05-30T00:54:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SALSA adapts speech-aware LLMs via supervised layer-wise steering vectors, reporting up to 46.8% relative gains over zero-shot on out-of-domain speech benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28664","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Activation Steering for Synthetic Data Generation: The Role of Diversity in Downstream Safety Detection","primary_cat":"cs.LG","submitted_at":"2026-05-27T15:59:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Activation steering produces synthetic safety-violating data that improves downstream classifiers over prompting on most tested concepts when a harmonic mean of alignment, coherence, and diversity is optimized.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23040","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Steered Generation via Gradient-Based Optimization on Sparse Query Features","primary_cat":"cs.LG","submitted_at":"2026-05-21T21:13:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Prototype-Based Sparse Steering decomposes query activations with SAEs and optimizes sparse features via gradients to steer LLM outputs toward specific behaviors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17231","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FishBack: Pullback Fisher Geometry for Optimal Activation Steering in Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-17T03:00:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FishBack derives a closed-form minimum-distortion steering direction from the pullback Fisher metric of the softmax layer, outperforming Euclidean baselines on GPT-2 verb-morphology tasks with lower off-target KL divergence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15604","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VSPO: Vector-Steered Policy Optimization for Behavioral Control","primary_cat":"cs.LG","submitted_at":"2026-05-15T04:31:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VSPO samples rollouts at varying steering intensities to improve behavioral control in LLMs while preserving task accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12890","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Steer-to-Detect: Probing Hidden Representations for Detection of LLM-Generated Texts","primary_cat":"stat.AP","submitted_at":"2026-05-13T02:14:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Steer-to-Detect learns a steering vector injected into LLM hidden states to boost class separability and applies hypothesis testing with finite-sample Type I/II error guarantees for generated-text detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12412","ref_index":75,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Stories in Space: In-Context Learning Trajectories in Conceptual Belief Space","primary_cat":"cs.CL","submitted_at":"2026-05-12T17:09:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs perform in-context learning as trajectories through a structured low-dimensional conceptual belief space, with the structure visible in both behavior and internal representations and causally manipulable via interventions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16362","ref_index":17,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When Is Rank-1 Steering Cheap? Geometry, Granularity, and Budgeted Search","primary_cat":"cs.LG","submitted_at":"2026-05-09T14:26:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Prompt-boundary directional alignment enables geometry-guided search that cuts trials to 95% best utility by 39.8% on average, while concept granularity predicts remaining difficulty via directional heterogeneity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.24535","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Linear Steering: Unified Multi-Attribute Control for Language Models","primary_cat":"cs.LG","submitted_at":"2025-05-30T12:41:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"K-Steering uses a non-linear multi-label classifier on activations to compute gradient-based intervention directions for unified multi-attribute control in LLMs, outperforming linear baselines on ToneBank and DebateMix benchmarks across three model families.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}