{"total":20,"items":[{"citing_arxiv_id":"2605.12809","ref_index":215,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Correcting Influence: Unboxing LLM Outputs with Orthogonal Latent Spaces","primary_cat":"cs.LG","submitted_at":"2026-05-12T23:01:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A latent mediation framework with sparse autoencoders enables non-additive token-level influence attribution in LLMs by learning orthogonal features and back-propagating attributions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11134","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Spurious Correlation Learning in Preference Optimization: Mechanisms, Consequences, and Mitigation via Tie Training","primary_cat":"cs.LG","submitted_at":"2026-05-11T18:41:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Standard preference learning induces spurious feature reliance via mean bias and correlation leakage, creating irreducible distribution shift vulnerabilities that tie training mitigates without degrading causal learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10937","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Power Reinforcement Post-Training of Text-to-Image Models with Super-Linear Advantage Shaping","primary_cat":"cs.CV","submitted_at":"2026-05-11T17:59:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Super-Linear Advantage Shaping (SLAS) introduces a non-linear geometric policy update for RL post-training of text-to-image models that reshapes the local policy space via advantage-dependent Fisher-Rao weighting to reduce reward hacking and improve performance over GRPO baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08556","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Can Revealed Preferences Clarify LLM Alignment and Steering?","primary_cat":"cs.LG","submitted_at":"2026-05-08T23:26:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs show partial internal coherence in medical decisions but frequently fail to accurately report their preferences or adopt user-directed ones via prompting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08378","ref_index":79,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reinforcement Learning for Scalable and Trustworthy Intelligent Systems","primary_cat":"cs.LG","submitted_at":"2026-05-08T18:36:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Reinforcement learning is advanced for communication-efficient federated optimization and for preference-aligned, contextually safe policies in large language 
models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07724","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Curated Synthetic Data Doesn't Have to Collapse: A Theoretical Study of Generative Retraining with Pluralistic Preferences","primary_cat":"cs.LG","submitted_at":"2026-05-08T13:27:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Recursive generative retraining with pluralistic preferences converges to a stable diverse distribution that satisfies a weighted Nash bargaining solution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07063","ref_index":124,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Dr. Post-Training: A Data Regularization Perspective on LLM Post-Training","primary_cat":"cs.LG","submitted_at":"2026-05-08T00:16:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dr. Post-Training reframes general data as a data-induced regularizer for LLM post-training updates, yielding a family of methods that outperform data-selection baselines on SFT, RLHF, and RLVR tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02495","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Efficient Preference Poisoning Attack on Offline RLHF","primary_cat":"cs.LG","submitted_at":"2026-05-04T11:45:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Label-flip attacks on log-linear DPO reduce to binary sparse approximation problems that can be solved efficiently by lattice-based and binary matching pursuit methods with recovery guarantees.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01954","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Moira: Language-driven Hierarchical Reinforcement Learning for Pair Trading","primary_cat":"cs.AI","submitted_at":"2026-05-03T16:37:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Moira parameterizes hierarchical RL policies for pair trading with LLMs and adapts them via prompt updates based on trajectory and episode feedback, outperforming baselines on real market data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.28010","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning from Disagreement: Clinician Overrides as Implicit Preference Signals for Clinical AI in Value-Based Care","primary_cat":"cs.LG","submitted_at":"2026-04-30T15:30:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Clinician overrides of AI recommendations provide implicit preference signals for training clinical AI, addressed via a new framework with five-category taxonomy, patient-state and clinician-capability conditioned preferences, and dual reward-capability learning to prevent suppression 
bias.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25895","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Three Models of RLHF Annotation: Extension, Evidence, and Authority","primary_cat":"cs.CY","submitted_at":"2026-04-28T17:39:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RLHF should decompose annotations into dimensions each matched to one of three models—extension, evidence, or authority—instead of applying a single unified pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21216","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Post-AGI Economies: Autonomy and the First Fundamental Theorem of Welfare Economics","primary_cat":"econ.TH","submitted_at":"2026-04-23T02:13:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The First Fundamental Theorem of Welfare Economics holds for autonomy-complete competitive equilibria that are autonomy-Pareto efficient, with the classical version recovered in the low-autonomy limit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19024","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Policy Gradient Primal-Dual Method for Safe Reinforcement Learning from Human Feedback","primary_cat":"cs.LG","submitted_at":"2026-04-21T03:20:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Primal-dual policy gradient algorithms achieve global non-asymptotic convergence for safe RLHF cast as infinite-horizon discounted CMDPs without fitting reward models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17587","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AIRA: AI-Induced Risk Audit: A Structured Inspection Framework for AI-Generated Code","primary_cat":"cs.SE","submitted_at":"2026-04-19T19:32:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AIRA is a 15-check audit framework that finds AI-generated code has 1.8 times more high-severity failure-untruthful patterns than human-written code in a matched replication study.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13803","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Gaslight, Gatekeep, V1-V3: Early Visual Cortex Alignment Shields Vision-Language Models from Sycophantic Manipulation","primary_cat":"cs.CV","submitted_at":"2026-04-15T12:38:51+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Alignment of vision-language models with human V1-V3 early visual cortex negatively predicts resistance to sycophantic gaslighting attacks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10134","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PlanGuard: Defending Agents against Indirect 
Prompt Injection via Planning-based Consistency Verification","primary_cat":"cs.CR","submitted_at":"2026-04-11T09:59:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PlanGuard cuts indirect prompt injection attack success rate to 0% on the InjecAgent benchmark by verifying agent actions against a user-instruction-only plan while keeping false positives at 1.49%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07754","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Art of (Mis)alignment: How Fine-Tuning Methods Effectively Misalign and Realign LLMs in Post-Training","primary_cat":"cs.CR","submitted_at":"2026-04-09T03:20:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ORPO is most effective at misaligning LLMs while DPO excels at realigning them, though it reduces utility, revealing an asymmetry between attack and defense methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06621","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Theorems of Dr. David Blackwell and Their Contributions to Artificial Intelligence","primary_cat":"cs.GL","submitted_at":"2026-04-08T03:01:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Blackwell's Rao-Blackwell, Approachability, and Informativeness theorems provide frameworks for variance reduction, sequential decisions under uncertainty, and comparing information sources that remain relevant to AI.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02686","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Semantic Manipulation: Token-Space Attacks on Reward Models","primary_cat":"cs.LG","submitted_at":"2026-04-03T03:30:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TOMPA performs black-box adversarial optimization in token space to discover non-linguistic patterns that nearly double the reward scores of GPT-5 answers on Skywork-Reward-V2 while producing gibberish text.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.07974","ref_index":140,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","primary_cat":"cs.SE","submitted_at":"2024-03-12T17:58:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LiveCodeBench collects 400 recent contest problems to create a contamination-free benchmark evaluating LLMs on code generation and related capabilities like self-repair and execution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}
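A minimal consumption sketch for the payload above, in Python. All field names (total, items, limit, offset, novelty_score, submitted_at, citing_arxiv_id, primary_cat, paper_title) come from the response itself; the local filename citations.json and the novelty threshold of 7.0 are illustrative assumptions, not part of any documented API.

#!/usr/bin/env python3
# Sketch: parse the paginated citation listing and list the
# high-novelty citing papers, newest first. Assumes the response
# above was saved as "citations.json" (hypothetical filename).
import json
from datetime import datetime

with open("citations.json") as f:
    page = json.load(f)

# The envelope is paginated: "total" counts all matching records,
# while "items" holds at most "limit" of them starting at "offset".
print(f"{page['total']} citations total, {len(page['items'])} in this page")

# Filter on novelty_score (the 7.0 cutoff is an arbitrary choice) and
# sort by submission timestamp; datetime.fromisoformat handles the
# "+00:00" UTC offsets used in the payload.
novel = [it for it in page["items"] if it["novelty_score"] >= 7.0]
novel.sort(key=lambda it: datetime.fromisoformat(it["submitted_at"]),
           reverse=True)

for it in novel:
    print(f"{it['citing_arxiv_id']}  [{it['primary_cat']}]  "
          f"novelty={it['novelty_score']:.1f}  {it['paper_title']}")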