{"total":37,"items":[{"citing_arxiv_id":"2606.30810","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Knowledge Alignment in Code LLMs: Contrastive Unlearning for Evolving APIs","primary_cat":"cs.SE","submitted_at":"2026-06-29T18:34:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CURE applies contrastive unlearning to reduce deprecated API usage in code LLMs and improve correct replacements on a benchmark dataset while preserving general performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25198","ref_index":72,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Heuresis: Search Strategies for Autonomous AI Research Agents Across Quality, Diversity and Novelty","primary_cat":"cs.AI","submitted_at":"2026-06-23T21:44:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Heuresis evaluates six search strategies for autonomous ML research agents and finds that novel ideas are rare, none rated original, and only one reaches top-10 quality while strategies steer axes but do not expand the quality-novelty frontier.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19222","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mechanism-Guided Selective Unlearning for RLVR-Induced Reasoning","primary_cat":"cs.LG","submitted_at":"2026-06-17T15:59:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MAST ranks attention-projection tensors by off-principal energy, update magnitude, and forget-gradient coupling to selectively unlearn RLVR-induced reasoning, achieving significant forgetting on MATH while preserving GSM8K and retain MATH unlike full-parameter updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17250","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Groups in Critic-Free RLVR","primary_cat":"cs.LG","submitted_at":"2026-06-15T19:49:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Negative token filtering enables single-rollout critic-free RL training by avoiding false penalties on negative samples, matching group-based methods on reasoning tasks and exceeding them on agentic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17168","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RepSelect: Robust LLM Unlearning via Representation Selectivity","primary_cat":"cs.CL","submitted_at":"2026-06-15T18:06:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RepSelect isolates forget-set-specific representations via gradient PCA collapse to achieve 4-50x better post-relearning robustness than baselines across multiple models and forget categories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12841","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TimeROME-DLM: Temporal Causal Tracing and Low-Rank Inference-Time Knowledge Editing for Masked Diffusion Language Models","primary_cat":"cs.LG","submitted_at":"2026-06-11T03:09:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TimeROME-DLM enables training-free knowledge editing in masked diffusion language models via temporal causal tracing and low-rank residual edit memory applied at inference time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10989","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Null-Space Constrained Low-Rank Adaptation for Response-Specified Large Language Model Unlearning","primary_cat":"cs.AI","submitted_at":"2026-06-09T15:26:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"NSRU constrains LoRA updates via null-space projection of retain subspaces to jointly optimize safe-target learning, undesired-response suppression, and retention in LLM unlearning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02293","ref_index":75,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AI as a Tool for Simulation-Based Experiments in Literary Studies","primary_cat":"cs.CL","submitted_at":"2026-06-01T14:16:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Proposes AI-driven simulations for literary-historical experiments and reports preliminary text-generation results claiming the first limited in-distribution outputs matching human novels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30514","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MAAT: Multi-phase Adapter-Aware Targeted Unlearning","primary_cat":"cs.LG","submitted_at":"2026-05-28T19:52:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces 5WBENCH balanced benchmark across 5W categories and MAAT three-phase adapter unlearning method that targets causal Why-type knowledge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00105","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Visual-Noise Guided In-Context Distillation for Multimodal Large Language Model Unlearning","primary_cat":"cs.CV","submitted_at":"2026-05-26T12:49:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VGID constructs an intervention-induced teacher distribution via visual perturbation plus textual in-context unlearning and distills it into the student MLLM to achieve parameter-level forgetting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20915","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Calibration vs Decision Making: Revisiting the Reliability Paradox in Unlearned Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-20T08:59:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Unlearned language models retain low calibration error but show increased shortcut reliance on the TOFU benchmark, extending the reliability paradox to machine unlearning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18253","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Machine Unlearning for Masked Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-18T11:54:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MDU minimizes forward KL divergence from prompt-conditional to prompt-masked unconditional predictions at masked positions to unlearn knowledge in MDLMs while trading off privacy and utility via temperature scaling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15687","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ASRU: Activation Steering Meets Reinforcement Unlearning for Multimodal Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-15T07:22:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ASRU combines activation redirection and reward-optimized fine-tuning to unlearn cross-modal sensitive knowledge in MLLMs, reporting +24.6% better unlearning effectiveness and 5.8x higher generation quality on Qwen3-VL while preserving utility with limited retained data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14636","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Teaching Large Language Models When Not to Know: Learning Temporal Critique for Ex-Ante Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-14T09:49:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TCFT trains LLMs on temporal critique tasks to reduce post-cutoff knowledge leakage by 37-42 percentage points over prompting and standard SFT on Qwen models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14514","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Defenses at Odds: Measuring and Explaining Defense Conflicts in Large Language Models","primary_cat":"cs.CR","submitted_at":"2026-05-14T07:58:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Sequential LLM defense deployment leads to risk exacerbation in 38.9% of cases due to anti-aligned updates in shared critical layers, addressed by conflict-guided layer freezing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14404","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Knowledge Beyond Language: Bridging the Gap in Multilingual Machine Unlearning Evaluation","primary_cat":"cs.CL","submitted_at":"2026-05-14T05:45:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"New metrics KSS and KPS are introduced to evaluate multilingual machine unlearning quality and cross-language consistency in LLMs, addressing limitations of single-language evaluation protocols.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13595","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Inducing Artificial Uncertainty in Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-13T14:30:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Inducing artificial uncertainty on trivial tasks allows training probes that achieve higher calibration on hard data than standard approaches while retaining performance on easy data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08800","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PPU-Bench:Real World Benchmark for Personalized Partial Unlearning in Vision Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-09T08:46:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PPU-Bench is a real-world benchmark exposing forget-retain trade-offs in MLLM unlearning and motivating Boundary-Aware Optimization to enforce intra-subject factual boundaries.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"QA CU SU PU MMUBench [7] Real world Concept-level - 20 1K 2K✓ ✓ ✗ ✗ MLLMU [15] Synthetic Private data✗500 1.2K 20.7K✗ ✓ ✗ ✗ PEBench [23] Synthetic Identities&events ✗200 8K 16K✗ ✓ ✗ ✗ CLEAR [3] Synthetic Identity✗200 3.7K 4K✗ ✓ ✗ ✗ UMU-bench [22] Synthetic Private data✗500 1.2K 20.7K✗ ✓ ✗ ✗ FIU-bench [17] Synthetic Identity✗400 0.4K 8K✓ ✓ ✗ ✗ OFFSIDE [26] Real&Synthetic Football rumors✗80 0.6K 15.7K✗ ✓ ✓ ✗ PPU-Bench (ours) Real world Profile information✓500 2K 24K✓ ✓ ✓ ✓ training corpora. Thus, machine unlearning has emerged as a practical alternative, aiming to remove specific knowledge from a trained model while preserving its overall utility. Recent studies increasingly focus on machine unlearning in multimodal scenarios."},{"citing_arxiv_id":"2605.08765","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Unlearners Can Lie: Evaluating and Improving Honesty in LLM Unlearning","primary_cat":"cs.LG","submitted_at":"2026-05-09T07:50:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Existing LLM unlearning methods fail honesty standards by hallucinating on forgotten knowledge; ReVa improves rejection rates nearly twofold while enhancing retained honesty.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07242","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MEMOREPAIR: Barrier-First Cascade Repair in Agentic Memory","primary_cat":"cs.AI","submitted_at":"2026-05-08T04:57:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MemoRepair formalizes the cascade update problem in agentic memory and solves it via a min-cut reduction that eliminates invalidated memory exposure to 0% while recovering 91-94% of valid successors at 57-76% of baseline repair cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05938","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ICU-Bench:Benchmarking Continual Unlearning in Multimodal Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-07T09:46:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ICU-Bench is a new continual unlearning benchmark for MLLMs using 1000 privacy profiles, 9500 images, and 100 forget tasks, showing existing methods fail to balance forgetting, utility, and scalability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05909","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Null Space Constrained Contrastive Visual Forgetting for MLLM Unlearning","primary_cat":"cs.AI","submitted_at":"2026-05-07T09:18:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A contrastive visual forgetting technique constrained to the null space of retained knowledge enables targeted unlearning of visual concepts in MLLMs while preserving non-target visual and all textual knowledge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04653","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Threshold-Guided Optimization for Visual Generative Models","primary_cat":"cs.LG","submitted_at":"2026-05-06T08:59:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A threshold-guided alignment method lets visual generative models be optimized directly from scalar human ratings instead of requiring paired preference data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02196","ref_index":8,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DurableUn: Quantization-Induced Recovery Attacks in Machine Unlearning","primary_cat":"cs.LG","submitted_at":"2026-05-04T03:54:14+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"INT4 quantization recovers up to 22 times more forgotten training data in unlearned LLMs, and the proposed DURABLEUN-SAF method is the first to maintain forgetting across BF16, INT8, and INT4 precisions.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"Machine unlearning has been extensively studied through gradient -based, data-perturbation, and representation-level approaches. Gradient Ascent (GA) [ 6] directly maximizes the forget loss, while SCRUB [7] regularizes this objective with a Kullback-Leibler (KL) divergence term from a frozen reference model. Preference-based methods such as Negative Preference Optimization (NPO) [ 8] optimize against undesired outputs, whereas Saliency-based Unlearning (SalUn) [ 9] suppresses influential parameters via gradient -based masking. Representation-level approaches including Representation Manipulation Unlearning (RMU) [10] and AlphaEdit [11] modify internal feature spaces, while Gradient Difference regularization (GradDiff) [ 12] constrains divergence on"},{"citing_arxiv_id":"2605.01129","ref_index":54,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Revisiting Privacy Leakage in Machine Unlearning: Membership Inference Beyond the Forgotten Set","primary_cat":"cs.CR","submitted_at":"2026-05-01T21:57:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TC-UMIA is a population-level attack using pre- and post-unlearning predictions to infer membership across forget, retain, and unseen sets, revealing added privacy leakage to retained data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22076","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PrivUn: Unveiling Latent Ripple Effects and Shallow Forgetting in Privacy Unlearning","primary_cat":"cs.LG","submitted_at":"2026-04-23T21:01:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PrivUn shows privacy unlearning in LLMs produces gradient-driven ripple effects and only shallow forgetting across layers, with new strategies proposed for deeper removal.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21571","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Separable Expert Architecture: Toward Privacy-Preserving LLM Personalization via Composable Adapters and Deletable User Proxies","primary_cat":"cs.AI","submitted_at":"2026-04-23T11:51:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A separable expert architecture uses base models, LoRA adapters, and deletable per-user proxies to enable privacy-preserving personalization and deterministic unlearning in LLMs.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Exact unlearning methods like SISA [5] require maintaining independently trained model shards, while approximate methods offer no formal re- moval guarantees [6]. LLM-specific approaches face additional difficulties: Gradient ascent can cause catas- trophic collapse in certain unlearning configurations [7], and representation-level methods like RMU [8] still mod- ify shared weights. This problem is compounded by extraction attacks, including model inversion [9], train- ing data extraction [10, 11], and membership inference [12], which can recover private information from weight- encoded personalization,making it a privacy issue even absent deletion requests. To illustrate this, consider a personalized assistant that has learned a user's medical"},{"citing_arxiv_id":"2604.18966","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Self-Improving Tabular Language Models via Iterative Reward-Guided Post-Training","primary_cat":"cs.LG","submitted_at":"2026-04-21T01:29:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TabGRAA applies group-relative advantage alignment in an iterative reward-guided post-training loop to improve tabular language model generators on fidelity, utility, and privacy trade-offs across five benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"θ = 1 B X y∈Bhigh rθ(y),¯r low θ = 1 B X y∈Blow rθ(y).(6) Extending the Bradley-Terry preference model from sam- ple pairs togroup pairs, the probability that group Bhigh is preferred toB low is P(B high ≻ Blow |θ) =σ ¯rhigh θ −¯rlow θ \u0001 ,(7) whereσ(x) = (1 +e −x)−1. Thegroup relative advantage is the logit of (7): ∆group(θ) = ¯r high θ −¯rlow θ .(8) GRAA loss.We take the GRAA loss to be the sigmoid on the negative group advantage: LGRAA(θ) =σ −∆group(θ) \u0001 =σ ¯rlow θ −¯rhigh θ \u0001 . (9) Minimizing (9) is equivalent to maximizing the group pref- erence probability (7) (both push ∆group upward). The only hyperparameter is β, which is absorbed into rθ. Compared to the earlier formulation with three tunable coefficients"},{"citing_arxiv_id":"2604.07962","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Is your algorithm unlearning or untraining?","primary_cat":"cs.LG","submitted_at":"2026-04-09T08:24:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Machine unlearning conflates reversing the influence of specific training examples (untraining) with removing the full underlying distribution or behavior (unlearning).","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.23798","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MPU: Towards Secure and Privacy-Preserving Knowledge Unlearning for Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-02-27T08:39:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MPU is a framework that achieves privacy-preserving unlearning for LLMs by distributing perturbed model copies for local client-side unlearning followed by server-side aggregation with harmonic denoising.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.19728","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Hard Negative Sample-Augmented DPO Post-Training for Small Language Models","primary_cat":"cs.LG","submitted_at":"2025-12-17T06:15:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A six-dimensional MathVerifier supplies hard negatives and per-sample weights that improve DPO performance on math reasoning for a 1.5B Qwen2.5 model over standard SFT and unweighted DPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.12469","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Sparse Concept Anchoring for Interpretable and Controllable Neural Representations","primary_cat":"cs.LG","submitted_at":"2025-12-13T21:43:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sparse Concept Anchoring biases neural latent spaces toward targeted concepts using under 0.1% labels per concept, enabling reversible steering via projection and permanent removal via weight ablation with minimal side effects on other features.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.00778","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Is Preference Optimization Doing, and Why?","primary_cat":"cs.LG","submitted_at":"2025-11-30T08:27:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Gradient analysis and ablations show DPO and PPO have different target directions and component roles in preference optimization for LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.00761","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Downgrade to Upgrade: Optimizer Simplification Enhances Robustness in LLM Unlearning","primary_cat":"cs.LG","submitted_at":"2025-10-01T10:50:14+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Downgrading optimizers to lower-information variants during LLM unlearning yields more robust forgetting on MUSE and WMDP benchmarks by converging to harder-to-perturb loss basins.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.20941","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Revisiting the Past: Data Unlearning with Model State History","primary_cat":"cs.LG","submitted_at":"2025-06-26T02:16:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MSA performs data unlearning in LLMs by arithmetic operations on prior model checkpoints to remove targeted datapoint influence, with experiments showing competitive or better results than existing unlearning methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.16831","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unlearning Isn't Deletion: Investigating Reversibility of Machine Unlearning in LLMs","primary_cat":"cs.CL","submitted_at":"2025-05-22T16:02:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Machine unlearning in LLMs is often reversible via fine-tuning, indicating suppression not deletion, and a new representation-level framework identifies four forgetting regimes based on reversibility and catastrophicity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2409.12917","ref_index":89,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Training Language Models to Self-Correct via Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2024-09-19T17:16:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SCoRe uses multi-turn online RL with regularization on self-generated traces to improve LLM self-correction, achieving 15.6% and 9.1% gains on MATH and HumanEval for Gemini models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}