{"total":13,"items":[{"citing_arxiv_id":"2606.00140","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Geometric Erasure by Contrastive Velocity Matching in Rectified Flows","primary_cat":"cs.LG","submitted_at":"2026-05-29T00:02:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"GEM bridges trajectory-based unlearning and teacher-guided erasure to create a geometric guidance objective for targeted concept suppression in Rectified Flow models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29390","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Orthogonal Negative Guidance in Attention Feature Space for Text-to-Image Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T05:43:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Orthogonal Negative Guidance subtracts only the orthogonal component of negative-prompt attention features from positive ones in FLUX models to suppress concepts while preserving semantics and quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19739","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FlowErase-RL: Rethinking Concept Erasure as Reward Optimization in Flow Matching Models","primary_cat":"cs.CV","submitted_at":"2026-05-19T12:10:09+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18719","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SafeDiffusion-R1: Online Reward Steering for Safe Diffusion Post-Training","primary_cat":"cs.CV","submitted_at":"2026-05-18T17:50:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SafeDiffusion-R1 uses online GRPO with CLIP embedding steering to cut inappropriate content from 48.9% to 18.07% and nudity detections from 646 to 15 in diffusion models while raising GenEval scores from 42.08% to 47.83% and generalizing across seven harm categories without supervised pairs or extra","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01113","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Disciplined Diffusion: Text-to-Image Diffusion Model against NSFW Generation","primary_cat":"cs.CV","submitted_at":"2026-05-01T21:33:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DDiffusion uses semantic retrieval on prompt embeddings and localized editing inside the diffusion process to suppress NSFW content while avoiding binary allow/block signals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25119","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Evaluation without Generation: Non-Generative Assessment of Harmful Model Specialization with Applications to CSAM","primary_cat":"cs.LG","submitted_at":"2026-04-28T01:54:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Gaussian probing infers harmful model specialization from parameter perturbations and internal representation responses to Gaussian latent ensembles rather than from generated outputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10032","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Closed-Form Concept Erasure via Double Projections","primary_cat":"cs.LG","submitted_at":"2026-04-11T05:06:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A training-free double-projection linear transformation erases target concepts from generative models by computing a proxy projection then applying a constrained update in the left null space of known directions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09253","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mosaic: Multimodal Jailbreak against Closed-Source VLMs via Multi-View Ensemble Optimization","primary_cat":"cs.CV","submitted_at":"2026-04-10T12:09:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Mosaic combines text perturbation, multi-view image optimization, and surrogate model ensembles to reduce reliance on any single open-source model and achieve higher attack success rates on commercial closed-source VLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09213","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SHIFT: Steering Hidden Intermediates in Flow Transformers","primary_cat":"cs.CV","submitted_at":"2026-04-10T11:07:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SHIFT learns and applies steering vectors to selected layers and timesteps in DiT models to suppress concepts, shift styles, or bias objects while keeping image quality and prompt adherence intact.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06285","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Harnessing Hyperbolic Geometry for Harmful Prompt Detection and Sanitization","primary_cat":"cs.CR","submitted_at":"2026-04-07T12:49:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HyPE detects harmful prompts as outliers in hyperbolic space and HyPS sanitizes them using explainable attribution, outperforming prior defenses in accuracy and robustness across datasets and adversarial scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04575","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Erasure or Erosion? Evaluating Compositional Degradation in Unlearned Text-To-Image Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-04-06T10:16:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Unlearning methods that strongly erase concepts from text-to-image diffusion models consistently degrade performance on attribute binding, spatial reasoning, and counting tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.14194","ref_index":55,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VLBiasBench: A Comprehensive Benchmark for Evaluating Bias in Large Vision-Language Model","primary_cat":"cs.CV","submitted_at":"2024-06-20T10:56:59+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLBiasBench is a new large-scale benchmark with 128,342 samples covering nine social bias categories plus two intersectional ones to evaluate biases in LVLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.12508","ref_index":137,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SalUn: Empowering Machine Unlearning via Gradient-based Weight Saliency in Both Image Classification and Generation","primary_cat":"cs.LG","submitted_at":"2023-10-19T06:17:17+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SalUn uses gradient-based weight saliency to achieve effective machine unlearning of data, classes, or concepts in image classification and generation, narrowing the gap to exact retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}