{"total":20,"items":[{"citing_arxiv_id":"2605.10937","ref_index":77,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Power Reinforcement Post-Training of Text-to-Image Models with Super-Linear Advantage Shaping","primary_cat":"cs.CV","submitted_at":"2026-05-11T17:59:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Super-Linear Advantage Shaping (SLAS) introduces a non-linear geometric policy update for RL post-training of text-to-image models that reshapes the local policy space via advantage-dependent Fisher-Rao weighting to reduce reward hacking and improve performance over GRPO baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09291","ref_index":130,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"dFlowGRPO: Rate-Aware Policy Optimization for Discrete Flow Models","primary_cat":"cs.LG","submitted_at":"2026-05-10T03:36:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"dFlowGRPO is a new rate-aware RL method for discrete flow models that outperforms prior GRPO approaches on image generation and matches continuous flow models while supporting broad probability paths.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09125","ref_index":39,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Transfer Learning of Multiobjective Indirect Low-Thrust Trajectories Using Diffusion Models and Markov Chain Monte Carlo","primary_cat":"eess.SY","submitted_at":"2026-05-09T19:15:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A homotopy-plus-MCMC data-generation pipeline trains a mass-conditioned diffusion model that yields 40% more feasible initial costates and a better Pareto front for multiobjective indirect low-thrust transfers than adjoint-control-transformation baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07861","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Synthetic to Real: Toward Identity-Consistent Makeup Transfer with Synthetic and Real Data","primary_cat":"cs.CV","submitted_at":"2026-05-08T15:21:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The work creates identity-consistent synthetic makeup data via ConsistentBeauty and adapts models to real images using reinforcement learning in RealBeauty, achieving better identity preservation and real-world performance than prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06987","ref_index":97,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Response Time Enhances Alignment with Heterogeneous Preferences","primary_cat":"cs.LG","submitted_at":"2026-05-07T22:05:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Response times modeled as drift-diffusion processes enable consistent estimation of population-average preferences from heterogeneous anonymous binary choices.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04653","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Threshold-Guided Optimization for Visual Generative Models","primary_cat":"cs.LG","submitted_at":"2026-05-06T08:59:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A threshold-guided alignment method lets visual generative models be optimized directly from scalar human ratings instead of requiring paired preference data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02439","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Anomaly-Preference Image Generation","primary_cat":"cs.CV","submitted_at":"2026-05-04T10:37:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Anomaly Preference Optimization reformulates anomalous image synthesis as preference learning with implicit alignment from real anomalies and a time-aware capacity allocation module for diffusion models to balance diversity and fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23632","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Hallo-Live: Real-Time Streaming Joint Audio-Video Avatar Generation with Asynchronous Dual-Stream and Human-Centric Preference Distillation","primary_cat":"cs.CV","submitted_at":"2026-04-26T09:46:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Hallo-Live achieves 20.38 FPS real-time text-to-audio-video avatar generation with 0.94s latency using asynchronous dual-stream diffusion and HP-DMD preference distillation, matching teacher model quality at 16x higher throughput.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23380","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"V-GRPO: Online Reinforcement Learning for Denoising Generative Models Is Easier than You Think","primary_cat":"cs.LG","submitted_at":"2026-04-25T17:03:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"V-GRPO makes ELBO surrogates stable and efficient for online RL alignment of denoising models, delivering SOTA text-to-image performance with 2-3x speedups over MixGRPO and DiffusionNFT.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19009","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Guiding Distribution Matching Distillation with Gradient-Based Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-21T02:57:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GDMD replaces raw-sample rewards with distillation-gradient rewards in RL-guided diffusion distillation, yielding 4-step models that surpass their multi-step teachers on GenEval and human preference metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18518","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"UDM-GRPO: Stable and Efficient Group Relative Policy Optimization for Uniform Discrete Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-04-20T17:16:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UDM-GRPO is the first RL integration for uniform discrete diffusion models, using final clean samples as actions and forward-process trajectory reconstruction to raise GenEval accuracy from 69% to 96% and OCR accuracy from 8% to 57%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14379","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Step-level Denoising-time Diffusion Alignment with Multiple Objectives","primary_cat":"cs.LG","submitted_at":"2026-04-15T19:52:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MSDDA derives a closed-form optimal reverse denoising distribution for multi-objective diffusion alignment that is exactly equivalent to step-level RL fine-tuning with no approximation error.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06916","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"FP4 Explore, BF16 Train: Diffusion Reinforcement Learning via Efficient Rollout Scaling","primary_cat":"cs.LG","submitted_at":"2026-04-08T10:14:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sol-RL decouples FP4-based candidate exploration from BF16 policy optimization in diffusion RL, delivering up to 4.64x faster convergence with maintained or superior alignment performance on models like FLUX.1 and SD3.5.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.16117","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DiffusionNFT: Online Diffusion Reinforcement with Forward Process","primary_cat":"cs.LG","submitted_at":"2025-09-19T16:09:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DiffusionNFT performs online RL for diffusion models on the forward process via flow matching and positive-negative contrasts, delivering up to 25x efficiency gains and rapid benchmark improvements over prior reverse-process methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.21802","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MixGRPO: Unlocking Flow-based GRPO Efficiency with Mixed ODE-SDE","primary_cat":"cs.AI","submitted_at":"2025-07-29T13:40:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MixGRPO speeds up GRPO for flow-based image generators by restricting SDE sampling and optimization to a sliding window while using ODE elsewhere, cutting training time by up to 71% with better alignment performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.05470","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Flow-GRPO: Training Flow Matching Models via Online RL","primary_cat":"cs.CV","submitted_at":"2025-05-08T17:58:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Flow-GRPO is the first online RL method for flow matching models, raising GenEval accuracy from 63% to 95% and text-rendering accuracy from 59% to 92% with little reward hacking.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.05236","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Unified Reward Model for Multimodal Understanding and Generation","primary_cat":"cs.CV","submitted_at":"2025-03-07T08:36:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UnifiedReward is the first unified reward model that jointly assesses multimodal understanding and generation to provide better preference signals for aligning vision models via DPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.13918","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Improving Video Generation with Human Feedback","primary_cat":"cs.CV","submitted_at":"2025-01-23T18:55:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A human preference dataset and VideoReward model enable Flow-DPO and Flow-NRG to produce smoother, better-aligned videos from text prompts in flow-based generators.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2306.09341","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Human Preference Score v2: A Solid Benchmark for Evaluating Human Preferences of Text-to-Image Synthesis","primary_cat":"cs.CV","submitted_at":"2023-06-15T17:59:31+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HPD v2 is the largest human preference dataset for text-to-image images with 798k choices, and HPS v2 is the resulting CLIP-based scorer that better predicts human judgments and responds to model improvements.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.13301","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Training Diffusion Models with Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2023-05-22T17:57:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DDPO uses policy gradients on the denoising process to optimize diffusion models for arbitrary rewards like human feedback or compressibility.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}