{"total":12,"items":[{"citing_arxiv_id":"2605.12480","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OmniNFT: Modality-wise Omni Diffusion Reinforcement for Joint Audio-Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:56:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniNFT introduces modality-wise advantage routing, layer-wise gradient surgery, and region-wise loss reweighting in an online diffusion RL framework to improve audio-video quality, alignment, and synchronization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12112","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When Policy Entropy Constraint Fails: Preserving Diversity in Flow-based RLHF via Perceptual Entropy","primary_cat":"cs.CV","submitted_at":"2026-05-12T13:29:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Policy entropy remains constant in flow-matching models during RLHF due to fixed noise schedules while perceptual diversity collapses from mode-seeking policy gradients, so perceptual entropy constraints are introduced to preserve diversity and improve quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10937","ref_index":99,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Power Reinforcement Post-Training of Text-to-Image Models with Super-Linear Advantage Shaping","primary_cat":"cs.CV","submitted_at":"2026-05-11T17:59:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Super-Linear Advantage Shaping (SLAS) introduces a non-linear geometric policy update for RL post-training of text-to-image models that reshapes the local policy space via advantage-dependent Fisher-Rao weighting to reduce reward hacking and improve performance over GRPO baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25427","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Systematic Post-Train Framework for Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-28T09:34:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A post-training pipeline for video generation models combines SFT, RLHF with novel GRPO, prompt enhancement, and inference optimization to improve visual quality, temporal coherence, and instruction following.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23380","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"V-GRPO: Online Reinforcement Learning for Denoising Generative Models Is Easier than You Think","primary_cat":"cs.LG","submitted_at":"2026-04-25T17:03:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"V-GRPO makes ELBO surrogates stable and efficient for online RL alignment of denoising models, delivering SOTA text-to-image performance with 2-3x speedups over MixGRPO and DiffusionNFT.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19406","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HP-Edit: A Human-Preference Post-Training Framework for Image Editing","primary_cat":"cs.CV","submitted_at":"2026-04-21T12:29:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HP-Edit introduces a post-training framework and RealPref-50K dataset that uses a VLM-based HP-Scorer to align diffusion image editing models with human preferences, improving outputs on Qwen-Image-Edit-2509.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19234","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning to Credit the Right Steps: Objective-aware Process Optimization for Visual Generation","primary_cat":"cs.CV","submitted_at":"2026-04-21T08:37:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OTCA improves GRPO training for visual generation by estimating step importance in trajectories and adaptively weighting multiple reward objectives.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19009","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Guiding Distribution Matching Distillation with Gradient-Based Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-21T02:57:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GDMD replaces raw-sample rewards with distillation-gradient rewards in RL-guided diffusion distillation, yielding 4-step models that surpass their multi-step teachers on GenEval and human preference metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18518","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"UDM-GRPO: Stable and Efficient Group Relative Policy Optimization for Uniform Discrete Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-04-20T17:16:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UDM-GRPO is the first RL integration for uniform discrete diffusion models, using final clean samples as actions and forward-process trajectory reconstruction to raise GenEval accuracy from 69% to 96% and OCR accuracy from 8% to 57%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14910","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reward-Aware Trajectory Shaping for Few-step Visual Generation","primary_cat":"cs.CV","submitted_at":"2026-04-16T11:52:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RATS lets few-step visual generators surpass multi-step teachers by shaping trajectories with reward-based adaptive guidance instead of strict imitation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06966","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MAR-GRPO: Stabilized GRPO for AR-diffusion Hybrid Image Generation","primary_cat":"cs.CV","submitted_at":"2026-04-08T11:30:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAR-GRPO stabilizes GRPO for AR-diffusion hybrids via multi-trajectory expectation and uncertainty-based token selection, yielding better visual quality, stability, and spatial understanding than baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06916","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FP4 Explore, BF16 Train: Diffusion Reinforcement Learning via Efficient Rollout Scaling","primary_cat":"cs.LG","submitted_at":"2026-04-08T10:14:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sol-RL decouples FP4-based candidate exploration from BF16 policy optimization in diffusion RL, delivering up to 4.64x faster convergence with maintained or superior alignment performance on models like FLUX.1 and SD3.5.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}