{"total":12,"items":[{"citing_arxiv_id":"2605.12288","ref_index":106,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TokenRatio: Principled Token-Level Preference Optimization via Ratio Matching","primary_cat":"cs.CL","submitted_at":"2026-05-12T15:44:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TBPO derives a token-level preference optimization objective from sequence-level pairwise data via Bregman divergence ratio matching that generalizes DPO and improves alignment quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09433","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Offline Preference Optimization for Rectified Flow with Noise-Tracked Pairs","primary_cat":"cs.CV","submitted_at":"2026-05-10T09:13:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PNAPO augments preference data with prior noise pairs and uses straight-line interpolation to create a tighter surrogate objective for offline alignment of rectified flow models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09079","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CauSim: Scaling Causal Reasoning with Increasingly Complex Causal Simulators","primary_cat":"cs.AI","submitted_at":"2026-05-09T17:39:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CauSim turns scarce causal reasoning labels into scalable supervised data by having LLMs incrementally construct complex executable structural causal models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06987","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Response Time Enhances Alignment with Heterogeneous Preferences","primary_cat":"cs.LG","submitted_at":"2026-05-07T22:05:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Response times modeled as drift-diffusion processes enable consistent estimation of population-average preferences from heterogeneous anonymous binary choices.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04559","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Static Best-of-N: Bayesian List-wise Alignment for LLM-based Recommendation","primary_cat":"cs.IR","submitted_at":"2026-05-06T07:02:57+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BLADE uses Bayesian list-wise alignment with dynamic estimation to create a self-evolving target that overcomes limitations of static references in LLM-based recommendation, yielding sustained gains in ranking and complex metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01474","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReMedi: Reasoner for Medical Clinical Prediction","primary_cat":"cs.CL","submitted_at":"2026-05-02T14:44:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ReMedi boosts LLM performance on EHR clinical predictions by up to 19.9% F1 through ground-truth-guided rationale regeneration and fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19016","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AlignCultura: Towards Culturally Aligned Large Language Models?","primary_cat":"cs.CL","submitted_at":"2026-04-21T03:06:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Align-Cultura introduces the CULTURAX dataset and shows that culturally fine-tuned LLMs improve joint HHH scores by 4-6%, cut cultural failures by 18%, and gain 10-12% efficiency with minimal leakage.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13305","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Bias at the End of the Score","primary_cat":"cs.CV","submitted_at":"2026-04-14T21:20:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reward models used as quality scorers in text-to-image generation encode demographic biases that cause reward-guided training to sexualize female subjects, reinforce stereotypes, and reduce diversity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05341","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Curr-RLCER:Curriculum Reinforcement Learning For Coherence Explainable Recommendation","primary_cat":"cs.IR","submitted_at":"2026-04-07T02:25:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Curr-RLCER applies curriculum reinforcement learning with coherence-driven rewards to align generated explanations with predicted ratings in explainable recommendation systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.05470","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Flow-GRPO: Training Flow Matching Models via Online RL","primary_cat":"cs.CV","submitted_at":"2025-05-08T17:58:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Flow-GRPO is the first online RL method for flow matching models, raising GenEval accuracy from 63% to 95% and text-rendering accuracy from 59% to 92% with little reward hacking.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.13918","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Improving Video Generation with Human Feedback","primary_cat":"cs.CV","submitted_at":"2025-01-23T18:55:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A human preference dataset and VideoReward model enable Flow-DPO and Flow-NRG to produce smoother, better-aligned videos from text prompts in flow-based generators.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2308.08998","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reinforced Self-Training (ReST) for Language Modeling","primary_cat":"cs.CL","submitted_at":"2023-08-17T14:12:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReST improves LLM translation quality on benchmarks via offline RL on self-generated data, achieving gains in a compute-efficient way compared to typical RLHF.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}