{"total":10,"items":[{"citing_arxiv_id":"2605.23244","ref_index":17,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Convex Optimization for Alignment and Preference Learning on a Single GPU","primary_cat":"cs.LG","submitted_at":"2026-05-22T05:25:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"COALA applies convex optimization reformulations of neural networks to direct preference optimization, claiming single-GPU training with ~18% of DPO's TFLOPs and competitive performance on multiple datasets and models up to 8B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21123","ref_index":7,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Linear-DPO: Linear Direct Preference Optimization for Diffusion and Flow-Matching Generative Models","primary_cat":"cs.CV","submitted_at":"2026-05-20T12:54:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Linear-DPO replaces sigmoid utility with linear utility and adds EMA reference to improve preference alignment in diffusion and flow-matching text-to-image models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20834","ref_index":2,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Conditional Equivalence of DPO and RLHF: Implicit Assumption, Failure Modes, and Provable Alignment","primary_cat":"cs.AI","submitted_at":"2026-05-20T07:26:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DPO-RLHF equivalence holds only conditionally on the optimal policy preferring human-preferred responses; otherwise DPO optimizes relative advantage and can prefer worse outputs, addressed by introducing CPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11299","ref_index":17,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Primal Generation, Dual Judgment: Self-Training from Test-Time Scaling","primary_cat":"cs.LG","submitted_at":"2026-05-11T22:34:45+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DuST self-trains LLMs for code generation by ranking their own test-time samples via sandbox execution and applying GRPO, improving judgment by +6.2 NDCG and single-sample pass@1 by +3.1 on LiveCodeBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07331","ref_index":19,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Rethinking Importance Sampling in LLM Policy Optimization: A Cumulative Token Perspective","primary_cat":"cs.LG","submitted_at":"2026-05-08T06:35:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The cumulative token IS ratio gives unbiased prefix correction and lower variance than full-sequence ratios for token-level gradients in LLM policy optimization, enabling CTPO to outperform GRPO and GSPO baselines on mathematical reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06650","ref_index":52,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Beyond Negative Rollouts: Positive-Only Policy Optimization with Implicit Negative Gradients","primary_cat":"cs.CL","submitted_at":"2026-05-07T17:55:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"POPO uses bounded importance sampling on positive rollouts and a siamese policy network to achieve implicit negative gradients and stable optimization, matching or exceeding GRPO on math benchmarks such as 36.67% on AIME 2025.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06165","ref_index":80,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Post Reasoning: Improving the Performance of Non-Thinking Models at No Cost","primary_cat":"cs.AI","submitted_at":"2026-05-07T12:51:49+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Post-Reasoning boosts LLM accuracy by reversing the usual answer-after-reasoning order, delivering mean relative gains of 17.37% across 117 model-benchmark pairs with zero extra cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04653","ref_index":26,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Threshold-Guided Optimization for Visual Generative Models","primary_cat":"cs.LG","submitted_at":"2026-05-06T08:59:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A threshold-guided alignment method lets visual generative models be optimized directly from scalar human ratings instead of requiring paired preference data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02971","ref_index":21,"ref_count":2,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Multilingual Safety Alignment via Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-03T14:22:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MSD enables cross-lingual safety transfer in LLMs via self-distillation with Dual-Perspective Safety Weighting, improving safety in low-resource languages without target response data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19016","ref_index":75,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"AlignCultura: Towards Culturally Aligned Large Language Models?","primary_cat":"cs.CL","submitted_at":"2026-04-21T03:06:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Align-Cultura introduces the CULTURAX dataset and shows that culturally fine-tuned LLMs improve joint HHH scores by 4-6%, cut cultural failures by 18%, and gain 10-12% efficiency with minimal leakage.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}