{"total":19,"items":[{"citing_arxiv_id":"2606.30248","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Your Data Manifold is Secretly a Reward Model: Shell-LCC for Text-to-Video Generation","primary_cat":"cs.CV","submitted_at":"2026-06-29T12:57:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Shell-LCC models the high-quality data manifold as an isotropic shell to derive cost-free reward signals that improve realism and high-frequency details in text-to-video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21883","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Token-weighted Direct Preference Optimization with Attention","primary_cat":"cs.CL","submitted_at":"2026-05-21T01:43:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AttentionPO weights tokens in DPO using LLM attention as a pairwise judge, yielding better results on AlpacaEval, MT-Bench, and ArenaHard than prior preference optimization methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20834","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Conditional Equivalence of DPO and RLHF: Implicit Assumption, Failure Modes, and Provable Alignment","primary_cat":"cs.AI","submitted_at":"2026-05-20T07:26:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DPO-RLHF equivalence holds only conditionally on the optimal policy preferring human-preferred responses; otherwise DPO optimizes relative advantage and can prefer worse outputs, addressed by introducing CPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12545","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CROP: Expert-Aligned Image Cropping via Compositional Reasoning and Optimizing Preference","primary_cat":"cs.CV","submitted_at":"2026-05-09T10:21:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CROP uses compositional reasoning and expert preference alignment in VLMs to produce aesthetic crops that match human experts more closely than previous methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04477","ref_index":94,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Data-dependent Exploration for Online Reinforcement Learning from Human Feedback","primary_cat":"cs.LG","submitted_at":"2026-05-06T03:56:45+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02439","ref_index":24,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Anomaly-Preference Image Generation","primary_cat":"cs.CV","submitted_at":"2026-05-04T10:37:09+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27733","ref_index":63,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mind the Gap: Structure-Aware Consistency in Preference Learning","primary_cat":"cs.LG","submitted_at":"2026-04-30T11:24:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Standard DPO surrogates are inconsistent for equicontinuous neural nets; SA-DPO provides structure-aware H-consistency bounds by adapting margins to semantic distance and shows heavy-tailed losses yield superior guarantees for capacity-bounded models via the Margin-Capacity Profile.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17396","ref_index":209,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Representation-Guided Parameter-Efficient LLM Unlearning","primary_cat":"cs.CL","submitted_at":"2026-04-19T11:59:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"REGLU guides LoRA-based unlearning via representation subspaces and orthogonal regularization to outperform prior methods on forget-retain trade-off in LLM benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.17881","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"POPI: Personalizing LLMs via Optimized Natural Language Preference Inference","primary_cat":"cs.CL","submitted_at":"2025-10-17T23:07:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"POPI distills user preferences into reusable natural-language summaries via a shared inference model and conditions a generator on them, trained jointly with RL to improve personalization quality while cutting context length by up to 10x on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.20265","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Failure Modes of Maximum Entropy RLHF","primary_cat":"cs.LG","submitted_at":"2025-09-24T15:52:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Derives SimPO from MaxEnt RL and reports that MaxEnt RL in online RLHF exhibits frequent overoptimization and unstable KL dynamics across scales, unlike stable KL-constrained baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.02850","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLM Hypnosis: Exploiting User Feedback for Unauthorized Knowledge Injection to All Users","primary_cat":"cs.CL","submitted_at":"2025-07-03T17:55:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A single attacker can use strategic upvoting and downvoting on language model outputs to inject facts, security flaws, or fake news that persist in the model for all users after preference tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.19134","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Incentivizing High-Quality Human Annotations with Golden Questions","primary_cat":"cs.GT","submitted_at":"2025-05-25T13:11:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The paper derives a Θ(1/√(n log n)) hypothesis testing rate under strategic annotator behavior and shows that high-certainty, format-similar golden questions better reveal annotation quality than standard checks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.12501","ref_index":193,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reinforcement Learning from Human Feedback","primary_cat":"cs.LG","submitted_at":"2025-04-16T21:36:46+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.06387","ref_index":91,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"How Humans Help LLMs: Assessing and Incentivizing Human Preference Annotators","primary_cat":"cs.LG","submitted_at":"2025-02-10T12:15:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Develops self-consistency monitoring for preference annotators and derives sample-complexity bounds showing linear contracts achieve near-ideal performance faster than binary ones under continuous actions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.02125","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Preference Goal Tuning: Post-Training as Latent Control for Frozen Policies","primary_cat":"cs.AI","submitted_at":"2024-12-03T03:27:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PGT optimizes latent goal embeddings for frozen policies via trajectory-level preference objectives, reporting 72-81.6% relative gains on 17 Minecraft tasks and 13.4% better OOD performance than fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.07199","ref_index":210,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Agent Q: Advanced Reasoning and Learning for Autonomous AI Agents","primary_cat":"cs.AI","submitted_at":"2024-08-13T20:52:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Agent Q integrates MCTS-guided search, self-critique, and off-policy DPO to train LLM agents that outperform behavior cloning and reinforced fine-tuning baselines in WebShop and achieve up to 95.4% success in real-world booking scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.00724","ref_index":237,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Inference Scaling Laws: An Empirical Analysis of Compute-Optimal Inference for Problem-Solving with Language Models","primary_cat":"cs.AI","submitted_at":"2024-08-01T17:16:04+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical analysis shows scaling inference compute via strategies like tree search can be more efficient than scaling model parameters, with 7B models plus novel search outperforming 34B models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.01306","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"KTO: Model Alignment as Prospect Theoretic Optimization","primary_cat":"cs.LG","submitted_at":"2024-02-02T10:53:36+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"KTO aligns LLMs by directly maximizing prospect-theoretic utility on binary signals and matches or exceeds preference-based methods like DPO from 1B to 30B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.10020","ref_index":124,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Self-Rewarding Language Models","primary_cat":"cs.CL","submitted_at":"2024-01-18T14:43:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Iterative self-rewarding via LLM-as-Judge in DPO training on Llama 2 70B improves instruction following and self-evaluation, outperforming GPT-4 on AlpacaEval 2.0.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}