{"total":13,"items":[{"citing_arxiv_id":"2607.01392","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Multi-Objective Exploration and Preference Optimization via Mutual Information","primary_cat":"cs.CL","submitted_at":"2026-07-01T18:50:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MI-EPO maximizes joint conditional mutual information among responses, feedback, and preference vectors, using probabilistic routing to improve alignment and controllability in multi-objective LLM optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10587","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Towards Diverse Scientific Hypothesis Search with Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-06-09T08:52:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A parallel-tempering evolutionary framework for LLM hypothesis search improves both quality and diversity of candidates in molecular, equation, and algorithm discovery under fixed validation budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23346","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Contrastive Distribution Matching for Amortized Sequential Monte Carlo in Discrete Diffusion","primary_cat":"cs.LG","submitted_at":"2026-05-22T08:06:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CDM amortizes SMC inference for reward-tilted discrete diffusion by training a parameterized twist function on contrastive samples with closed-form kernels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20408","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Spectral Souping: A Unified Framework for Online Preference Alignment","primary_cat":"cs.LG","submitted_at":"2026-05-19T19:04:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Spectral Souping learns offline specialized policies for fine-grained preferences and merges them online using a discovered universal spectral representation for efficient LLM alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14746","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Selective Safety Steering via Value-Filtered Decoding","primary_cat":"cs.LG","submitted_at":"2026-05-14T12:13:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Value-filtered decoding steers LLM outputs for safety at decoding time using a value criterion with an explicit bound on false interventions controlled by one threshold hyperparameter.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12288","ref_index":152,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TokenRatio: Principled Token-Level Preference Optimization via Ratio Matching","primary_cat":"cs.CL","submitted_at":"2026-05-12T15:44:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces TBPO, which derives a Bregman-divergence density-ratio matching objective for token-level preference optimization that generalizes DPO while preserving the induced optimal policy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10843","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Training-Free Cultural Alignment of Large Language Models via Persona Disagreement","primary_cat":"cs.CL","submitted_at":"2026-05-11T16:55:16+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DISCA converts within-country disagreement among World Values Survey personas into a bounded logit correction that reduces cultural misalignment by 10-24% on MultiTP for models 3.8B and larger across 20 countries, without any weight updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15306","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Generalization in LLM Problem Solving: The Case of the Shortest Path","primary_cat":"cs.AI","submitted_at":"2026-04-16T17:59:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs show strong spatial generalization to unseen maps in shortest-path tasks but fail length scaling due to recursive instability, with data coverage setting hard limits.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.06419","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Teach a Reward Model to Correct Itself: Reward Guided Adversarial Failure Discovery for Robust Reward Modeling","primary_cat":"cs.CL","submitted_at":"2025-07-08T21:56:33+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.19075","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Universal Reasoner: A Single, Composable Plug-and-Play Reasoner for Frozen LLMs","primary_cat":"cs.AI","submitted_at":"2025-05-25T10:19:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UniR is a composable reasoning module trained with verifiable rewards and added to frozen LLMs via logit summation, enabling modular composition and weak-to-strong generalization across tasks and model sizes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.16559","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Synergistic Benefits of Joint Molecule Generation and Property Prediction","primary_cat":"cs.LG","submitted_at":"2025-04-23T09:36:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Hyformer jointly models molecule generation and property prediction via alternating attention and joint pre-training, showing synergistic gains in conditional sampling, OOD prediction, and a drug design case for antimicrobial peptides.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.08812","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TRAM: Test-Time Risk Adaptation with Mixture of Agents","primary_cat":"cs.LG","submitted_at":"2024-08-16T15:47:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TRAM is a test-time mixture method that scores and composes risk-neutral source policies using reward and occupancy-based risk to achieve new reward-risk tradeoffs without parameter updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.07199","ref_index":221,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Agent Q: Advanced Reasoning and Learning for Autonomous AI Agents","primary_cat":"cs.AI","submitted_at":"2024-08-13T20:52:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Agent Q integrates MCTS-guided search, self-critique, and off-policy DPO to train LLM agents that outperform behavior cloning and reinforced fine-tuning baselines in WebShop and achieve up to 95.4% success in real-world booking scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}