{"total":25,"items":[{"citing_arxiv_id":"2605.23244","ref_index":45,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Convex Optimization for Alignment and Preference Learning on a Single GPU","primary_cat":"cs.LG","submitted_at":"2026-05-22T05:25:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"COALA applies convex optimization reformulations of neural networks to direct preference optimization, claiming single-GPU training with ~18% of DPO's TFLOPs and competitive performance on multiple datasets and models up to 8B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17694","ref_index":55,"ref_count":2,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Do LLM Agents Mirror Socio-Cognitive Effects in Power-Asymmetric Conversations?","primary_cat":"cs.CL","submitted_at":"2026-05-17T23:23:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLMs assigned high or low status personas in multi-turn dialogues exhibit socio-cognitive effects including language coordination, pronoun patterns, persuasion success, and compliance with unsafe requests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10564","ref_index":95,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"DeepSight: Long-Horizon World Modeling via Latent States Prediction for End-to-End Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-11T13:36:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DeepSight uses parallel latent feature prediction in BEV for long-horizon world modeling and adaptive text reasoning to reach state-of-the-art closed-loop performance on the Bench2drive benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09291","ref_index":135,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"dFlowGRPO: Rate-Aware Policy Optimization for Discrete Flow Models","primary_cat":"cs.LG","submitted_at":"2026-05-10T03:36:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"dFlowGRPO is a new rate-aware RL method for discrete flow models that outperforms prior GRPO approaches on image generation and matches continuous flow models while supporting broad probability paths.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09214","ref_index":39,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Fast Rates for Offline Contextual Bandits with Forward-KL Regularization under Single-Policy Concentrability","primary_cat":"cs.LG","submitted_at":"2026-05-09T23:17:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The paper establishes the first tilde O(epsilon^{-1}) upper bounds and matching lower bounds for forward-KL-regularized offline contextual bandits under single-policy concentrability in both tabular and general function approximation settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13875","ref_index":93,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Common-agency Games for Multi-Objective Test-Time Alignment","primary_cat":"cs.GT","submitted_at":"2026-05-08T06:56:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAGE uses common-agency games and an EPEC algorithm to compute equilibrium policies that balance multiple conflicting objectives for test-time LLM alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07331","ref_index":16,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Rethinking Importance Sampling in LLM Policy Optimization: A Cumulative Token Perspective","primary_cat":"cs.LG","submitted_at":"2026-05-08T06:35:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The cumulative token IS ratio gives unbiased prefix correction and lower variance than full-sequence ratios for token-level gradients in LLM policy optimization, enabling CTPO to outperform GRPO and GSPO baselines on mathematical reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05710","ref_index":104,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"On the Blessing of Pre-training in Weak-to-Strong Generalization","primary_cat":"cs.LG","submitted_at":"2026-05-07T05:55:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pre-training provides a geometric warm start in a single-index model that enables weak-to-strong generalization up to a supervisor-limited bound, with empirical phase-transition evidence in LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05365","ref_index":55,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"ZAYA1-8B Technical Report","primary_cat":"cs.AI","submitted_at":"2026-05-06T18:44:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ZAYA1-8B is a reasoning MoE model with 700M active parameters that matches larger models on math and coding benchmarks and reaches 91.9% on AIME'25 via Markovian RSA test-time compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02200","ref_index":43,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"ARGUS: Policy-Adaptive Ad Governance via Evolving Reinforcement with Adversarial Umpiring","primary_cat":"cs.CL","submitted_at":"2026-05-04T03:58:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ARGUS uses a Prosecutor-Defender-Umpire multi-agent setup plus RAG and chain-of-thought rewards to adapt ad policy enforcement to new regulations using minimal fresh labels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17396","ref_index":152,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Representation-Guided Parameter-Efficient LLM Unlearning","primary_cat":"cs.CL","submitted_at":"2026-04-19T11:59:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"REGLU guides LoRA-based unlearning via representation subspaces and orthogonal regularization to outperform prior methods on forget-retain trade-off in LLM benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17200","ref_index":17,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Calibrating Model-Based Evaluation Metrics for Summarization","primary_cat":"cs.CL","submitted_at":"2026-04-19T02:04:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A reference-free proxy scoring framework combined with GIRB calibration produces better-aligned evaluation metrics for summarization and outperforms baselines across seven datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17197","ref_index":12,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Learning to Control Summaries with Score Ranking","primary_cat":"cs.CL","submitted_at":"2026-04-19T01:58:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A score-ranking loss enables controllable summarization by aligning outputs to evaluation scores, matching SOTA performance with dimension-specific control on LLaMA, Qwen, and Mistral.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04066","ref_index":102,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Adapt to Thrive! Adaptive Power-Mean Policy Optimization for Improved LLM Reasoning","primary_cat":"cs.CL","submitted_at":"2026-04-11T07:34:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"APMPO boosts average Pass@1 scores on math reasoning benchmarks by 3 points over GRPO by using an adaptive power-mean policy objective and feedback-driven clipping bounds in RLVR training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04065","ref_index":117,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Free Energy-Driven Reinforcement Learning with Adaptive Advantage Shaping for Unsupervised Reasoning in LLMs","primary_cat":"cs.CL","submitted_at":"2026-04-11T07:26:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FREIA applies free energy principles and adaptive advantage shaping to unsupervised RL, outperforming baselines by 0.5-3.5 Pass@1 points on math reasoning with a 1.5B model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02912","ref_index":59,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Reasoning-Guided Grounding: Elevating Video Anomaly Detection through Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-07T20:15:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VANGUARD is a staged-training VLM framework that reports 94% ROC-AUC and 84% F1 on UCF-Crime while adding chain-of-thought reasoning and spatial grounding to video anomaly detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.21046","ref_index":229,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"A Survey of Self-Evolving Agents: What, When, How, and Where to Evolve on the Path to Artificial Super Intelligence","primary_cat":"cs.AI","submitted_at":"2025-07-28T17:59:05+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper delivers the first systematic review of self-evolving agents, structured around what components evolve, when adaptation occurs, and how it is implemented.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.16982","ref_index":65,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Muon is Scalable for LLM Training","primary_cat":"cs.LG","submitted_at":"2025-02-24T09:12:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Muon optimizer with weight decay and update scaling achieves ~2x efficiency over AdamW for large LLMs, shown via the Moonlight 3B/16B MoE model trained on 5.7T tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.03387","ref_index":80,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"LIMO: Less is More for Reasoning","primary_cat":"cs.CL","submitted_at":"2025-02-05T17:23:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LIMO achieves 63.3% on AIME24 and 95.6% on MATH500 via supervised fine-tuning on roughly 1% of the data used by prior models, supporting the claim that minimal strategic examples suffice when pre-training has already encoded domain knowledge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.02737","ref_index":72,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model","primary_cat":"cs.CL","submitted_at":"2025-02-04T21:43:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SmolLM2 is a 1.7B-parameter language model that outperforms Qwen2.5-1.5B and Llama3.2-1B after overtraining on 11 trillion tokens using custom FineMath, Stack-Edu, and SmolTalk datasets in a multi-stage pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.01456","ref_index":112,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Process Reinforcement through Implicit Rewards","primary_cat":"cs.LG","submitted_at":"2025-02-03T15:43:48+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PRIME enables online process reward model updates in LLM RL using implicit rewards from rollouts and outcome labels, yielding 15.1% average gains on reasoning benchmarks and surpassing a stronger instruct model with 10% of the data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.21187","ref_index":266,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Do NOT Think That Much for 2+3=? On the Overthinking of o1-Like LLMs","primary_cat":"cs.CL","submitted_at":"2024-12-30T18:55:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"o1-like models overthink easy tasks; self-training reduces compute use without accuracy loss on GSM8K, MATH500, GPQA, and AIME.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2409.12917","ref_index":92,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Training Language Models to Self-Correct via Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2024-09-19T17:16:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SCoRe uses multi-turn online RL with regularization on self-generated traces to improve LLM self-correction, achieving 15.6% and 9.1% gains on MATH and HumanEval for Gemini models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.08435","ref_index":86,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Automated Design of Agentic Systems","primary_cat":"cs.AI","submitted_at":"2024-08-15T21:59:23+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Meta Agent Search uses a meta-agent to iteratively program novel agentic systems in code, producing agents that outperform state-of-the-art hand-designed ones across coding, science, and math while transferring across domains and models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.00267","ref_index":124,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"RLAIF vs. RLHF: Scaling Reinforcement Learning from Human Feedback with AI Feedback","primary_cat":"cs.CL","submitted_at":"2023-09-01T05:53:33+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RLAIF matches RLHF on summarization and dialogue tasks, with a direct-RLAIF variant achieving superior results by using LLM rewards directly during training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}