{"total":97,"items":[{"citing_arxiv_id":"2605.13641","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Objective and Mixed-Reward Reinforcement Learning via Reward-Decorrelated Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-13T15:05:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RDPO applies magnitude-aware quantile normalization and Mahalanobis whitening to decorrelate heterogeneous rewards in multi-objective RL, improving instruction following and writing quality on LongCat-Flash post-training while staying competitive on reasoning and coding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13330","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FIND: Toward Multimodal Financial Reasoning and Question Answering for Indic Languages","primary_cat":"cs.CL","submitted_at":"2026-05-13T10:45:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FinVQA is a new multilingual benchmark for Indic financial VQA with three difficulty levels and four formats, paired with the FIND framework for faithful numerical reasoning via fine-tuning and constrained decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13230","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Teacher-Guided Policy Optimization for LLM Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-13T09:20:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TGPO improves on-policy LLM distillation by using teacher predictions conditioned on student rollouts to supply informative guidance when the two distributions diverge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13165","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"STOP: Structured On-Policy Pruning of Long-Form Reasoning in Low-Data Regimes","primary_cat":"cs.CL","submitted_at":"2026-05-13T08:28:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"STOP uses structured on-policy analysis to prune long reasoning traces to their earliest correct node, cutting token usage 19-42% with little accuracy loss on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13080","ref_index":71,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning to See What You Need: Gaze Attention for Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-13T06:54:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Gaze Attention groups visual embeddings into selectable regions and dynamically restricts attention to task-relevant ones, matching dense baselines with up to 90% fewer visual KV entries via added context 
tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11625","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Nice Fold or Hero Call: Learning Budget-Efficient Thinking for Adaptive Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-12T06:51:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BET reduces reasoning tokens by about 55% on average while improving performance across benchmarks by learning to short-solve easy queries, fold early on unsolvable ones, and preserve budget for hard solvable queries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11609","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Anti-Self-Distillation for Reasoning RL via Pointwise Mutual Information","primary_cat":"cs.LG","submitted_at":"2026-05-12T06:40:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Anti-Self-Distillation reverses self-distillation signals via PMI to fix overconfidence on structural tokens, matching GRPO baseline accuracy 2-10x faster with up to 11.5 point gains across 4B-30B models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11518","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AutoLLMResearch: Training Research Agents for Automating LLM Experiment Configuration -- Learning from Cheap, Optimizing Expensive","primary_cat":"cs.AI","submitted_at":"2026-05-12T04:42:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AutoLLMResearch trains agents via a multi-fidelity environment and MDP pipeline to extrapolate configuration principles from inexpensive to costly LLM experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11491","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Understanding and Preventing Entropy Collapse in RLVR with On-Policy Entropy Flow Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-12T04:08:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OPEFO prevents entropy collapse in RLVR by rescaling token updates according to their entropy change contributions, yielding more stable optimization and better results on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11467","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Drop the Act: Probe-Filtered RL for Faithful Chain-of-Thought Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-12T03:30:23+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ProFIL trains an activation probe on a frozen base model to zero advantages on theatrical post-commitment rollouts in GRPO, cutting theater 11-100%, raising faithful fractions, and shortening chains 4-19% without accuracy 
loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10158","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unsupervised Process Reward Models","primary_cat":"cs.LG","submitted_at":"2026-05-11T08:05:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Unsupervised PRMs derived from LLM probabilities achieve up to 15% better error detection than LLM judges and match supervised PRMs in verification and RL tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09931","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PruneTIR: Inference-Time Tool Call Pruning for Effective yet Efficient Tool-Integrated Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-11T03:28:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PruneTIR prunes erroneous tool-call trajectories during LLM inference via three trigger-based components to raise Pass@1 accuracy and efficiency while shortening context.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09806","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LEAD: Length-Efficient Adaptive and Dynamic Reasoning for Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-10T23:05:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LEAD uses online adaptive mechanisms including Potential-Scaled Instability and symmetric efficiency rewards based on correct rollouts to achieve higher accuracy-efficiency scores with substantially shorter reasoning outputs than base models on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09640","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Overcoming Catastrophic Forgetting in Visual Continual Learning with Reinforcement Fine-Tuning","primary_cat":"cs.CV","submitted_at":"2026-05-10T16:36:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RaPO reduces catastrophic forgetting in visual continual learning by shaping rewards around policy drift and stabilizing advantages with cross-task exponential moving averages during reinforcement fine-tuning of multimodal models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09271","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Shaping Schema via Language Representation as the Next Frontier for LLM Intelligence Expanding","primary_cat":"cs.AI","submitted_at":"2026-05-10T02:42:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Advanced language representations shape LLMs' schemas to improve knowledge activation and problem-solving.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Alec Helyar, Aleksander Madry, Alex Beutel, Alex Carney, et al. Openai o1 system card. 
arXiv preprint arXiv:2412.16720, 2024. [5] Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning.arXiv preprint arXiv:2501.12948, 2025. [6] Kimi Team, Angang Du, Bofei Gao, Bowei Xing, Changjiu Jiang, Cheng Chen, Cheng Li, Chenjun Xiao, Chenzhuang Du, Chonghua Liao, et al. Kimi k1. 5: Scaling reinforcement learning with llms.arXiv preprint arXiv:2501.12599, 2025. [7] Jason Wei, Yi Tay, Rishi Bommasani, Colin Raffel, Barret Zoph, Sebastian Borgeaud, Dani Yogatama, Maarten Bosma, Denny Zhou, Donald Metzler, et al."},{"citing_arxiv_id":"2605.09146","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Thinking: Imagining in 360$^\\circ$ for Humanoid Visual Search","primary_cat":"cs.CV","submitted_at":"2026-05-09T20:10:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Imagining in 360° decouples visual search into a single-step probabilistic semantic layout predictor and an actor, removing the need for multi-turn CoT reasoning and trajectory annotations while improving efficiency in 360° environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08905","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Forge: Quality-Aware Reinforcement Learning for NP-Hard Optimization in LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-09T11:57:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OPT-BENCH trains LLMs on NP-hard optimization via quality-aware RLVR, achieving 93.1% success rate and 46.6% quality ratio on Qwen2.5-7B while outperforming GPT-4o and transferring gains to other domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08862","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BubbleSpec: Turning Long-Tail Bubbles into Speculative Rollout Drafts for Synchronous Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-09T10:21:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BubbleSpec exploits long-tail bubbles in synchronous RL by using faster ranks' idle time to pre-generate rollout drafts for speculative decoding, reducing steps by 50% and raising throughput up to 1.8x while preserving exact synchrony.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08721","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Breaking the Impasse: Dual-Scale Evolutionary Policy Training for Social Language Agents","primary_cat":"cs.CL","submitted_at":"2026-05-09T06:10:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DEPT detects training impasses in social language agents via dual-scale divergence and entropy, then uses asymmetric reshaping to restore exploration gradients and prevent policy 
homogenization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08678","ref_index":95,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MLS-Bench: A Holistic and Rigorous Assessment of AI Systems on Building Better AI","primary_cat":"cs.LG","submitted_at":"2026-05-09T04:29:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MLS-Bench shows that current AI agents fall short of reliably inventing generalizable ML methods, with engineering tuning easier than genuine invention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08665","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hint Tuning: Less Data Makes Better Reasoners","primary_cat":"cs.CL","submitted_at":"2026-05-09T04:07:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Hint Tuning uses an instruct model as a difficulty probe to create 1K multi-level hint examples that train reasoning models to calibrate chain-of-thought length, cutting tokens by 31.5% on average across 4B-32B models without accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08639","ref_index":67,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReLibra: Routing-Replay-Guided Load Balancing for MoE Training in Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-09T03:18:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"ReLibra uses pre-known token-to-expert routing from RL rollouts to perform inter-batch expert reordering and intra-batch replication, delivering up to 1.6x higher throughput than Megatron-LM and 1.2x over oracle-equipped EPLB while staying within 6-10% of an ideal balanced baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08401","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AIPO: : Learning to Reason from Active Interaction","primary_cat":"cs.CL","submitted_at":"2026-05-08T19:06:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AIPO trains LLMs to expand their reasoning capability boundary via active multi-agent interaction with Verify, Knowledge, and Reasoning agents during RLVR, using importance sampling and clipping to handle feedback, then drops the agents at inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07865","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"KL for a KL: On-Policy Distillation with Control Variate Baseline","primary_cat":"cs.LG","submitted_at":"2026-05-08T15:24:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"vOPD stabilizes on-policy distillation gradients by subtracting a closed-form per-token negative reverse KL baseline as a detached control variate, preserving unbiasedness while lowering variance and matching expensive full-vocabulary 
methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08283","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HTPO: Towards Exploration-Exploitation Balanced Policy Optimization via Hierarchical Token-level Objective Control","primary_cat":"cs.LG","submitted_at":"2026-05-08T07:38:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HTPO introduces hierarchical token-level objective control in RLVR to balance exploration and exploitation by grouping tokens according to difficulty, correctness, and entropy, yielding up to 8.6% gains on AIME benchmarks over DAPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07331","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Importance Sampling in LLM Policy Optimization: A Cumulative Token Perspective","primary_cat":"cs.LG","submitted_at":"2026-05-08T06:35:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The cumulative token IS ratio gives unbiased prefix correction and lower variance than full-sequence ratios for token-level gradients in LLM policy optimization, enabling CTPO to outperform GRPO and GSPO baselines on mathematical reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07316","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Implicit Compression Regularization: Concise Reasoning via Internal Shorter Distributions in RL Post-Training","primary_cat":"cs.AI","submitted_at":"2026-05-08T06:25:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ICR creates a virtual shorter distribution from shortest correct on-policy responses to regularize RL post-training toward concise yet accurate reasoning, improving the accuracy-length Pareto frontier on math and knowledge benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07137","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Adaptive Negative Reinforcement for LLM Reasoning:Dynamically Balancing Correction and Diversity in RLVR","primary_cat":"cs.LG","submitted_at":"2026-05-08T02:13:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Adaptive scheduling of penalties over training time plus confidence-based weighting of mistakes improves LLM performance on math reasoning benchmarks compared to fixed-penalty negative reinforcement.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06554","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Long Context Pre-Training with Lighthouse Attention","primary_cat":"cs.CL","submitted_at":"2026-05-07T16:49:28+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Lighthouse Attention enables faster long-context pre-training via gradient-free symmetrical hierarchical compression of QKV while preserving causality, followed by a 
short full-attention recovery that yields lower loss than standard full-attention training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06230","ref_index":58,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Safactory: A Scalable Agentic Infrastructure for Training Trustworthy Autonomous Intelligence","primary_cat":"cs.AI","submitted_at":"2026-05-07T13:21:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Safactory integrates three platforms for simulation, data management, and agent evolution to create a unified pipeline for training trustworthy autonomous AI.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06165","ref_index":68,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Post Reasoning: Improving the Performance of Non-Thinking Models at No Cost","primary_cat":"cs.AI","submitted_at":"2026-05-07T12:51:49+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Post-Reasoning boosts LLM accuracy by reversing the usual answer-after-reasoning order, delivering mean relative gains of 17.37% across 117 model-benchmark pairs with zero extra cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06111","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Schedule-and-Calibrate: Utility-Guided Multi-Task Reinforcement Learning for Code LLMs","primary_cat":"cs.SE","submitted_at":"2026-05-07T12:24:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ASTOR improves a single code LLM across four tasks by 9.0-9.5% over the best specialist and 7.5-12.8% over prior multi-task RL baselines via utility-driven data scheduling and adaptive KL regularization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05049","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Piper: Efficient Large-Scale MoE Training via Resource Modeling and Pipelined Hybrid Parallelism","primary_cat":"cs.DC","submitted_at":"2026-05-06T15:47:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Piper introduces resource modeling and pipelined hybrid parallelism for MoE training, delivering 2-3.5X higher MFU than prior frameworks and 1.2-9X better all-to-all bandwidth.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02469","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reference-Sampled Boltzmann Projection for KL-Regularized RLVR: Target-Matched Weighted SFT, Finite One-Shot Gaps, and Policy Mirror Descent","primary_cat":"cs.LG","submitted_at":"2026-05-04T11:10:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Reference-sampled weighted SFT with prompt-normalized Boltzmann weights induces the same policy as fixed-reference KL-regularized RLVR, with BOLT as the estimator and a finite one-shot error decomposition separating coverage, variance, and other 
terms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02178","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"T$^2$PO: Uncertainty-Guided Exploration Control for Stable Multi-Turn Agentic Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2026-05-04T03:15:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"T²PO improves stability and performance in multi-turn agentic RL by using uncertainty dynamics at token and turn levels to guide exploration and avoid wasted rollouts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00365","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Uniform-Correct Policy Optimization: Breaking RLVR's Indifference to Diversity","primary_cat":"cs.LG","submitted_at":"2026-05-01T03:02:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UCPO modifies GRPO with a uniformity penalty over correct solutions to prevent diversity collapse in RLVR, yielding up to 10% higher Pass@64 on AIME24 and 45% more equation-level diversity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00072","ref_index":78,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"XekRung Technical Report","primary_cat":"cs.CR","submitted_at":"2026-04-30T11:50:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"XekRung achieves state-of-the-art performance on cybersecurity benchmarks among same-scale models via tailored data synthesis and multi-stage training while retaining strong general capabilities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27393","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MiniCPM-o 4.5: Towards Real-Time Full-Duplex Omni-Modal Interaction","primary_cat":"cs.CL","submitted_at":"2026-04-30T04:05:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MiniCPM-o 4.5 uses the Omni-Flow streaming framework to deliver real-time full-duplex omni-modal interaction with proactive behavior in a 9B model that approaches Gemini 2.5 Flash performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27039","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Length Value Model: Scalable Value Pretraining for Token-Level Length Modeling","primary_cat":"cs.CL","submitted_at":"2026-04-29T17:09:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LenVM models token-level remaining generation length as a bounded discounted value function derived from constant negative per-token rewards, providing a scalable proxy for generation horizon.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26256","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DORA: A Scalable Asynchronous Reinforcement Learning 
System for Language Model Training","primary_cat":"cs.LG","submitted_at":"2026-04-29T03:25:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DORA's multi-version streaming rollout enables 2-3x higher throughput in asynchronous RL for LLMs while preserving convergence by maintaining policy consistency, data integrity, and bounded staleness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25098","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Doing More With Less: Revisiting the Effectiveness of LLM Pruning for Test-Time Scaling","primary_cat":"cs.AI","submitted_at":"2026-04-28T01:04:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Unstructured pruning augments test-time scaling reasoning performance in LLMs and can outperform the unpruned model on benchmarks, contrary to expectations from structured pruning studies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25011","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Does Reinforcement Learning Generalize? A Feature-Level Mechanistic Study of Post-Training in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-27T21:22:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RL generalizes better than SFT by preserving and slowly evolving a compact set of task-agnostic features from the base model rather than introducing many specialized ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24953","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ViPO: Visual Preference Optimization at Scale","primary_cat":"cs.CV","submitted_at":"2026-04-27T19:49:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Poly-DPO improves robustness to noisy preference data in visual models, and the new ViPO dataset enables superior performance, with the method reducing to standard DPO on high-quality data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24339","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"See Further, Think Deeper: Advancing VLM's Reasoning Ability with Low-level Visual Cues and Reflection","primary_cat":"cs.CV","submitted_at":"2026-04-27T11:31:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ForeSight lets VLMs use low-level visual cues and mask-based visual feedback within an RL loop to reason more accurately, with the 7B model beating same-scale peers and some closed-source SOTA on a new benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24003","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Stabilizing Efficient Reasoning with Step-Level Advantage 
Selection","primary_cat":"cs.CL","submitted_at":"2026-04-27T03:34:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SAS stabilizes efficient LLM reasoning by step-level advantage masking, improving Pass@1 accuracy by 0.86 points and cutting reasoning length by 16.3% versus length-aware baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23747","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SFT-then-RL Outperforms Mixed-Policy Methods for LLM Reasoning","primary_cat":"cs.LG","submitted_at":"2026-04-26T14:53:48+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Correcting DeepSpeed optimizer and OpenRLHF loss bugs reveals SFT-then-RL outperforms mixed-policy methods by 3.8-22.2 points on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21057","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TRACES: Tagging Reasoning Steps for Adaptive Cost-Efficient Early-Stopping","primary_cat":"cs.CL","submitted_at":"2026-04-22T20:00:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TRACES tags reasoning steps to enable adaptive early stopping, cutting token use by 20-50% on MATH500, GSM8K, AIME, MMLU and GPQA with comparable accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20705","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SSL-R1: Self-Supervised Visual Reinforcement Post-Training for Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-22T15:46:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SSL-R1 reformulates visual SSL tasks into verifiable puzzles to supply rewards for RL post-training of MLLMs, yielding gains on multimodal benchmarks without external supervision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20398","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"WebGen-R1: Incentivizing Large Language Models to Generate Functional and Aesthetic Websites with Reinforcement Learning","primary_cat":"cs.CL","submitted_at":"2026-04-22T10:04:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WebGen-R1 uses end-to-end RL with scaffold-driven generation and cascaded rewards for structure, function, and aesthetics to transform a 7B model into a generator of deployable multi-page websites that rivals much larger models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20146","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SAKE: Self-aware Knowledge Exploitation-Exploration for Grounded Multimodal Named Entity Recognition","primary_cat":"cs.IR","submitted_at":"2026-04-22T03:17:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SAKE is an agentic 
framework for GMNER that uses uncertainty-based self-awareness and reinforcement learning to balance internal knowledge exploitation with adaptive external exploration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}