{"total":23,"items":[{"citing_arxiv_id":"2605.12058","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"H\\\"older Policy Optimisation","primary_cat":"cs.LG","submitted_at":"2026-05-12T12:45:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HölderPO unifies token aggregation in GRPO via the Hölder mean with dynamic p annealing, reporting 54.9% average math-benchmark accuracy and 93.8% ALFWorld success.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11922","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"StepCodeReasoner: Aligning Code Reasoning with Stepwise Execution Traces via Reinforcement Learning","primary_cat":"cs.SE","submitted_at":"2026-05-12T10:36:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"StepCodeReasoner aligns code reasoning with verifiable stepwise execution traces via print anchors and bi-level GRPO reinforcement learning, reaching SOTA results on CRUXEval (91.1%) and LiveCodeBench (86.5%) for a 7B model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08862","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"BubbleSpec: Turning Long-Tail Bubbles into Speculative Rollout Drafts for Synchronous Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-09T10:21:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BubbleSpec exploits long-tail bubbles in synchronous RL by using faster ranks' idle time to pre-generate rollout drafts for speculative decoding, reducing steps by 50% and raising throughput up to 1.8x while preserving exact synchrony.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08666","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The Cancellation Hypothesis in Critic-Free RL: From Outcome Rewards to Token Credits","primary_cat":"cs.LG","submitted_at":"2026-05-09T04:07:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The cancellation hypothesis shows how rollout-level rewards produce token-level credit assignment in critic-free RL through cancellation of opposing signals on shared tokens, with empirical support and batching interventions that enhance performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08441","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DUET: Optimize Token-Budget Allocation for Reinforcement Learning with Verifiable Rewards","primary_cat":"cs.LG","submitted_at":"2026-05-08T20:03:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DUET improves RLVR by allocating tokens across both prompt selection and rollout length, outperforming full-budget baselines even when using only half the tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08283","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HTPO: Towards Exploration-Exploitation Balanced Policy Optimization via Hierarchical Token-level Objective Control","primary_cat":"cs.LG","submitted_at":"2026-05-08T07:38:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HTPO introduces hierarchical token-level objective control in RLVR to balance exploration and exploitation by grouping tokens according to difficulty, correctness, and entropy, yielding up to 8.6% gains on AIME benchmarks over DAPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07353","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Confidence-Aware Alignment Makes Reasoning LLMs More Reliable","primary_cat":"cs.AI","submitted_at":"2026-05-08T07:08:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CASPO trains LLMs via iterative direct preference optimization so that token-level confidence tracks step-wise correctness, then applies Confidence-aware Thought pruning at inference to improve both reliability and speed on reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07137","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Adaptive Negative Reinforcement for LLM Reasoning:Dynamically Balancing Correction and Diversity in RLVR","primary_cat":"cs.LG","submitted_at":"2026-05-08T02:13:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Adaptive scheduling of penalties over training time plus confidence-based weighting of mistakes improves LLM performance on math reasoning benchmarks compared to fixed-penalty negative reinforcement.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07114","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Where to Spend Rollouts: Hit-Utility Optimal Rollout Allocation for Group-Based RLVR","primary_cat":"cs.LG","submitted_at":"2026-05-08T01:42:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HORA adaptively allocates rollouts using hit utility to improve Pass@K over compute-matched GRPO on math reasoning benchmarks while preserving Pass@1.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06326","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Teaching Thinking Models to Reason with Tools: A Full-Pipeline Recipe for Tool-Integrated Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-07T14:23:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A training recipe for tool-integrated reasoning models achieves state-of-the-art open-source results on math benchmarks such as 96.7% and 99.2% on AIME 2025 at 4B and 30B scales by balancing tool-use trajectories and optimizing for pass@k during SFT before stable RLVR.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06241","ref_index":3,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rethinking RL for LLM Reasoning: It's Sparse Policy Selection, Not Capability Learning","primary_cat":"cs.CL","submitted_at":"2026-05-07T13:25:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RL for LLM reasoning acts as sparse policy selection at high-entropy tokens already present in the base model, enabling ReasonMaxxer—an efficient contrastive method that recovers most RL gains at three orders of magnitude lower cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27308","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"BoostLoRA: Growing Effective Rank by Boosting Adapters","primary_cat":"cs.LG","submitted_at":"2026-04-30T01:43:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BoostLoRA grows effective adapter rank linearly via iterative boosting on hard examples with orthogonal low-rank updates, outperforming both single-shot ultra-low-rank adapters and full fine-tuning on math and code tasks with zero added inference overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22597","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rethinking Math Reasoning Evaluation: A Robust LLM-as-a-Judge Framework Beyond Symbolic Rigidity","primary_cat":"cs.AI","submitted_at":"2026-04-24T14:25:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"An LLM-as-a-judge evaluation framework for math reasoning outperforms symbolic methods by accurately assessing diverse answer representations and formats.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20398","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"WebGen-R1: Incentivizing Large Language Models to Generate Functional and Aesthetic Websites with Reinforcement Learning","primary_cat":"cs.CL","submitted_at":"2026-04-22T10:04:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WebGen-R1 uses end-to-end RL with scaffold-driven generation and cascaded rewards for structure, function, and aesthetics to transform a 7B model into a generator of deployable multi-page websites that rivals much larger models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18493","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Too Correct to Learn: Reinforcement Learning on Saturated Reasoning Data","primary_cat":"cs.LG","submitted_at":"2026-04-20T16:43:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A parameter-free sampling strategy called CUTS combined with Mixed-CUTS training prevents mode collapse in RL for saturated LLM reasoning tasks and raises AIME25 Pass@1 accuracy by up to 15.1% over standard GRPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17928","ref_index":63,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HEALing Entropy Collapse: Enhancing Exploration in Few-Shot RLVR via Hybrid-Domain Entropy Dynamics Alignment","primary_cat":"cs.LG","submitted_at":"2026-04-20T08:09:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HEAL mitigates entropy collapse in few-shot RLVR by selectively adding general-domain data and aligning trajectory-level entropy dynamics, matching full-shot performance with 32 target samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14142","ref_index":70,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From $P(y|x)$ to $P(y)$: Investigating Reinforcement Learning in Pre-train Space","primary_cat":"cs.LG","submitted_at":"2026-04-15T17:59:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PreRL applies reward-driven updates to P(y) in pre-train space, uses Negative Sample Reinforcement to prune bad reasoning paths and boost reflection, and combines with standard RL in Dual Space RL to outperform baselines on reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11446","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Low-rank Optimization Trajectories Modeling for LLM RLVR Acceleration","primary_cat":"cs.LG","submitted_at":"2026-04-13T13:28:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NExt accelerates RLVR training for LLMs by nonlinearly extrapolating low-rank parameter trajectories extracted from LoRA runs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10219","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Cognitive Pivot Points and Visual Anchoring: Unveiling and Rectifying Hallucinations in Multimodal Reasoning Models","primary_cat":"cs.AI","submitted_at":"2026-04-11T13:59:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Multimodal reasoning models hallucinate at high-entropy cognitive bifurcation points due to loss of visual semantic anchoring, and the V-STAR training paradigm with HVAR rewards and FRM reflection mitigates this by reinforcing visual attention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08527","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Demystifying OPD: Length Inflation and Stabilization Strategies for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-09T17:58:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPD for LLMs suffers length inflation and repetition collapse; StableOPD uses reference divergence and rollout mixing to prevent it and improve math reasoning performance by 7.2% on average.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08477","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SUPERNOVA: Eliciting General Reasoning in LLMs with Reinforcement Learning on Natural Instructions","primary_cat":"cs.AI","submitted_at":"2026-04-09T17:16:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SUPERNOVA adapts instruction-tuning data for RLVR and achieves up to 52.8% relative gains on general reasoning benchmarks like BBEH through targeted task selection and mixing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08281","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When to Trust Tools? Adaptive Tool Trust Calibration For Tool-Integrated Math Reasoning","primary_cat":"cs.CL","submitted_at":"2026-04-09T14:14:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ATTC reduces 'Tool Ignored' errors in tool-integrated reasoning by adaptively trusting tool results according to generated code confidence, yielding 4.1-7.5% gains across models and datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.13585","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention","primary_cat":"cs.CL","submitted_at":"2025-06-16T15:08:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MiniMax-M1 is a 456B parameter hybrid-attention MoE model trained with CISPO RL that achieves performance comparable or superior to DeepSeek-R1 and Qwen3-235B on reasoning and software engineering tasks while training in three weeks on 512 GPUs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}