{"work":{"id":"c5006563-f3ec-438a-9e35-b7b484f34828","openalex_id":null,"doi":null,"arxiv_id":"2402.03300","raw_key":null,"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","authors":null,"authors_text":"Shao, Z","year":2024,"venue":"cs.CL","abstract":"Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH. The mathematical reasoning capability of DeepSeekMath is attributed to two key factors: First, we harness the significant potential of publicly available web data through a meticulously engineered data selection pipeline. Second, we introduce Group Relative Policy Optimization (GRPO), a variant of Proximal Policy Optimization (PPO), that enhances mathematical reasoning abilities while concurrently optimizing the memory usage of PPO.","external_url":"https://arxiv.org/abs/2402.03300","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-29T17:33:45.681471+00:00","pith_arxiv_id":"2402.03300","created_at":"2026-05-08T17:58:52.204454+00:00","updated_at":"2026-06-29T17:33:45.681471+00:00","title_quality_ok":true,"display_title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","render_title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models"},"hub":{"state":{"work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","tier":"mega_hub","tier_reason":"1,000+ Pith inbound or 100,000+ external citations","pith_inbound_count":1384,"external_cited_by_count":null,"distinct_field_count":34,"first_pith_cited_at":"2024-03-20T08:08:54+00:00","last_pith_cited_at":"2026-06-26T11:30:42+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"needed","recognition_status":"needed","updated_at":"2026-06-29T17:39:04.605304+00:00","tier_text":"mega_hub"},"tier":"mega_hub","role_counts":[{"context_role":"background","n":187},{"context_role":"method","n":124},{"context_role":"baseline","n":14},{"context_role":"other","n":5},{"context_role":"dataset","n":4}],"polarity_counts":[{"context_polarity":"background","n":175},{"context_polarity":"use_method","n":118},{"context_polarity":"unclear","n":22},{"context_polarity":"baseline","n":14},{"context_polarity":"use_dataset","n":4},{"context_polarity":"extend","n":1}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","claims":[{"claim_text":"Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieve","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:53:33.380609+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"5ddde4d2-4fcb-41cd-99da-ee5621a21f1c","orcid":null,"display_name":"Shao"}]},"error":null,"updated_at":"2026-05-13T17:24:04.759489+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T17:53:33.372741+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":255},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":241},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":224},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":174},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":107},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":91},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":87},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":82},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":76},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":74},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":72},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":72},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":72},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":66},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":61},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":61},{"title":"Understanding R1-Zero-Like Training: A Critical Perspective","work_id":"ec354f3b-9484-4a0c-94c8-92d4d0260835","shared_citers":61},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":53},{"title":"HybridFlow: A Flexible and Efficient RLHF Framework","work_id":"7eb9c9f4-b322-4bba-8011-09ff8d6ad801","shared_citers":49},{"title":"Kimi k1.5: Scaling Reinforcement Learning with LLMs","work_id":"bff96ab1-bd6a-4585-be23-74fdb51969c7","shared_citers":46},{"title":"Does Reinforcement Learning Really Incentivize Reasoning Capacity in LLMs Beyond the Base Model?","work_id":"d854765a-e664-41c0-8655-21c4bf2e0cc4","shared_citers":41},{"title":"Search-R1: Training LLMs to Reason and Leverage Search Engines with Reinforcement Learning","work_id":"0e0b7549-2bc4-4574-aa7f-588ffa16eaae","shared_citers":39},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":37},{"title":"Tulu 3: Pushing Frontiers in Open Language Model Post-Training","work_id":"28c9dbea-056a-48c2-8000-85f809827e45","shared_citers":37}],"time_series":[{"n":5,"year":2024},{"n":27,"year":2025},{"n":595,"year":2026}]},"error":null,"updated_at":"2026-05-13T17:25:54.586665+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T17:53:32.456301+00:00"},"reader_index":{"job_type":"reader_index","status":"succeeded","result":{"note":"annotated reader requires full-text/OA fetch; shell is wired for mega hubs","status":"reader queued"},"error":null,"updated_at":"2026-05-19T10:41:22.965109+00:00"},"recognition_alignment":{"job_type":"recognition_alignment","status":"succeeded","result":{"modules":["IndisputableMonolith.Foundation.OntologyPredicates","IndisputableMonolith.Cognition.AnimalZComplexityBound","IndisputableMonolith.Cognition.AnalogicalReasoningFromJCost","IndisputableMonolith.Linguistics.PhonemeInventoryBandFromRS","IndisputableMonolith.StandardModel.StrongCP","IndisputableMonolith.Foundation.SimplicialLedger.LorentzEmergence","IndisputableMonolith.Patterns.GrayCodeAxioms","IndisputableMonolith.Linguistics.LanguageAcquisitionFromJCost"],"query_chars":1136},"error":null,"updated_at":"2026-05-19T10:41:40.417758+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","claims":[{"claim_text":"Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieve","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:53:33.377395+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","claims":[{"claim_text":"Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieve","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:25:52.710350+00:00"}},"summary":{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","claims":[{"claim_text":"Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieve","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":255},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":241},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":224},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":174},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":107},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":91},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":87},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":82},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":76},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":74},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":72},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":72},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":72},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":66},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":61},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":61},{"title":"Understanding R1-Zero-Like Training: A Critical Perspective","work_id":"ec354f3b-9484-4a0c-94c8-92d4d0260835","shared_citers":61},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":53},{"title":"HybridFlow: A Flexible and Efficient RLHF Framework","work_id":"7eb9c9f4-b322-4bba-8011-09ff8d6ad801","shared_citers":49},{"title":"Kimi k1.5: Scaling Reinforcement Learning with LLMs","work_id":"bff96ab1-bd6a-4585-be23-74fdb51969c7","shared_citers":46},{"title":"Does Reinforcement Learning Really Incentivize Reasoning Capacity in LLMs Beyond the Base Model?","work_id":"d854765a-e664-41c0-8655-21c4bf2e0cc4","shared_citers":41},{"title":"Search-R1: Training LLMs to Reason and Leverage Search Engines with Reinforcement Learning","work_id":"0e0b7549-2bc4-4574-aa7f-588ffa16eaae","shared_citers":39},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":37},{"title":"Tulu 3: Pushing Frontiers in Open Language Model Post-Training","work_id":"28c9dbea-056a-48c2-8000-85f809827e45","shared_citers":37}],"time_series":[{"n":5,"year":2024},{"n":27,"year":2025},{"n":595,"year":2026}]},"authors":[{"id":"5ddde4d2-4fcb-41cd-99da-ee5621a21f1c","orcid":null,"display_name":"Shao","source":"manual","import_confidence":0.72}]},"citers":{"total":1384,"items":[{"citing_arxiv_id":"2606.27981","ref_index":228,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ToxiREX: A Dataset on Toxic REasoning in ConteXt","primary_cat":"cs.CL","submitted_at":"2026-06-26T11:30:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ToxiREX is a new dataset of 128k Reddit comments in six languages with hierarchical annotations for implicit toxicity in conversational context based on an existing reasoning schema.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27739","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Weakest Link Tells It All: Outcome-Supervised Process Reward Modeling via Learnable Credit Assignment","primary_cat":"cs.LG","submitted_at":"2026-06-26T05:38:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LCA frames outcome-supervised PRM training as MIL, introduces SWS pooling for dependent steps, proves Bayes consistency under mild assumptions, and reports consistent gains over prior outcome-supervised baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27632","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Yuvion LLM: An Adversarially-Aware Large Language Model for Content And AI Safety","primary_cat":"cs.CL","submitted_at":"2026-06-26T01:12:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Yuvion LLM applies adversarially aware training and introduces the YLRE benchmark set, claiming superior safety robustness over larger models on multiple tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22402","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforcement learning to improve large language model-based automated code compliance systems","primary_cat":"cs.SE","submitted_at":"2026-06-21T09:17:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"P4IR applies supervised fine-tuning followed by GRPO reinforcement learning to reduce tree edit distance by up to 23.8% and Levenshtein distance by up to 38.6% versus SFT baselines while outperforming several frontier LLMs on code structure and semantics for automated building code compliance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21619","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Alignment Problem in Constrained Code Generation","primary_cat":"cs.SE","submitted_at":"2026-06-19T17:22:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Incomplete constrainers in constrained decoding push LLMs into low-probability program regions, making unconstrained decoding outperform constrained decoding on functional correctness across seven models and three benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17890","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Rollout Editing for Reducing Overthinking in RL-Trained Reasoning Models","primary_cat":"cs.CL","submitted_at":"2026-06-16T13:10:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dynamic Rollout Editing reduces overthinking in RL-trained LLMs by editing post-answer continuations in successful rollouts and preferring the edited versions within GRPO groups.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17871","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StepGuard: Guarding Web Navigation via Single-Step Calibration","primary_cat":"cs.AI","submitted_at":"2026-06-16T12:42:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"StepGuard framework with DDPO and CANR claims SOTA navigation and answer accuracy on web benchmarks by switching policies and triggering reflection on low-confidence steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.16276","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SpecAlign: Efficient Specification-Grounded Alignment of Large Language Models via Synthetic Data","primary_cat":"cs.AI","submitted_at":"2026-06-15T06:30:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SpecAlign synthesizes boundary-aware preference pairs directly from structured model specifications to train LLMs for improved rule compliance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13680","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning to Reason by Analogy via Retrieval-Augmented Reinforcement Fine-Tuning","primary_cat":"cs.CL","submitted_at":"2026-06-11T17:59:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RA-RFT trains a retriever to rank contexts by expected reasoning benefit and uses the retrieved analogies inside reinforcement fine-tuning, yielding 7.1 and 2.8 point gains on AIME 2025 over GRPO for two Qwen3 models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13657","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dense Supervision, Sparse Updates: On the Sparsity and Geometry of On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-11T17:54:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"On-policy distillation produces coordinate-sparse, FFN-heavy updates that are full-rank but spectrally concentrated away from principal singular subspaces and near-zero source weights.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13349","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Passive Generation to Investigation: A Proactive Scientific Peer Review Agent","primary_cat":"cs.CL","submitted_at":"2026-06-11T13:38:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ProReviewer is an MDP-formulated proactive peer review agent trained with SFT and RL on an 8B model that outperforms larger frontier LLMs on review quality metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12594","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pythagoras-Prover: Advancing Efficient Formal Proving via Augmented Lean Formalisation","primary_cat":"cs.AI","submitted_at":"2026-06-10T18:43:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pythagoras-Prover family achieves 93.0% on MiniF2F-Test with a 32B model and has its 4B version surpass a 671B prior model at pass@32, using ALF data augmentation and curriculum training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12273","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Fully Random Masking: Attention-Guided Denoising and Optimization for Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-10T16:14:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AGDO improves dLLM reasoning performance by determining denoising order and emphasizing tokens based on attention-derived dependencies rather than random masking.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12191","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentic Environment Engineering for Large Language Models: A Survey of Environment Modeling, Synthesis, Evaluation, and Application","primary_cat":"cs.CL","submitted_at":"2026-06-10T15:15:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"This survey categorizes agentic environments for LLMs by eight attributes and domains, introduces symbolic and neural synthesis paradigms with evaluation, and outlines four agent evolution pathways plus three environment evolution paradigms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11470","ref_index":206,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Periodic Table of LLM Reasoning: A Structured Survey of Reasoning Paradigms, Methods, and Failure Modes","primary_cat":"cs.CL","submitted_at":"2026-06-09T21:59:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A literature survey that introduces a taxonomy for LLM reasoning paradigms, analyzes methodological trends, and synthesizes failure modes from over 300 papers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11167","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Faceted Interactivity Alignment in Full-Duplex Speech Models","primary_cat":"cs.CL","submitted_at":"2026-06-09T17:46:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A multi-axis RL alignment technique improves pause handling, turn-taking, backchanneling, and interruption response in full-duplex spoken dialogue models by optimizing axis-specific rewards derived from human audio segments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20658","ref_index":275,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Expected Free Energy-based Planning as Variational Inference","primary_cat":"cs.AI","submitted_at":"2026-06-09T08:09:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EFE-based planning is formulated as variational free energy minimization with epistemic priors, decomposing into expected plan costs plus a complexity term.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08620","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPA: A SQL-Plan-Aware Reinforcement Learning Framework for Query Rewriting with LLMs","primary_cat":"cs.DB","submitted_at":"2026-06-07T13:12:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPA trains LLMs via plan-aware RL with adaptive reward shaping and self-improvement on slowdowns to produce faster query rewrites than rule-based or standard LLM methods on IID and OOD workloads.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08545","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Ishigaki-IDS: An Open-Weight Verifier-Aware Model for Information Delivery Specification Drafting in Building Information Modeling","primary_cat":"cs.CL","submitted_at":"2026-06-07T09:55:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Ishigaki-IDS is a verifier-aware LLM for generating validator-passing IDS files in BIM, reaching IDSAuditPass scores of 0.651-0.753 on a 166-case benchmark and cutting practitioner work time by 54.7%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08466","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ToolRec: Calibrated Preference Alignment for Query Recommendation in On-Device Assistants","primary_cat":"cs.IR","submitted_at":"2026-06-07T06:06:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ToolRec introduces dual-level calibration of click data and weighted KTO alignment to improve tool-invoking query recommendations in on-device assistants, reporting CTR gains in large-scale A/B tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07006","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RASFT: Rollout-Adaptive Supervised Fine-Tuning for Reasoning","primary_cat":"cs.LG","submitted_at":"2026-06-05T07:52:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RASFT is an adaptive SFT method that strengthens or relaxes expert imitation per problem based on on-policy rollout solvability and adds clipped reference-policy ratio to limit drift, reporting better results than standard SFT and RL on math and code benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06556","ref_index":172,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Robots Need More than VLA and World Models","primary_cat":"cs.RO","submitted_at":"2026-06-04T10:43:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper identifies four missing interfaces (data autolabelling, embodiment retargeting, physics-grounded world models, and video-based reward inference) as the central bottleneck beyond VLA scaling for robot intelligence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05800","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SALT: When More Rollouts Don't Help in Group-Based Policy Optimization and How to Make Them Matter","primary_cat":"cs.LG","submitted_at":"2026-06-04T07:29:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SALT is a subspace-adaptive plug-in for GRPO that decomposes group-relative coefficients into shared and residual channels using mini-batch Gram geometry and amplifies residuals to mitigate signed cancellation in RLVR.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05330","ref_index":113,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Model of Multi-turn Human Persuadability Using Probabilistic Belief Tracing","primary_cat":"cs.CL","submitted_at":"2026-06-03T18:17:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PERSUASIONTRACE introduces a Bayesian-network simulated target for multi-turn persuasion that matches human belief dynamics (81 vs 80) better than LLM baselines (64) and enables process-level evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04935","ref_index":299,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Type of Inference is Active Inference?","primary_cat":"cs.AI","submitted_at":"2026-06-03T14:24:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EFE-based active inference planning is characterized as VFE on an augmented model plus entropy and planning corrections, with a derived message-passing implementation and grid-world validation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09883","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TD-Grokking: Learning from Zero-Reward Problems by Training-Time Decomposition","primary_cat":"cs.LG","submitted_at":"2026-06-03T06:40:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TD-Grokking decomposes zero-reward problems into hierarchical trees of solvable subproblems to generate training signals for RLVR on mathematical and medical tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04507","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Evolving Deep Research via Joint Generation and Evaluation","primary_cat":"cs.CL","submitted_at":"2026-06-03T06:38:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SCORE is a shared-parameter co-evolutionary framework coupling generation and evaluation of deep research reports with a meta-harness to adapt evaluation standards as performance improves.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03980","ref_index":63,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Skill-RM: Unifying Heterogeneous Evaluation Criteria via Agent Skill","primary_cat":"cs.LG","submitted_at":"2026-06-02T17:56:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Skill-RM unifies heterogeneous reward criteria by modeling reward computation as dynamic execution of a reusable Reward-Evaluation Skill within an agent framework.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03152","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cost-Aware Optimization for Agentic Query Execution","primary_cat":"cs.DB","submitted_at":"2026-06-02T04:52:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EnumGRPO is a self-improving optimizer for agentic query execution that reduces LLM-operator costs by ~317x while improving accuracy by 18% over a hybrid baseline across four databases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01476","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniOPD: Logit-Free On-Policy Distillation via Speculative Verification","primary_cat":"cs.LG","submitted_at":"2026-05-31T22:31:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniOPD replaces token-level logit matching in on-policy distillation with Monte Carlo chunk-level semantic verification and a peak-entropy scheduler.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02643","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Inference Cost Attacks for Retrieval-Augmented Large Language Models","primary_cat":"cs.CR","submitted_at":"2026-05-31T15:11:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Poisoning external knowledge bases with LLM-agent-crafted documents can increase RAG inference token consumption by up to 13.12 times at over 90% success rate while preserving answer quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01247","ref_index":66,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Where to Look: Can Foundation Models Reach a Target Viewpoint Through Active Exploration?","primary_cat":"cs.CV","submitted_at":"2026-05-31T14:00:10+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Introduces the TVR active viewpoint-matching task and TVRBench indoor simulation benchmark, where foundation models start at low single-digit success rates but reach 51.4% after visual-action SFT and multi-turn GRPO post-training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00775","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GIRL-DETR: Gradient-Isolated Reinforcement Learning for Video Moment Retrieval","primary_cat":"cs.CV","submitted_at":"2026-05-30T15:40:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GIRL-DETR applies gradient-isolated RL post-training after freezing the backbone in a lightweight DETR framework to improve localization accuracy on video moment retrieval benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00755","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Internalize the Temperature: On-Policy Self-Distillation as Policy Reheater for Reinforcement Learning","primary_cat":"cs.CL","submitted_at":"2026-05-30T14:44:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"TS-OPSD internalizes temperature via on-policy self-distillation to reheat entropy-collapsed RL policies in LLMs, providing stronger initialization for further training than continued RL or rollout temperature adjustment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00728","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Empathy to Personalized Empathy: Adapting Empathetic Strategies to Individual Users","primary_cat":"cs.CL","submitted_at":"2026-05-30T13:49:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces personalized empathy task, PersonaEmp dataset from long-term interactions, and PereGRM reward framework that combines empathy evaluation with dynamic criteria for improved adaptation to user personas.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00651","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MESA: Improving MoE Safety Alignment via Decentralized Expertise","primary_cat":"cs.LG","submitted_at":"2026-05-30T09:54:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MESA decentralizes safety duties in MoE LLMs via expert capacity reallocation and dynamic routing refinement based on optimal transport theory, yielding robust defense on harmful benchmarks while preserving helpfulness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00593","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPADER: Step-wise Peer Advantage with Diversity-Aware Exploration Rewards for Multi-Answer Question Answering","primary_cat":"cs.CL","submitted_at":"2026-05-30T07:47:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPADER proposes step-wise peer advantage and diversity-aware exploration rewards in RL for multi-answer QA, reporting improved recall and F1 on QAMPARI, Mintaka, WebQSP, and QUEST.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00564","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decomposed On-Policy Distillation for Vision-Language Reasoning: Steering Gradients for Visual Grounding","primary_cat":"cs.CV","submitted_at":"2026-05-30T06:34:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Decomposes VLM distillation loss into orthogonal language and visual components and introduces Visual Gradient Steering to prioritize visual grounding over standard monolithic optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00562","ref_index":96,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeepLatent: Think with Images via Parallel Latent Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-30T06:33:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DeepLatent introduces a parallel latent visual reasoning framework with learnable 2D tokens and continuous RL, trained via distillation then RL, plus a new 180K dataset, claiming SOTA benchmark results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00519","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DriveAnchor: Progressive Anchor-based Flow Learning for Autonomous Driving Planning","primary_cat":"cs.RO","submitted_at":"2026-05-30T04:17:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DriveAnchor improves collision avoidance in autonomous driving planning via a three-stage anchor-based flow pipeline with pretraining on trajectory vocabulary, energy field post-training, and reward-refined fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00440","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SDR: Set-Distance Rewards for Radiology Report Generation","primary_cat":"cs.AI","submitted_at":"2026-05-30T00:10:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Set-to-set distances on sentence embeddings provide a permutation-invariant reward signal that improves GRPO training and enables efficient test-time scaling for vision-language models generating chest X-ray reports.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00437","ref_index":107,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EST-PRM: Stress-Testing Process Reward Models Before They Become Load-Bearing","primary_cat":"cs.LG","submitted_at":"2026-05-30T00:05:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EST-PRM stress-tests five PRM models on 4,687 reasoning chains from MATH-500, GSM8K, and PRMBench using three label-preserving transformations and reports model-specific vulnerability patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00392","ref_index":72,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Detector-Evasive LLM Paraphrasing via Constrained Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-29T22:19:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DEPO formulates detector-evasive paraphrasing as a constrained MDP and solves it via Lagrangian primal-dual RL with GRPO-style updates to achieve evasion while satisfying a semantic-preservation constraint.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00324","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLMs Need Encoders for Semantic IDs Too","primary_cat":"cs.IR","submitted_at":"2026-05-29T20:01:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PrefixMem encoder for Semantic IDs improves deepest-level accuracy by up to 46% relative and full-SID retrieval recall by up to 22% relative on Pinterest data across LLM families.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00251","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Capability Self-Assessment: Teaching LLMs to Know Their Limits","primary_cat":"cs.AI","submitted_at":"2026-05-29T18:32:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Reinforcement learning teaches LLMs to assess their own capabilities more effectively than supervised fine-tuning, preserves original skills, generalizes out of distribution, and aids local-cloud routing and data selection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00204","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"APE: Agentic Prompt Enhancer for Image Generation and Editing","primary_cat":"cs.CV","submitted_at":"2026-05-29T17:59:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"APE post-trains small language models as single-agent or multi-agent prompt enhancers that improve visual alignment on image generation and editing benchmarks without altering the downstream visual model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31509","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Skill Reuse as Compression in Agentic RL","primary_cat":"cs.LG","submitted_at":"2026-05-29T16:28:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ReuseRL augments agentic RL with an MDL-based compression penalty on skill reuse, proves a PAC-Bayes bound, and reports higher in- and out-of-distribution success on ALFWorld, TextWorld-Cooking, and Countdown-Stepwise versus GRPO and round-length baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31455","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DRIFT: Decoupled Rollouts and Importance-Weighted Fine-Tuning for Efficient Multi-Turn Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-29T15:49:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DRIFT achieves multi-turn RL performance via offline importance-weighted SFT by leveraging the equivalence of KL-regularized RL to weighted supervised learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31268","ref_index":63,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mellum2 Technical Report","primary_cat":"cs.CL","submitted_at":"2026-05-29T13:01:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Mellum 2 is a 12B MoE model with 2.5B active parameters, trained on 10.6T tokens with MoE, GQA, SWA, and MTP, then post-trained into Instruct and Thinking variants, claimed competitive with 4B-14B models at 2.5B compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31228","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EchoRL: Reinforcement Learning via Rollout Echoing","primary_cat":"cs.LG","submitted_at":"2026-05-29T12:31:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EchoRL extracts entropy-based EchoClips from verified-success rollouts and feeds them as auxiliary supervision to prevent advantage collapse in RLVR for LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31192","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Regularizing Power of Language-Training Deepfake Detectors","primary_cat":"cs.CV","submitted_at":"2026-05-29T12:01:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A dual-encoder deepfake detector pairs a frozen specialist with a LoRA-tuned MLLM, trained first via binary alignment then via RL to reward explain-then-classify behavior, yielding improved cross-dataset performance and interpretability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07602","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sample-Efficient Post-Training for LEGO Spatial-Physics Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-29T09:31:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PVPO is a sample-efficient RL method that improves semantic, geometric, and physical quality in LLM LEGO assembly generation by mitigating the PhysHack failure mode where validity alone fails to ensure fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31058","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Combinatorial Synthesis: Scaling Code RLVR via Atomic Decomposition and Recombination","primary_cat":"cs.CL","submitted_at":"2026-05-29T09:29:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ADR generates novel verifiable code tasks via atomic decomposition and recombination, outperforming heuristic baselines in originality, difficulty, and downstream RLVR gains across coding domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30919","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"De-attribute to Forget for LLM Unlearning","primary_cat":"cs.LG","submitted_at":"2026-05-29T07:03:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DareU reframes LLM unlearning as zeroing data attribution via RL rewards from an LLM classifier approximation, claiming better balance of forget quality and model utility than loss-based baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30914","ref_index":101,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Automating Formal Verification with Reinforcement Learning and Recursive Inference","primary_cat":"cs.LG","submitted_at":"2026-05-29T06:59:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RLVR training raises verified Dafny pass rates from 9.7% to 31.1% on a filtered benchmark while a Lean proof scaffold lifts success from 46.2% to 69.2% on a pilot set and solves 7 of 42 prior unsolved tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30884","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GUI-C$^2$: Coarse-to-Fine GUI Grounding via Difficulty-Aware Reinforcement Learning","primary_cat":"cs.CV","submitted_at":"2026-05-29T06:17:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GUI-C² pairs a difficulty-scoring data pipeline with an area-gated coarse-to-fine RL mechanism to improve GUI grounding accuracy and training stability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30859","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DARTS: Distribution-Aware Active Rollout Trajectory Shaping for Accelerating LLM Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-29T05:31:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DARTS accelerates LLM RL training up to 1.77x by distribution-aware trajectory sampling and adaptive redundancy allocation that shapes rollouts toward conciseness without performance loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30832","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SLAT: Segment-Level Adaptive Trimming for Efficient CoT Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-29T04:37:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SLAT applies segment-level adaptive trimming in RL to reduce CoT reasoning length by 50% while maintaining competitive accuracy on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30824","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Planner-Centric Reinforcement Learning for Deep Research with Structure-Aware Reward","primary_cat":"cs.AI","submitted_at":"2026-05-29T04:18:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DecomposeR represents research plans as typed DAGs and uses two-stage planner-then-answerer RL to improve long-form research performance by 5.1-8.0 points over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30795","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Feat2Go: Visual Feature-Grounded Value Estimation for Embodied Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2026-05-29T03:36:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Feat2Go uses patch-level similarity from a visual world model and trend-based clustering to create progress targets for training value models that improve reward shaping in embodied RL for VLA policies, yielding large gains on ManiSkill3 and RoboTwin benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30789","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Smaller Models are Natural Explorers for Policy-Level Diversity in GRPO","primary_cat":"cs.LG","submitted_at":"2026-05-29T03:25:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Smaller models provide temporally correlated policy-level diversity that serves as structured exploration for training larger models in GRPO, yielding accuracy gains such as +8.8% on AIME 24 with reduced compute via the S2L-PO framework.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24892","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReviewGuard: Aligning LLM-Assisted Peer Review with Long-Term Scientific Impact","primary_cat":"cs.DL","submitted_at":"2026-05-29T02:05:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReviewGuard aligns LLM peer reviews with future citations via impact-aligned RL, achieving Spearman ρ=0.776 on rejected-then-published AI/ML papers versus 0.492 for human reviewers and flagging 5.6× more high-impact cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30712","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ExpGraph: Model-Agnostic Experience Learning with Graph-Structured Memory for LLM Agents","primary_cat":"cs.CL","submitted_at":"2026-05-29T01:04:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ExpGraph builds a graph of summarized agent experiences and uses graph diffusion plus an RL-trained retrieval copilot to improve frozen LLM executors on QA, math, code, and agentic tasks without parameter updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30690","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ElasticMem: Latent Memory as a Learnable Resource for LLM Agents","primary_cat":"cs.CL","submitted_at":"2026-05-29T00:34:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ElasticMem enables LLM agents to learn adaptive latent memory retrieval and elastic budget allocation, improving QA accuracy by 24-26% and ALFWorld success by 27-66% over baselines with lower token cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30666","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Tutoring Effectiveness Index: Predicting LLM Math Tutor Quality from Four Conversation Signals","primary_cat":"cs.CY","submitted_at":"2026-05-28T23:55:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The Tutoring Effectiveness Index (TEI) uses four signals from LLM conversations to select math tutoring responses, raising student improvement rates from 59.0% to 81.9% at N=8 on a frozen DeepSeek-R1-8B model without training or judges.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30639","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PInVerify: An Offline Embodied Benchmark for Active Instance Verification","primary_cat":"cs.CV","submitted_at":"2026-05-28T22:42:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PInVerify is a new offline embodied benchmark for active instance verification that supplies multi-view captures and 6-sector navigation topology, with MLLM baselines reaching 85.6% after fine-tuning but showing no reliable benefit from tested next-best-view strategies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00135","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On Effectiveness and Efficiency of Agentic Tool-calling and RL Training","primary_cat":"cs.LG","submitted_at":"2026-05-28T22:21:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Tool-calling evaluations for LLM agents are highly sensitive to implementation details such as random seeds and history handling, and two new techniques accelerate RL training with wall-clock speedup and no performance degradation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30553","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Destruction is a General Strategy to Learn Generation; Diffusion's Strength is to Take it Seriously; Exploration is the Future","primary_cat":"cs.LG","submitted_at":"2026-05-28T20:35:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Diffusion models are positioned as a general destroy-to-reconstruct strategy for learning generation that may outperform hand-crafted withholding methods in low-data regimes, with discussion of RL integration and exploration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30478","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Improving Small Language Models for Code Generation with Reinforcement Learning from Verification Feedback","primary_cat":"cs.SE","submitted_at":"2026-05-28T18:50:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"RLVR with combined unit-test and static-analysis rewards improves pass@1 by up to 13pp on MBPP for 0.6B-1B models, while single-reward variants can induce shorter but less correct outputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30451","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VeriGate: Verifier-Gated Step-Level Supervision for GRPO","primary_cat":"cs.LG","submitted_at":"2026-05-28T18:20:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VeriGate adds verifier-gated step-level supervision to GRPO via cumulated PRM rewards and group-normalized token advantages, raising accuracy 20% and 12% on 1.5B and 7B models on MATH and six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30244","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforcement Learning with Robust Rubric Rewards","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:11:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RLR³ extends RLVR to criterion-level rubric verification via dual execution paths, minimal exposure masking, hierarchical aggregation, and saturation mitigation, delivering 4.7-point gains over base on 15 benchmarks with Qwen3-VL-30B-A3B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30219","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Should Models Change Their Minds? Contextual Belief Management in Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-28T16:52:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces BeliefTrack benchmark diagnosing three CBM failures in LLMs and shows RL with belief-state rewards cuts failure rates by 70.9% while representation steering cuts them by 46.1%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30154","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RL2ML: Finite-Rollout Surrogate Objectives from Reinforcement Learning to Maximum Likelihood","primary_cat":"cs.LG","submitted_at":"2026-05-28T16:14:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RL2ML introduces a parameterized family of surrogate objectives bridging RL and ML with unbiased gradient estimators, group-level update-scale analysis, and metric-dependent optimization for finite-rollout LLM training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30070","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Predictive Law for On-Policy Self-Distillation From World Feedback","primary_cat":"cs.LG","submitted_at":"2026-05-28T15:17:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A linear relationship between initial student-self-teacher performance gap and OPSD improvement provides a predictive law across contexts and model families.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30014","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From GPS Points to Travel Patterns: Flexible and Semantic Trajectory Generation with LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-28T14:39:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HTP hierarchically generates travel patterns via RQ-VAE tokenization then uses SFT-tuned LLMs to produce conditioned trajectory sequences, outperforming baselines by 29.78% on two datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29951","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MuPHI: Learning Implicit Multimodal Harm Reasoning via Semantically Grounded Reward Optimization","primary_cat":"cs.AI","submitted_at":"2026-05-28T13:58:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MuPHI dataset and MuPHIRM reward-optimization framework improve VLM detection and reasoning on implicit multimodal harm with better out-of-distribution robustness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29860","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ESPO: Early-Stopping Proximal Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-28T12:40:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ESPO adds on-the-fly early stopping to PPO rollouts for LLM math reasoning using cumulative surrogate regret, improving AIME, AMC, and MATH-500 scores over PPO while cutting over 20% rollout tokens on a 7B model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29715","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"User-Aware Active Knowledge Acquisition for Emotional Support Dialogue","primary_cat":"cs.CL","submitted_at":"2026-05-28T10:13:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UKA is a gradient-free active dialogue learning framework using Theory-of-Mind uncertainty estimation to acquire user-aligned conversational knowledge, outperforming baselines in dialogue quality and user alignment across benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29697","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Trajectory Rewards: Step-level Credit Assignment for Agentic Search via Graph Modeling","primary_cat":"cs.AI","submitted_at":"2026-05-28T09:57:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GDCR assigns step-level rewards via distance to the answer node in a training-time ER graph and SAPO combines these with trajectory advantages for credit assignment in agentic search.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29548","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Larger Models Learn More: Effects of Capacity, Interference, and Rare-Task Retention","primary_cat":"cs.LG","submitted_at":"2026-05-28T08:02:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Larger models succeed on rare and complex tasks by reducing gradient interference from common tasks, allowing rare-task features to accumulate, as shown via synthetic task mixtures and OLMo pretraining from 4M to 4B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29398","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GDSD: Reinforcement Learning as Guided Denoiser Self-Distillation for Diffusion Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-28T05:47:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GDSD reduces RL for dLLMs to likelihood-free self-distillation via a normalization-free logit-matching objective, outperforming ELBO methods with more stable training on LLaDA-8B and Dream-7B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29287","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniNote: A Unified Embedding Model for Multimodal Representation and Ranking","primary_cat":"cs.IR","submitted_at":"2026-05-28T03:11:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"UniNote proposes a two-stage trained unified embedding model (contrastive SFT then RL) for multimodal I2I retrieval that claims SOTA results and was deployed at Xiaohongshu with MRL for improved quality and efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29256","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DynSess: Dynamic Session-Level Evaluation and Optimization Framework for Role-Playing Agents","primary_cat":"cs.CL","submitted_at":"2026-05-28T02:20:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DynSess supplies session-level rubrics for dialogue evaluation and uses the resulting rewards to train lighter role-playing agents via multi-turn lookahead search and DSPO/GSRPO optimization that match stronger baselines on human judgments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29247","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DenseSteer: Steering Small Language Models towards Dense Math Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-28T02:07:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DenseSteer is an inference-time steering framework that improves small LLMs' accuracy on math reasoning by modulating representations toward dense reasoning patterns with fewer but higher-density steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29198","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Guidance Contrastive Token Credit Assignment for Discrete Policy Optimization","primary_cat":"cs.CV","submitted_at":"2026-05-28T00:17:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GCPO performs per-token credit assignment in discrete policy optimization by setting token advantages proportional to the difference in model predictions under positive versus negative prompts, outperforming GRPO and DAPO on text-to-image and chain-of-thought tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29156","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RUBRIC-ARROW: Alternating Pointwise Rubric Reward Modeling for LLM Post-training in Non-verifiable Domains","primary_cat":"cs.LG","submitted_at":"2026-05-27T22:46:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RUBRIC-ARROW is an alternating rubric generator and judge framework that uses probability-based scoring and pairwise preferences to improve pointwise reward modeling accuracy for LLM post-training in non-verifiable domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26122","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DocArena: Turning Raw Documents into Controllable Training Environments for Document Search Agents","primary_cat":"cs.CV","submitted_at":"2026-05-27T21:21:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DocArena automates creation of multimodal document QA training data via MLLM-based structuring and cross-page reasoning pairs, yielding agents with top retrieval and QA performance in unified tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29076","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Structured Prompt Optimization Meets Reinforcement Learning for Global and Local Interpretability over Complex Text","primary_cat":"cs.CL","submitted_at":"2026-05-27T20:29:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"eXTC learns a natural-language SOP via structured prompt optimization, distills it into a compact LM, and extends it with RL to deliver fast inference plus global rules and local traces while claiming benchmark gains over prior paradigms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28805","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniVerifier-M1: Multimodal Meta-Verifier with Explicit Structured Recalibration","primary_cat":"cs.CL","submitted_at":"2026-05-27T17:56:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"OmniVerifier-M1 is a generalist visual verifier using symbolic outputs for meta-verification and decoupled RL to outperform joint optimization for robust verification and agentic self-correction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28791","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Skill-Conditioned Gated Self-Distillation for LLM Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-27T17:49:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SGSD retrieves skill-mistake pairs to build a multi-teacher pool, validates teacher polarity via a verifier, and applies a gated objective to distill useful signals, yielding 6.2% average gains over GRPO on math benchmarks with Qwen3-1.7B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28774","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agent Explorative Policy Optimization for Multimodal Agentic Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-27T17:36:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AXPO addresses the Thinking-Acting Gap in agentic RL training by targeted resampling of tool calls in all-wrong subgroups, delivering +1.8pp gains over GRPO on nine multimodal benchmarks with an 8B model beating a 32B baseline on Pass@4.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28742","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CORE: Contrastive Reflection Enables Rapid Improvements in Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-27T17:01:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CORE distills contrasts between successful and unsuccessful reasoning traces into compact natural-language insights that enable faster model self-improvement on reasoning tasks with fewer rollouts than parametric or other non-parametric baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28713","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Thinking as Compression: Your Reasoning Model is Secretly a Context Compressor","primary_cat":"cs.AI","submitted_at":"2026-05-27T16:36:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Reasoning models naturally compress context via thinking traces, with reward-constrained optimization yielding 17-23% gains over baselines on long-context QA at high compression ratios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28699","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TRACER: Turn-level Regret Matching with Inner Reinforcement Credit for Cooperative Multi-LLM Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-27T16:25:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TRACER combines a controller-regret layer using regret matching for speak/skip decisions with a generation-credit layer using GSPO rewards to enable learned collaboration in multi-LLM reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28600","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Transformers Provably Learn to Internalize Chain-of-Thought","primary_cat":"cs.LG","submitted_at":"2026-05-27T15:17:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"L-layer transformers under Log-ICoT curriculum provably learn k-parity with poly(n) samples and log k stages, matching explicit CoT efficiency without inference overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28548","ref_index":62,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GEM: Generative Supervision Helps Embodied Intelligence","primary_cat":"cs.CV","submitted_at":"2026-05-27T14:39:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GEM adds generative depth supervision to VLM pre-training and reports improved results on embodied benchmarks plus real-world robot execution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28421","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DenoiseRL: Bootstrapping Reasoning Models to Recover from Noisy Prefixes","primary_cat":"cs.AI","submitted_at":"2026-05-27T12:52:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DenoiseRL optimizes recovery from noisy prefixes in weak-model reasoning failures to improve performance and self-correction on math and general reasoning benchmarks without external supervision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28409","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Efficient Post-training of LLMs for Code Generation With Offline Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2026-05-27T12:43:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Offline RL post-training boosts code generation performance in LLMs, with larger gains for small models and hard problems, using pre-collected datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28396","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ADWIN: Adaptive Windows for Horizon-Aware On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-27T12:33:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ADWIN adaptively selects training horizons in on-policy distillation via prefix alignment checks, cutting end-to-end cost by up to 4.1x while matching or exceeding full-rollout accuracy on math and code benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28388","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mechanistically Interpreting the Role of Sample Difficulty in RLVR for LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-27T12:25:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Sample difficulty in RLVR shows non-monotonic effects on LLM reasoning, with easy/medium problems strengthening computation and reasoning features while hard problems often yield weak or harmful signals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":100,"offset":0}}