{"work":{"id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","openalex_id":null,"doi":null,"arxiv_id":"2110.14168","raw_key":null,"title":"Training Verifiers to Solve Math Word Problems","authors":null,"authors_text":"Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, Christopher Hesse, and John Schulman","year":2021,"venue":"cs.LG","abstract":"State-of-the-art language models can match human performance on many tasks, but they still struggle to robustly perform multi-step mathematical reasoning. To diagnose the failures of current models and support research, we introduce GSM8K, a dataset of 8.5K high quality linguistically diverse grade school math word problems. We find that even the largest transformer models fail to achieve high test performance, despite the conceptual simplicity of this problem distribution. To increase performance, we propose training verifiers to judge the correctness of model completions. At test time, we generate many candidate solutions and select the one ranked highest by the verifier. We demonstrate that verification significantly improves performance on GSM8K, and we provide strong empirical evidence that verification scales more effectively with increased data than a finetuning baseline.","external_url":"https://arxiv.org/abs/2110.14168","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-29T05:53:09.197804+00:00","pith_arxiv_id":"2110.14168","created_at":"2026-05-09T03:55:08.450539+00:00","updated_at":"2026-06-29T05:53:09.197804+00:00","title_quality_ok":true,"display_title":"Training Verifiers to Solve Math Word Problems","render_title":"Training Verifiers to Solve Math Word Problems"},"hub":{"state":{"work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","tier":"mega_hub","tier_reason":"1,000+ Pith inbound or 100,000+ external citations","pith_inbound_count":1002,"external_cited_by_count":null,"distinct_field_count":31,"first_pith_cited_at":"2021-12-01T22:24:34+00:00","last_pith_cited_at":"2026-06-26T01:12:02+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"needed","recognition_status":"needed","updated_at":"2026-06-29T06:48:25.683919+00:00","tier_text":"mega_hub"},"tier":"mega_hub","role_counts":[{"context_role":"background","n":124},{"context_role":"dataset","n":100},{"context_role":"method","n":7},{"context_role":"baseline","n":4},{"context_role":"other","n":2}],"polarity_counts":[{"context_polarity":"background","n":112},{"context_polarity":"use_dataset","n":98},{"context_polarity":"unclear","n":16},{"context_polarity":"use_method","n":7},{"context_polarity":"baseline","n":4}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"Training Verifiers to Solve Math Word Problems","claims":[{"claim_text":"State-of-the-art language models can match human performance on many tasks, but they still struggle to robustly perform multi-step mathematical reasoning. To diagnose the failures of current models and support research, we introduce GSM8K, a dataset of 8.5K high quality linguistically diverse grade school math word problems. We find that even the largest transformer models fail to achieve high test performance, despite the conceptual simplicity of this problem distribution. To increase performance, we propose training verifiers to judge the correctness of model completions. At test time, we ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Training Verifiers to Solve Math Word Problems because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T18:03:31.006350+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"07c47add-2301-4164-9d06-23347fc20617","orcid":null,"display_name":"Karl Cobbe"},{"id":"edf3b705-ff6d-4713-9b26-27729234c00d","orcid":null,"display_name":"Vineet Kosaraju"},{"id":"9253b15a-b5df-4d8c-bad6-79bec0dcd54d","orcid":null,"display_name":"Mohammad Bavarian"},{"id":"27b716ab-b5bb-4619-9617-be39d50e5f88","orcid":null,"display_name":"Mark Chen"},{"id":"dfca2058-03d1-4251-92fe-0eaddf1dfcf0","orcid":null,"display_name":"Heewoo Jun"},{"id":"7b67bce8-4222-4c96-93ed-a2ccbbe6513d","orcid":null,"display_name":"Lukasz Kaiser"}]},"error":null,"updated_at":"2026-05-13T17:24:05.834629+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T17:53:40.535835+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":139},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":115},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":113},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":107},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":104},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":78},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":77},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":77},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":70},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":65},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":61},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":57},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":54},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":48},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":48},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":47},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":40},{"title":"Self-Consistency Improves Chain of Thought Reasoning in Language Models","work_id":"8c6d5a6b-b5cc-4105-9c84-9c34bb9375bb","shared_citers":38},{"title":"Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them","work_id":"513eb205-04ca-4722-9a43-a74e8cbe7e85","shared_citers":35},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":35},{"title":"Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters","work_id":"a8d50b24-bdf5-46ed-bc4f-2927dfd81f1d","shared_citers":35},{"title":"Instruction-Following Evaluation for Large Language Models","work_id":"3aa06177-125a-4f5a-8f4a-8070c5986c26","shared_citers":33},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":33},{"title":"Let's Verify Step by Step","work_id":"6d05b790-04c5-4fd2-91b2-ba1dfdd5770f","shared_citers":32}],"time_series":[{"n":1,"year":2021},{"n":6,"year":2022},{"n":16,"year":2023},{"n":30,"year":2024},{"n":16,"year":2025},{"n":401,"year":2026}]},"error":null,"updated_at":"2026-05-13T17:25:55.152933+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T17:53:39.755816+00:00"},"reader_index":{"job_type":"reader_index","status":"succeeded","result":{"note":"annotated reader requires full-text/OA fetch; shell is wired for mega hubs","status":"reader queued"},"error":null,"updated_at":"2026-06-29T00:28:12.424078+00:00"},"recognition_alignment":{"job_type":"recognition_alignment","status":"succeeded","result":{"modules":["IndisputableMonolith.Sport.PeakPerformanceFromJCost","IndisputableMonolith.Sports.PeakPerformanceFromPhiLadder","IndisputableMonolith.Cognition.AnimalZComplexityBound","IndisputableMonolith.Information.ChurchTuring","IndisputableMonolith.Education.MasteryThresholdFromGap45","IndisputableMonolith.Flight.Falsifiers","IndisputableMonolith.Materials.RoomTSuperconductorCandidate","IndisputableMonolith.MusicTheory.Rhythm"],"query_chars":938},"error":null,"updated_at":"2026-06-29T00:28:34.193407+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"Training Verifiers to Solve Math Word Problems","claims":[{"claim_text":"State-of-the-art language models can match human performance on many tasks, but they still struggle to robustly perform multi-step mathematical reasoning. To diagnose the failures of current models and support research, we introduce GSM8K, a dataset of 8.5K high quality linguistically diverse grade school math word problems. We find that even the largest transformer models fail to achieve high test performance, despite the conceptual simplicity of this problem distribution. To increase performance, we propose training verifiers to judge the correctness of model completions. At test time, we ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Training Verifiers to Solve Math Word Problems because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T18:03:31.004378+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Training Verifiers to Solve Math Word Problems","claims":[{"claim_text":"State-of-the-art language models can match human performance on many tasks, but they still struggle to robustly perform multi-step mathematical reasoning. To diagnose the failures of current models and support research, we introduce GSM8K, a dataset of 8.5K high quality linguistically diverse grade school math word problems. We find that even the largest transformer models fail to achieve high test performance, despite the conceptual simplicity of this problem distribution. To increase performance, we propose training verifiers to judge the correctness of model completions. At test time, we ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Training Verifiers to Solve Math Word Problems because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:25:52.713246+00:00"}},"summary":{"title":"Training Verifiers to Solve Math Word Problems","claims":[{"claim_text":"State-of-the-art language models can match human performance on many tasks, but they still struggle to robustly perform multi-step mathematical reasoning. To diagnose the failures of current models and support research, we introduce GSM8K, a dataset of 8.5K high quality linguistically diverse grade school math word problems. We find that even the largest transformer models fail to achieve high test performance, despite the conceptual simplicity of this problem distribution. To increase performance, we propose training verifiers to judge the correctness of model completions. At test time, we ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Training Verifiers to Solve Math Word Problems because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":139},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":115},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":113},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":107},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":104},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":78},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":77},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":77},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":70},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":65},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":61},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":57},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":54},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":48},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":48},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":47},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":40},{"title":"Self-Consistency Improves Chain of Thought Reasoning in Language Models","work_id":"8c6d5a6b-b5cc-4105-9c84-9c34bb9375bb","shared_citers":38},{"title":"Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them","work_id":"513eb205-04ca-4722-9a43-a74e8cbe7e85","shared_citers":35},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":35},{"title":"Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters","work_id":"a8d50b24-bdf5-46ed-bc4f-2927dfd81f1d","shared_citers":35},{"title":"Instruction-Following Evaluation for Large Language Models","work_id":"3aa06177-125a-4f5a-8f4a-8070c5986c26","shared_citers":33},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":33},{"title":"Let's Verify Step by Step","work_id":"6d05b790-04c5-4fd2-91b2-ba1dfdd5770f","shared_citers":32}],"time_series":[{"n":1,"year":2021},{"n":6,"year":2022},{"n":16,"year":2023},{"n":30,"year":2024},{"n":16,"year":2025},{"n":401,"year":2026}]},"authors":[{"id":"dfca2058-03d1-4251-92fe-0eaddf1dfcf0","orcid":null,"display_name":"Heewoo Jun","source":"manual","import_confidence":0.72},{"id":"07c47add-2301-4164-9d06-23347fc20617","orcid":null,"display_name":"Karl Cobbe","source":"manual","import_confidence":0.72},{"id":"7b67bce8-4222-4c96-93ed-a2ccbbe6513d","orcid":null,"display_name":"Lukasz Kaiser","source":"manual","import_confidence":0.72},{"id":"27b716ab-b5bb-4619-9617-be39d50e5f88","orcid":null,"display_name":"Mark Chen","source":"manual","import_confidence":0.72},{"id":"9253b15a-b5df-4d8c-bad6-79bec0dcd54d","orcid":null,"display_name":"Mohammad Bavarian","source":"manual","import_confidence":0.72},{"id":"edf3b705-ff6d-4713-9b26-27729234c00d","orcid":null,"display_name":"Vineet Kosaraju","source":"manual","import_confidence":0.72}]},"citers":{"total":1002,"items":[{"citing_arxiv_id":"2606.27632","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Yuvion LLM: An Adversarially-Aware Large Language Model for Content And AI Safety","primary_cat":"cs.CL","submitted_at":"2026-06-26T01:12:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Yuvion LLM applies adversarially aware training and introduces the YLRE benchmark set, claiming superior safety robustness over larger models on multiple tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24790","ref_index":64,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Grad Detect: Gradient-Based Hallucination Detection in LLMs","primary_cat":"cs.LG","submitted_at":"2026-06-23T16:46:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Grad Detect uses internal gradient patterns from one inference pass to predict LLM hallucinations and abstention, outperforming confidence and sampling baselines on Q&A benchmarks with most signal in the final five layers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12364","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On Subquadratic Architectures: From Applications to Principles","primary_cat":"cs.LG","submitted_at":"2026-06-10T17:33:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"xLSTM outperforms Mamba-2 and Gated DeltaNet on tasks with complex dependencies because its gating scheme enables more flexible and stable state tracking and memory accumulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07006","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RASFT: Rollout-Adaptive Supervised Fine-Tuning for Reasoning","primary_cat":"cs.LG","submitted_at":"2026-06-05T07:52:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RASFT is an adaptive SFT method that strengthens or relaxes expert imitation per problem based on on-policy rollout solvability and adds clipped reference-policy ratio to limit drift, reporting better results than standard SFT and RL on math and code benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01400","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Consistent and Distinctive: LLM Benchmark Efficiency via Maximum Independent Set Prompt Selection on Similarity Graphs","primary_cat":"cs.CL","submitted_at":"2026-05-31T18:45:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A graph-based MIS prompt selection method on embedding similarity graphs yields reduced benchmark subsets with highly consistent LLM rankings (Kendall's W ≥ 0.90 in 99.2% of cases) and 25-48% size reduction at higher thresholds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01168","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Thinking Economically: A Hierarchical Framework for Adaptive-Complexity Reasoning in LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-31T11:20:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HAB applies coarse-to-fine budgeting to LLM reasoning, predicting per-problem depth and learning intra-step token budgets via PPL comparisons and adaptive Pareto optimization, yielding higher accuracy and lower token use than standard CoT on GSM8K and MATH500.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01075","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Generalization Gap in Self-Evolving Language Model Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-31T07:43:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Closed-loop self-evolution on LLMs improves reasoning on Knights and Knaves tasks but plateaus short of oracle-supervised levels, with multi-turn revision nearly matching it for large models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00869","ref_index":64,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enhancing LLM Metacognition via Cognitive Pairwise Training","primary_cat":"cs.LG","submitted_at":"2026-05-30T19:53:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CPT is introduced as a pairwise reasoning-trace comparison stage that improves the reasoning-metacognition trade-off over standard SFT+RL pipelines across model scales.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00819","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mitigating Hallucinations in Large Language Models Via Decoder Layer Skipping","primary_cat":"cs.AI","submitted_at":"2026-05-30T17:40:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeLask dynamically skips hallucination-prone decoder layers in LLMs by measuring gradient driftance via cosine similarity and partially aggregating states instead of full skipping.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00671","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AXIOM: A Trust-First Neuro-Symbolic Execution Architecture for Verifiable Mathematical Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-30T10:55:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AXIOM routes math problems via LLM canonicalization to 3100+ deterministic CAS handlers, reporting 94.36% correctness at 100% trust on parseable MATH benchmark items with no confident-wrong answers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00660","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FineVerify: Scaling Test-Time Compute with Fine-Grained Self-Verification for Agentic Search","primary_cat":"cs.CL","submitted_at":"2026-05-30T10:21:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FineVerify improves agentic search accuracy by decomposing questions into sub-questions for per-check verification of sampled trajectories, outperforming score-based baselines on four benchmarks with two models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00651","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MESA: Improving MoE Safety Alignment via Decentralized Expertise","primary_cat":"cs.LG","submitted_at":"2026-05-30T09:54:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MESA decentralizes safety duties in MoE LLMs via expert capacity reallocation and dynamic routing refinement based on optimal transport theory, yielding robust defense on harmful benchmarks while preserving helpfulness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00642","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hidden Thoughts Are Not Secret: Reasoning Trace Exposure in LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-30T09:37:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"REP elicits hidden LLM reasoning traces via in-context shadow demonstrations, raising similarity to internal traces while retaining distillation utility across datasets and models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00609","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CARE-RL: Capability-Aware Reinforcement Learning for Mitigating Cross-Domain Conflicts","primary_cat":"cs.LG","submitted_at":"2026-05-30T08:18:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CARE-RL combines PA-GRM for task-adaptive rewards on open-ended tasks and DACSP for modulating RL updates using historical capability directions, reporting higher total average scores than baselines on Qwen models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00539","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GNMR: Runtime Stability Control for Low-Precision Large Language Model Training","primary_cat":"cs.LG","submitted_at":"2026-05-30T05:11:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GNMR is a gradient-norm-based controller that maps local stability signals to budgeted recovery actions to stabilize low-precision LLM training while preserving quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00494","ref_index":73,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ProjQ: Project-and-Quantize for Adapter-Aware LLM Compression","primary_cat":"cs.LG","submitted_at":"2026-05-30T02:54:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ProjQ constrains post-training quantization noise to a low-rank manifold through orthogonal subspace projection, enabling better compensation by LoRA adapters and preserving greater model plasticity than standard PTQ.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00487","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TAPS: Target-Aware Prefix Tree Selection for Diffusion-Drafted Speculative Decoding","primary_cat":"cs.AI","submitted_at":"2026-05-30T02:39:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TAPS converts diffusion marginal probabilities into path-conditioned acceptance estimates to select prefix-closed subtrees under a fixed verification budget, achieving up to 7.9x end-to-end speedup over autoregressive decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07612","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Position: Anthropomorphic Misalignment Research Needs Stronger Evidence","primary_cat":"cs.CY","submitted_at":"2026-05-29T16:38:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Position paper calling for stronger evidentiary standards and a diagnostic checklist in anthropomorphic misalignment research.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31494","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Consolidating Rewarded Perturbations for LLM Post-Training","primary_cat":"cs.CL","submitted_at":"2026-05-29T16:16:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoRP consolidates reward-weighted perturbations into a single model via low-rank structure, improving base LLMs by 8.1 points on average while using one-tenth the budget of prior ensembles and one forward pass.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31268","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mellum2 Technical Report","primary_cat":"cs.CL","submitted_at":"2026-05-29T13:01:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Mellum 2 is a 12B MoE model with 2.5B active parameters, trained on 10.6T tokens with MoE, GQA, SWA, and MTP, then post-trained into Instruct and Thinking variants, claimed competitive with 4B-14B models at 2.5B compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31175","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Efficient LLMs Annealing with Principled Sample Selection","primary_cat":"cs.CL","submitted_at":"2026-05-29T11:42:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DiReCT reformulates LLM annealing sample selection as a constrained optimization problem that enforces per-sample gradient directions aligned with the loss landscape's curvature.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31164","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"D$^3$: Dynamic Directional Graph-Constrained Data Scheduling for LLM Training","primary_cat":"cs.CL","submitted_at":"2026-05-29T11:13:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"D³ introduces a dynamic directional graph-constrained framework that models sample interactions via loss dependencies to derive an optimized training sequence for LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31159","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Trust-Region Behavior Blending for On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-29T11:06:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TRB introduces a KL-trust-region warmup for on-policy distillation that blends toward teacher behavior early in training and anneals to zero, reporting the highest average performance across two math-reasoning distillation experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03645","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Shape of Addition: Geometric Structures of Arithmetic in Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-29T10:55:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLM residual streams during addition form an Iso-Raw-Sum Trajectory anchored by digit semantics and modulated by continuous carry signals, with errors arising as geometric slippages across quantization thresholds in a noisy model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07604","ref_index":24,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Contribution Weights: A Geometrical Analysis of Self-Attention Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-29T09:40:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Contribution Weights combine attention, value magnitude, and directional alignment to measure token influence more faithfully than attention alone, and show attention sinks actively suppress information via a convex sink-rate to output-norm relationship.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07603","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MetaEvo: A Meta-Optimization Framework for Experience-Driven Agent Evolution","primary_cat":"cs.LG","submitted_at":"2026-05-29T09:31:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MetaEvo is a two-stage framework using preference optimization for principle abstraction followed by modular reuse to enable continual improvement of LLM agents on reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31025","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TRACE: Discovering Task-Specific Parameter via Adaptation-Aware Probing for Continual Fine-Tuning","primary_cat":"cs.CL","submitted_at":"2026-05-29T08:57:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TRACE uses a brief adaptation probe to isolate task-specific parameters and updates only those during continual LLM fine-tuning to reduce catastrophic forgetting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30992","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Eigenvectors of Experts are Training-free Non-collapsing Routers","primary_cat":"cs.LG","submitted_at":"2026-05-29T08:27:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SSMoE uses eigenvectors of expert weights via SVD to build training-free non-collapsing routers for SMoE models in language and vision tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30876","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"dMoE: dLLMs with Learnable Block Experts","primary_cat":"cs.CL","submitted_at":"2026-05-29T06:03:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"dMoE aggregates token expert distributions to block level in dLLMs, cutting unique experts from 69.5 to 14.6, memory by 76-80%, and latency by 1.14-1.66x while retaining 99.11% performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30859","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DARTS: Distribution-Aware Active Rollout Trajectory Shaping for Accelerating LLM Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-29T05:31:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DARTS accelerates LLM RL training up to 1.77x by distribution-aware trajectory sampling and adaptive redundancy allocation that shapes rollouts toward conciseness without performance loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30844","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fine-Tuning Improves Information Conveyance in Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-29T05:05:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fine-tuning reorganizes uncertainty in LLMs into more efficient information conveyance, as shown by stronger length-entropy correlations and a tripling of entropy-semantic diversity links after controls.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30753","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Efficient Diffusion LLMs via Temporal-Spatial Parallel Decoding and Confidence Extrapolation","primary_cat":"cs.CL","submitted_at":"2026-05-29T02:29:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces TSPD with a trajectory-feature controller and training-free CE to reduce denoising steps in dLLMs while aiming to preserve quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29790","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Evolve as a Team: Collaborative Self-Evolution for LLM-based Multi-Agent Systems","primary_cat":"cs.MA","submitted_at":"2026-05-28T11:40:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Meta-Team is a collaborative self-evolution framework that turns multi-agent execution experience into reusable improvements at agent, coordination, and team levels, outperforming baselines on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29613","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decoding Strategies for Diffusion-Based ASR: A Systematic Evaluation of Confidence-Based Thresholding","primary_cat":"eess.AS","submitted_at":"2026-05-28T08:48:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Threshold-based decoding for diffusion ASR outperforms fixed schemes by accepting high-confidence tokens early and matches autoregressive accuracy with better speed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29612","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CONCAT: Consensus- and Confidence-Driven Ad Hoc Teaming for Efficient LLM-Based Multi-Agent Systems","primary_cat":"cs.MA","submitted_at":"2026-05-28T08:47:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CONCAT introduces a consensus- and confidence-driven ad hoc teaming method that reduces communication overhead in LLM-based multi-agent systems by up to 50% latency while improving efficiency ratio without any training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29511","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DynaGraph: Lightweight Multi-Model Interaction Framework via Dynamic Topological Reconfiguration","primary_cat":"cs.MA","submitted_at":"2026-05-28T07:33:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DynaGraph is a multi-model framework that multiplexes PEFT adapters on a shared base model with evaluator-driven dynamic topology reconfiguration and hierarchical self-healing to achieve near-72B performance on reasoning benchmarks using an 8B model while reducing latency and tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29295","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvoGM: Learning to Merge LLMs via Evolutionary Generative Optimization","primary_cat":"cs.NE","submitted_at":"2026-05-28T03:22:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EvoGM uses a dual-generator architecture with cycle-consistent learning on winner-loser pairs from search history to optimize LLM merging coefficients inside a multi-round evolutionary pipeline and reports outperformance over baselines on seen and unseen tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23901","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLMs as Noisy Channels: A Shannon Perspective on Model Capacity and Scaling Laws","primary_cat":"cs.LG","submitted_at":"2026-05-22T17:59:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The Shannon Scaling Law treats LLM training as noisy-channel transmission and predicts U-shaped performance degradation when signal-to-noise ratio falls below a threshold, outperforming monotonic scaling laws on Pythia and OLMo2 data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23872","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Training-Free Looped Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-22T17:31:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Training-free looped transformers retrofit recurrence to frozen models via damped ODE sub-steps on mid-stack blocks, yielding gains such as +2.64 pp on MMLU-Pro for Qwen3-4B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23857","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Strong Teacher Not Needed? On Distillation in LLM Pretraining","primary_cat":"cs.LG","submitted_at":"2026-05-22T17:16:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Even small or undertrained teachers improve larger LLM students via distillation with tuned loss mixing, while stronger teachers can saturate or reverse gains and distillation aids generalization more than in-domain fit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23660","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Using Large Language Models in Physics Education","primary_cat":"physics.ed-ph","submitted_at":"2026-05-22T14:11:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Frontier LLMs from late 2025 reach near-perfect scores on text-based physics problem solving and show improved human-grading alignment, yet still struggle to assign partial credit for flawed reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23454","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ARES: Automated Rubric Synthesis for Scalable LLM Reinforcement Learning","primary_cat":"cs.CL","submitted_at":"2026-05-22T10:09:28+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23315","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Convergence Without Understanding: When Language Models Agree on Representations but Disagree on Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-22T07:32:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Representational convergence across 16 LLMs on 800 reasoning problems is stronger for failed tasks and pre-decision stages but shows minimal causal influence on predictions, pointing to shared processing constraints over shared reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23200","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Adaptive Mass-Segmented KV Compression for Long-Context Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-22T03:32:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AMS KV compression adaptively partitions the cache by attention mass regions and assigns quotas to protect contiguous reasoning blocks during long-context LLM inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23170","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Positional Failures in Long-Context LLMs: A Blind Spot in Reasoning Benchmarks","primary_cat":"cs.CL","submitted_at":"2026-05-22T02:42:41+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Audits reveal no reasoning benchmark controls position/filler/length jointly; CRE shows LLMs drop up to 88pp on middle-position tasks at 64K context, with diagnostic probe supporting positional cause.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23163","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fast-dDrive: Efficient Block-Diffusion VLM for Autonomous Driving","primary_cat":"cs.CL","submitted_at":"2026-05-22T02:31:32+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23081","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ThriftAttention: Selective Mixed Precision for Long-Context FP4 Attention","primary_cat":"cs.LG","submitted_at":"2026-05-21T22:28:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ThriftAttention recovers 89.1% of the FP16 quality gap versus pure FP4 attention by running only 5% of query-key blocks in FP16 on long-context benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23078","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GEMQ: Global Expert-Level Mixed-Precision Quantization for MoE LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-21T22:23:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GEMQ applies global LP-based expert importance estimation and router fine-tuning within progressive quantization to cut memory and speed inference in MoE LLMs with little accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23074","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PathCal: State-Aware Reflection-Marker Calibration for Efficient Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-21T22:13:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PathCal calibrates reasoning paths by type-aware soft rebalancing of reflection-marker logits at uncertain states, yielding better efficiency-performance trade-offs on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23057","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ModeSwitch-LLM: A Lightweight Phase-Aware Controller for Cross-Mode LLM Inference on a Single GPU","primary_cat":"cs.LG","submitted_at":"2026-05-21T21:46:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A rule-based controller selects among FP16, quantized, speculative, and hybrid modes for single-GPU LLM inference, delivering 2.1x latency speedup and 51.7% lower energy per token with near-baseline accuracy on Llama-3.1-8B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23024","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Deterministic Horizon: Impossibility Results as Design Specifications for Trustworthy AI Systems","primary_cat":"cs.AI","submitted_at":"2026-05-21T20:48:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Converts impossibility theorems into architecture-dependent accuracy ceilings and design rules for transformers and other AI subfields, with the Deterministic Horizon measured at 19-31 across twelve models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"hypothesis is bounded state-action concentrability, a property of data coverage. The two characterisations are complementary: theirs locates the reductions between paradigms available in the offline-RL regime; ours locates the verifier sample-complexity gap in the Chapter 2. The Deterministic Horizon25 supervised-learning regime. Uesato et al. [35] and Lightman et al. [34] documented the supervised-learning advantage empirically without tight sample-complexity characteri- sation; Theorem 2.36 makes the if-and-only-if characterisation precise: the separation holds iff chain non-redundancy holds. Tool use and external computation.Schick et al. [36], Yao et al. [37], and Shinn et al. [118] established tool-augmented reasoning as a deployment pattern."},{"citing_arxiv_id":"2605.23023","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How to Steer Your Multi-Agent System: Human-LLM Collaborative Planning","primary_cat":"cs.MA","submitted_at":"2026-05-21T20:47:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Formalizes design space for human-LLM collaborative planning along mode, scope, and level axes; evaluates AMBIPOM prototype via user study and benchmark revealing hybrid workflows and trade-offs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22939","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learnability-Informed Fine-Tuning of Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-21T18:16:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LIFT is a learnability-informed SFT algorithm for diffusion LMs that aligns token difficulty with diffusion time steps, yielding up to 3x gains on AIME'24 and AIME'25 over standard SFT baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22905","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EVE-Agent: Evidence-Verifiable Self-Evolving Agents","primary_cat":"cs.AI","submitted_at":"2026-05-21T17:47:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EVE-Agent adds an evidence verifier to the proposer-solver loop that rewards spans by marginal accuracy gain, producing self-generated but inspectable training examples for search agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22731","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Post-Training is About States, Not Tokens: A State Distribution View of SFT, RL, and On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-21T17:03:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A state distribution view of post-training shows that on-policy supervision from the learner itself can outperform fixed-dataset SFT and preserve retention better than aggressive supervised updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22566","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GraphFlow: A Graph-Based Workflow Management for Efficient LLM-Agent Serving","primary_cat":"cs.LG","submitted_at":"2026-05-21T14:45:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GraphFlow uses a unified wGraph to dynamically instantiate workflows and manage KV caches for LLM agents, reporting 4.95 pp average gains and 4x memory reduction on five benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22263","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tailoring Teaching to Aptitude: Direction-Adaptive Self-Distillation for LLM Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-21T10:07:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DASD improves math reasoning in LLMs by adaptively directing self-distillation based on per-token entropy to balance exploration and step accuracy, outperforming prior self-distillation and RLVR baselines on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22205","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Skill Weaving: Efficient LLM Improvement via Modular Skillpacks","primary_cat":"cs.AI","submitted_at":"2026-05-21T09:12:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SkillWeave partitions LLM capabilities into compressible skillpacks to deliver strong multi-domain performance with a 9B model that outperforms larger monolithic LLMs and achieves up to 4x speedup on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22106","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ArborKV: Structure-Aware KV Cache Management for Scaling Tree-based LLM Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-21T07:40:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ArborKV uses search-structure awareness to evict low-reuse KV states in Tree-of-Thoughts inference, delivering up to 4x memory savings with near-full accuracy retention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22064","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hy-MT2: A Family of Fast, Efficient and Powerful Multilingual Translation Models in the Wild","primary_cat":"cs.CL","submitted_at":"2026-05-21T07:00:06+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21856","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Illusion of Reasoning: Exposing Evasive Data Contamination in LLMs via Zero-CoT Truncation","primary_cat":"cs.LG","submitted_at":"2026-05-21T01:06:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ZCP detects direct and evasive data contamination in LLMs by truncating CoT reasoning and contrasting zero-CoT accuracy on original versus perturbed isomorphic datasets, plus a Contamination Confidence metric.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21851","ref_index":5,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OPPO: Bayesian Value Recursion for Token-Level Credit Assignment in LLM Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-21T00:55:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OPPO derives token-level advantages for LLM RL via Bayesian recursion on oracle signals, recovering prior distillation methods as a special case and showing gains on math and code benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22883","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Energy per Successful Goal: Goal-Level Energy Accounting for Agentic AI Systems","primary_cat":"cs.AI","submitted_at":"2026-05-20T22:55:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Proposes EpG and OOI metrics showing agentic workflows use 4.33x more energy per successful goal than linear baselines due to orchestration structure.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21792","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Residual Skill Optimization for Text-to-SQL Ensembles","primary_cat":"cs.CL","submitted_at":"2026-05-20T22:36:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Residual skill optimization creates complementary Text-to-SQL agents by training each new skill on prior ensemble failures, yielding accuracy gains on Spider2-Lite and transfer to other dialects and tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21770","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Manifold-Guided Attention Steering","primary_cat":"cs.LG","submitted_at":"2026-05-20T22:06:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAGS learns low-dimensional subspaces from correct versus incorrect reasoning traces and applies targeted projection corrections to attention heads when they deviate from the correctness manifold during inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21748","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RankJudge: A Multi-Turn LLM-as-a-Judge Synthetic Benchmark Generator","primary_cat":"cs.CL","submitted_at":"2026-05-20T21:20:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RankJudge creates paired multi-turn conversations with isolated single-turn flaws to generate unambiguous benchmarks for LLM-as-a-judge systems across ML, biomedicine, and finance domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21699","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"X-Token: Projection-Guided Cross-Tokenizer Knowledge Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-20T19:59:31+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"X-Token proposes projection-guided P-KL and H-KL losses to fix uncommon-token suppression and over-conservative matching in logit-based cross-tokenizer distillation, yielding gains over GOLD on Llama-3.2-1B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21442","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"torchtune: PyTorch native post-training library","primary_cat":"cs.LG","submitted_at":"2026-05-20T17:32:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"torchtune is a modular PyTorch library for LLM post-training that delivers competitive performance and memory efficiency while supporting rapid research iteration through hackable components.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21427","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PALS: Power-Aware LLM Serving for Mixture-of-Experts Models","primary_cat":"cs.AI","submitted_at":"2026-05-20T17:19:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PALS adds dynamic GPU power capping to LLM serving frameworks like vLLM, jointly tuning it with batch size via offline models and feedback control to improve energy efficiency up to 26.3% and cut QoS violations 4-7x on dense and MoE models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21408","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TCARD: Nearly Balanced Two-Level Designs with Treatment Cardinality Constraints with an Application to LLM Prompt Engineering","primary_cat":"stat.ME","submitted_at":"2026-05-20T17:06:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes nearly balanced TCARDs that minimize the first two generalized word-length pattern components, defines Φ_BCD criterion linked to classical optimality, and constructs designs via coordinate exchange with simulation-calibrated weights for LLM prompt engineering.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21404","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Twelve LLM Agent Benchmark Papers Disclose About Themselves: A Pilot Audit and an Open Scoring Schema","primary_cat":"cs.LG","submitted_at":"2026-05-20T17:02:36+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Pilot audit of twelve LLM benchmark papers finds mean disclosure score of 0.38/1.0 for agent benchmarks versus 0.66 for classical ones, with zero papers disclosing inference costs or full harness specs, and releases an open JSON schema plus scoring CSV.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21318","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TextReg: Mitigating Prompt Distributional Overfitting via Regularized Text-Space Optimization","primary_cat":"cs.CL","submitted_at":"2026-05-20T15:47:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TextReg mitigates prompt distributional overfitting via regularized text-space optimization, reporting up to +16.5% OOD accuracy gains over prior methods on reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21177","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ChunkFT: Byte-Streamed Optimization for Memory-Efficient Full Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-20T13:44:44+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ChunkFT enables full-parameter fine-tuning of Llama 3-8B on one 24 GB GPU and Llama 3-70B on two 80 GB GPUs by streaming gradients over dynamically activated sub-tensors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21147","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SMoA: Spectrum Modulation Adapter for Parameter-Efficient Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-20T13:19:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SMoA is a new PEFT adapter that uses block-wise Hadamard-modulated low-rank branches on spectral partitions to cover more pretrained spectral directions than standard LoRA under a smaller parameter budget.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21125","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Advantage Collapse in Group Relative Policy Optimization: Diagnosis and Mitigation","primary_cat":"cs.LG","submitted_at":"2026-05-20T12:57:37+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20994","ref_index":75,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Context-Invariant Safety Alignment for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-20T10:33:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces AIR, an asymmetric regularization that anchors open-ended safety prompts to verifiable ones via stop-gradient, improving invariance and accuracy when combined with group preference optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20950","ref_index":66,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Focus-then-Context: Subject-Centric Progressive Visual Token Reduction for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-20T09:37:53+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPpruner reduces visual tokens in VLMs via focus identification followed by context-aware scanning, retaining 22.2% tokens for 2.53x speedup on Qwen2.5-VL with negligible accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20813","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PulseCol: Periodically Refreshed Column-Sparse Attention for Accelerating Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-20T07:06:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PulseCol introduces periodically refreshed column-sparse attention to achieve up to 1.95x speedup over FlashAttention in diffusion LLMs with maintained model quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20722","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AGPO: Adaptive Group Policy Optimization with Dual Statistical Feedback","primary_cat":"cs.LG","submitted_at":"2026-05-20T05:20:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AGPO adaptively sets trust-region size and exploration temperature from group reward dispersion, entropy, and KL drift, yielding higher scores than PPO and GRPO on nine math benchmarks under fixed token budget.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22875","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RMA: an Agentic System for Research-Level Mathematical Problems","primary_cat":"cs.AI","submitted_at":"2026-05-20T04:54:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RMA, a multi-agent system with structured memory and iterative feedback loops, solves 8 out of 10 research-level math problems on the new First Proof benchmark and outperforms GPT-5.2R and Aletheia according to expert evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20654","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"REFLECTOR: Internalizing Step-wise Reflection against Indirect Jailbreak","primary_cat":"cs.LG","submitted_at":"2026-05-20T03:16:15+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22870","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Readout Shortcut: Positional Number Copying Dominates Arithmetic CoT Readout in Small Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-20T00:32:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"In 1-3B instruction-tuned LMs on GSM8K, arithmetic CoT readout is dominated by positional copying of the trailing number before the answer delimiter, accounting for 54-92 percentage points of accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20548","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Do Agents Communicate? Characterizing Information Exchange in Multi-Agent Systems","primary_cat":"cs.MA","submitted_at":"2026-05-19T22:51:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Systematic study of inter-agent communication in LLM multi-agent systems shows reasoning and verification are critical for performance, with a new augmentation technique recovering 86.2% of failures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22869","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FuRA: Full-Rank Parameter-Efficient Fine-Tuning with Spectral Preconditioning","primary_cat":"cs.LG","submitted_at":"2026-05-19T22:11:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FuRA uses block tensor-train factorization with fixed pretrained SVD basis to achieve full-rank spectral preconditioning, outperforming Full FT by +1.37 on LLaMA-3-8B commonsense reasoning and surpassing QLoRA in quantized settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20520","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Open-World Evaluations for Measuring Frontier AI Capabilities","primary_cat":"cs.AI","submitted_at":"2026-05-19T21:42:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Open-world evaluations using qualitative review of real-world tasks can give earlier warnings of frontier AI capabilities than automated benchmarks, as demonstrated by an AI agent publishing a simple iOS app with one minor human fix.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20425","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AgentCo-op: Retrieval-Based Synthesis of Interoperable Multi-Agent Workflows","primary_cat":"cs.AI","submitted_at":"2026-05-19T19:22:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AgentCo-op retrieves and assembles existing agents and tools into interoperable workflows for open-world scientific tasks, showing effectiveness in genomics case studies and competitive benchmark results with lower costs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20402","ref_index":9,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decomposing MXFP4 quantization error for LLM reinforcement learning: reducible bias, recoverable deadzone, and an irreducible floor","primary_cat":"cs.LG","submitted_at":"2026-05-19T18:59:26+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20369","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DEL: Digit Entropy Loss for Numerical Learning of Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-19T18:18:59+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DEL is a new loss for LLM numerical learning that applies supervised digit entropy optimization and extends to floating-point numbers, showing improved accuracy and distance metrics over prior methods on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20104","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Draft Less, Retrieve More: Hybrid Tree Construction for Speculative Decoding","primary_cat":"cs.LG","submitted_at":"2026-05-19T16:55:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Graft combines pruning and retrieval in a sequential mechanism to build hybrid draft trees for speculative decoding, delivering up to 5.41× speedup and 21.8% better average speedup than EAGLE-3 on large models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20033","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Nash Equilibrium Framework For Training-Free Multimodal Step Verification","primary_cat":"cs.CV","submitted_at":"2026-05-19T15:54:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A Nash equilibrium framework for training-free multimodal step verification that uses cross-modal agreement and disagreement signals for filtering and ranking reasoning steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19537","ref_index":25,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Silent Hyperparameter: Quantifying the Impact of Inference Backends on LLM Reproducibility","primary_cat":"cs.LG","submitted_at":"2026-05-19T08:37:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical study shows LLM inference backends can shift benchmark scores by up to 16.6 percentage points and cause output disagreements due to optimizations like prefix caching and custom kernels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19523","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Investigating Cross-Modal Skill Injection: Scenarios, Methods, and Hyperparameters","primary_cat":"cs.CL","submitted_at":"2026-05-19T08:24:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Systematic evaluation finds cross-modal skill injection via model merging succeeds in instruction-following and cross-lingual scenarios but fails in mathematical reasoning, with TA and DARE methods outperforming others after hyperparameter analysis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19418","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Conflict-Resilient Multi-Agent Reasoning via Signed Graph Modeling","primary_cat":"cs.AI","submitted_at":"2026-05-19T06:11:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SIGMA builds a signed relational graph among LLM agents and uses conflict-aware message passing plus weighted aggregation to produce more consistent predictions than prior cooperative-assumption baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19416","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LambdaPO: A Lambda Style Policy Optimization for Reasoning Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-19T06:10:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LambdaPO introduces pairwise preference-based advantage estimation and a semantic density reward to extract more optimization signal from trajectory groups than GRPO's monolithic baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19282","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Muon Beyond Pretraining: Spectral Failures and High-Pass Remedies for VLA and RLVR","primary_cat":"cs.LG","submitted_at":"2026-05-19T03:00:26+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Pion modifies Muon's Newton-Schulz iterations into a controllable high-pass filter that anchors dominant singular values at 1 while suppressing noisy tails, outperforming Muon and AdamW in VLA and RLVR regimes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19228","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Diagnosing Multi-step Reasoning Failures in Black-box LLMs via Stepwise Confidence Attribution","primary_cat":"cs.CL","submitted_at":"2026-05-19T00:57:51+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19028","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning When to Adapt","primary_cat":"cs.LG","submitted_at":"2026-05-18T18:51:24+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DISeL augments standard LoRA with per-input gates over rank-one updates to reduce catastrophic forgetting during fine-tuning while adding few parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18753","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DashAttention: Differentiable and Adaptive Sparse Hierarchical Attention","primary_cat":"cs.CL","submitted_at":"2026-05-18T17:59:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DashAttention introduces differentiable adaptive sparse hierarchical attention via α-entmax block selection, achieving full-attention accuracy at 75% sparsity with improved Pareto performance over NSA and InfLLMv2.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18643","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Post-Trained MoE Can Skip Half Experts via Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-18T16:50:48+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18549","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Monitoring the Internal Monologue: Probe Trajectories Reveal Reasoning Dynamics","primary_cat":"cs.CL","submitted_at":"2026-05-18T15:29:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Probe trajectories across token positions in LRMs, combined with signal-processing features, improve prediction of future model outputs over static probes on safety and math tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":100,"offset":0}}