{"total":20,"items":[{"citing_arxiv_id":"2605.12421","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Formalize, Don't Optimize: The Heuristic Trap in LLM-Generated Combinatorial Solvers","primary_cat":"cs.AI","submitted_at":"2026-05-12T17:15:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLM-generated combinatorial solvers achieve highest correctness when the model formalizes problems for verified backends rather than attempting to optimize search, which often causes regressions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09678","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Absurd World: A Simple Yet Powerful Method to Absurdify the Real-world for Probing LLM Reasoning Capabilities","primary_cat":"cs.AI","submitted_at":"2026-05-10T17:55:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Absurd World automatically converts real-world problems into absurd yet logically coherent scenarios to test whether LLMs can reason without depending on familiar patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08498","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MathConstraint: Automated Generation of Verified Combinatorial Reasoning Instances for LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-08T21:28:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"MathConstraint generates scalable, automatically verifiable combinatorial problems where LLMs achieve 18.5-66.9% accuracy without tools but roughly double that with solver access.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07776","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Tracing Uncertainty in Language Model \"Reasoning\"","primary_cat":"cs.LG","submitted_at":"2026-05-08T14:16:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Uncertainty trace profiles from LM reasoning traces predict correct final answers with AUROC up to 0.807 and enable early error detection using only initial tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08221","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"NoisyCoconut: Counterfactual Consensus via Latent Space Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-06T13:58:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Injecting noise into LLM latent trajectories creates diverse reasoning paths whose agreement acts as a confidence signal for selective abstention, cutting error rates from 40-70% to under 15% on math tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03441","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Exposing LLM Safety Gaps Through Mathematical Encoding:New Attacks and Systematic Analysis","primary_cat":"cs.CR","submitted_at":"2026-05-05T07:25:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Harmful prompts reformulated as coherent mathematical problems bypass LLM safety mechanisms at 46-56% rates, with success depending on deep reformulation rather than mere notation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02442","ref_index":140,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Measuring AI Reasoning: A Guide for Researchers","primary_cat":"cs.AI","submitted_at":"2026-05-04T10:42:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Reasoning in language models should be measured by the faithfulness and validity of their multi-step search processes and intermediate traces, not final-answer accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02363","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When Correct Isn't Usable: Improving Structured Output Reliability in Small Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-04T09:07:44+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AloLab, an iterative meta-agent prompt optimizer, raises structured output accuracy for 7-9B models from 0% to 84-87% on GSM8K while preserving near-native inference speed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25444","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"One Refiner to Unlock Them All: Inference-Time Reasoning Elicitation via Reinforcement Query Refinement","primary_cat":"cs.CL","submitted_at":"2026-04-28T09:52:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ReQueR trains a single RL-based query refiner with an adaptive curriculum to decompose raw queries into structured logic, delivering 1.7-7.2% absolute gains on reasoning tasks across diverse LLMs and generalizing to unseen models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22597","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rethinking Math Reasoning Evaluation: A Robust LLM-as-a-Judge Framework Beyond Symbolic Rigidity","primary_cat":"cs.AI","submitted_at":"2026-04-24T14:25:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"An LLM-as-a-judge evaluation framework for math reasoning outperforms symbolic methods by accurately assessing diverse answer representations and formats.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21027","ref_index":90,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HypEHR: Hyperbolic Modeling of Electronic Health Records for Efficient Question Answering","primary_cat":"cs.AI","submitted_at":"2026-04-22T19:18:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HypEHR is a hyperbolic embedding model for EHR data that uses Lorentzian geometry and hierarchy-aware pretraining to answer clinical questions nearly as well as large language models but with much smaller size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17842","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"QuickScope: Certifying Hard Questions in Dynamic LLM Benchmarks","primary_cat":"cs.CL","submitted_at":"2026-04-20T05:51:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"QuickScope uses modified COUP Bayesian optimization to find truly difficult questions in dynamic LLM benchmarks more sample-efficiently than baselines while cutting false positives.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16646","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Agentic Frameworks for Reasoning Tasks: An Empirical Study","primary_cat":"cs.AI","submitted_at":"2026-04-17T19:02:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An empirical evaluation of 22 agentic frameworks on BBH, GSM8K, and ARC benchmarks shows stable performance in 12 frameworks but highlights orchestration failures and weaker mathematical reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22819","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A pragmatic approach to regulating AI agents","primary_cat":"cs.CY","submitted_at":"2026-04-16T13:04:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AI agents require distinct regulation as AI systems under the EU AI Act with orchestration-layer oversight and a risk-based traffic light authorization system in contract law to preserve human accountability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11535","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Problem Reductions at Scale: Agentic Integration of Computationally Hard Problems","primary_cat":"cs.AI","submitted_at":"2026-04-13T14:32:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A harness for AI agents enabled construction of a Rust library with 100+ problem types and 200+ reduction rules for NP-hard problems in three months.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07593","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Too long; didn't solve","primary_cat":"cs.AI","submitted_at":"2026-04-08T20:51:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Longer prompts and solutions in a new expert-authored math dataset correlate with higher failure rates across LLMs, with length linked to empirical difficulty after difficulty adjustment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02863","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EMS: Multi-Agent Voting via Efficient Majority-then-Stopping","primary_cat":"cs.AI","submitted_at":"2026-04-03T08:29:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"EMS reduces the average number of agents invoked for majority voting by 32% via reliability-aware prioritization and early stopping on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09679","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Heterogeneous Consensus-Progressive Reasoning for Efficient Multi-Agent Debate","primary_cat":"cs.MA","submitted_at":"2026-04-03T06:58:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HCP-MAD reduces token costs in multi-agent debates by using heterogeneous consensus verification, adaptive pair-agent stopping, and escalated collective voting based on task complexity signals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.29025","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Model Says Walk: How Surface Heuristics Override Implicit Constraints in LLM Reasoning","primary_cat":"cs.CL","submitted_at":"2026-03-30T21:36:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs prioritize surface heuristics such as distance cues over implicit constraints in reasoning tasks, with the new HOB benchmark showing no model exceeds 75% strict accuracy and hints recovering performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16392","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RoMathExam: A Longitudinal Dataset of Romanian Math Exams (1895-2025) with a Seven-Decade Core (1957-2025)","primary_cat":"cs.CY","submitted_at":"2026-03-28T13:29:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RoMathExam supplies a century-long collection of Romanian math exams together with a new intrinsic complexity metric that correlates across frontier models at r > 0.72.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}