{"total":17,"items":[{"citing_arxiv_id":"2605.11680","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ShapeCodeBench: A Renewable Benchmark for Perception-to-Program Reconstruction of Synthetic Shape Scenes","primary_cat":"cs.CV","submitted_at":"2026-05-12T07:39:00+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ShapeCodeBench introduces a renewable benchmark for perception-to-program reconstruction of synthetic shapes, with evaluations showing low exact-match performance from current models and heuristics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09012","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Re$^2$Math: Benchmarking Theorem Retrieval in Research-Level Mathematics","primary_cat":"cs.AI","submitted_at":"2026-05-09T15:52:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Re²Math is a new benchmark that evaluates AI models on retrieving and verifying the applicability of theorems from math literature to advance steps in partial proofs, accepting any sufficient theorem while controlling for leakage.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08498","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MathConstraint: Automated Generation of Verified Combinatorial Reasoning Instances for LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-08T21:28:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"MathConstraint generates scalable, automatically verifiable combinatorial problems where LLMs achieve 18.5-66.9% accuracy without tools but roughly double that with solver access.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07731","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Benchmarking EngGPT2-16B-A3B against Comparable Italian and International Open-source LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-08T13:36:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"EngGPT2MoE-16B-A3B matches or beats other Italian models on most international benchmarks but trails top international models such as GPT-5 nano and Qwen3-8B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05250","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Decision-aware User Simulation Agent for Evaluating Conversational Recommender Systems","primary_cat":"cs.IR","submitted_at":"2026-05-05T15:11:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Hesitator is a theory-grounded simulator that separates utility-based item selection from overload-aware commitment decisions to reduce unrealistic high acceptance rates in conversational recommender evaluations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02028","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Counting as a minimal probe of language model reliability","primary_cat":"cs.CL","submitted_at":"2026-05-03T19:27:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Language models have limited stable counting capacity well below context limits and rely on a finite set of count-like internal states, collapsing to guessing once exhausted.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21598","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"You Don't Need Public Tests to Generate Correct Code","primary_cat":"cs.SE","submitted_at":"2026-04-23T12:21:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DryRUN lets LLMs create their own test inputs and run internal simulations for self-correcting code generation, matching the performance of test-dependent methods like CodeSIM on LiveCodeBench without public tests or external signals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10834","ref_index":55,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLMs for Qualitative Data Analysis Fail on Security-specificComments in Human Experiments","primary_cat":"cs.SE","submitted_at":"2026-04-12T22:01:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs improve with detailed code descriptions but remain insufficient to replace human annotators for security-specific qualitative coding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10015","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FinTrace: Holistic Trajectory-Level Evaluation of LLM Tool Calling for Long-Horizon Financial Tasks","primary_cat":"cs.AI","submitted_at":"2026-04-11T03:58:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FinTrace supplies trajectory-level metrics for LLM financial tool calling, exposing gaps in information use and output quality, while its preference dataset enables DPO training that boosts intermediate metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09251","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DRBENCHER: Can Your Agent Identify the Entity, Retrieve Its Properties and Do the Math?","primary_cat":"cs.AI","submitted_at":"2026-04-10T12:07:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DRBENCHER generates multi-hop questions across biochemistry, finance, geophysics, security, and history that test interleaved browsing and computation, where the strongest models reach only 20% accuracy and human validation finds 76% validity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06755","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Babbling Suppression: Making LLMs Greener One Token at a Time","primary_cat":"cs.SE","submitted_at":"2026-04-08T07:21:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Babbling Suppression stops LLM code generation upon test passage to reduce token output and energy consumption by up to 65% across Python and Java benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04854","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Assessing Large Language Models for Stabilizing Numerical Expressions in Scientific Software","primary_cat":"cs.SE","submitted_at":"2026-04-06T16:57:23+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLMs match or exceed state-of-the-art traditional methods for stabilizing numerical expressions in scientific software, succeeding on 97.9% of expressions where baselines fail to improve accuracy, but struggle with control flow and high-precision literals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.26692","ref_index":108,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Kimi Linear: An Expressive, Efficient Attention Architecture","primary_cat":"cs.CL","submitted_at":"2025-10-30T16:59:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Kimi Linear hybridizes linear attention with a new KDA module to beat full attention on tasks while slashing KV cache by 75% and speeding decoding up to 6x.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.16941","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SWE-Bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?","primary_cat":"cs.SE","submitted_at":"2025-09-21T06:28:17+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SWE-Bench Pro is a new benchmark with 1,865 long-horizon tasks from 41 repositories designed to evaluate AI agents on realistic enterprise-level software engineering problems beyond prior benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.09388","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Qwen3 Technical Report","primary_cat":"cs.CL","submitted_at":"2025-05-14T13:41:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Pith review generated a malformed one-line summary.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.05236","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Unified Reward Model for Multimodal Understanding and Generation","primary_cat":"cs.CV","submitted_at":"2025-03-07T08:36:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UnifiedReward is the first unified reward model that jointly assesses multimodal understanding and generation to provide better preference signals for aligning vision models via DPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.01456","ref_index":128,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Process Reinforcement through Implicit Rewards","primary_cat":"cs.LG","submitted_at":"2025-02-03T15:43:48+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PRIME enables online process reward model updates in LLM RL using implicit rewards from rollouts and outcome labels, yielding 15.1% average gains on reasoning benchmarks and surpassing a stronger instruct model with 10% of the data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}