{"total":17,"items":[{"citing_arxiv_id":"2607.01740","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Meta-Benchmarks for Financial-Services LLM Evaluation","primary_cat":"cs.AI","submitted_at":"2026-07-02T05:52:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A meta-benchmarking framework organizes 452 LLM benchmarks into 41 O*NET Generalized Work Activities and 38 BIAN domains, using discrimination-coverage-recency weights to scale K-factors in an Elo tournament for comparable financial-services scores.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30989","ref_index":97,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Wait, am I Being Fair? Characterizing Deductive Stereotyping and Mitigating It with Fair-GCG","primary_cat":"cs.CL","submitted_at":"2026-06-30T00:00:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper characterizes deductive stereotyping in LLMs and introduces Fair-GCG to discover injection phrases that improve fairness across benchmarks, reasoning, and real-world tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29815","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SrDetection: A Self-Referential Framework for Data Leakage Detection in Code Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-29T05:48:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SrDetection detects data leakage in Code LLMs via contrast between original benchmark samples and their semantic variants, reporting F1 gains of 21.52 (gray-box) and 14.46 (black-box) over baselines in a controlled testbed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17683","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Bridging Functional Correctness and Runtime Efficiency Gaps in LLM-Based Code Translation","primary_cat":"cs.CL","submitted_at":"2026-06-16T08:49:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SwiftTrans improves both functional correctness and runtime efficiency of LLM code translations via multi-perspective exploration with hierarchical guidance and difference-aware selection with ordinal guidance on extended benchmarks including new SwiftBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11337","ref_index":132,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Can AI Agents Synthesize Scientific Conclusions?","primary_cat":"cs.AI","submitted_at":"2026-06-09T18:16:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new benchmark and clean-room harness show frontier AI agents reach only 0.337 factual F1 when synthesizing conclusions from scientific evidence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22368","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VeriScale: Adversarial Test-Suite Scaling for Verifiable Code Generation","primary_cat":"cs.LG","submitted_at":"2026-05-21T12:00:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VeriScale adversarially scales test suites for the Verina benchmark into VerinaPlus (83x larger) and VerinaLite (14x variant) that expose hidden LLM weaknesses on SpecGen and CodeGen tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21930","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PITMuS: A Tool for Automated Bug Dataset Generation via Source-Level Mutant Reconstruction","primary_cat":"cs.SE","submitted_at":"2026-05-21T02:59:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PITMuS automates source-level bug dataset generation by mapping PIT bytecode mutants back to Java source using debug information, producing structured pairs and metadata evaluated on eight open-source systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23965","ref_index":56,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LGMT: Logic-Grounded Metamorphic Testing for Evaluating the Reasoning Reliability of LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-12T18:26:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LGMT is a logic-grounded metamorphic testing framework that detects hidden reasoning defects in LLMs by checking consistency on semantically invariant inputs derived from FOL equivalences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06327","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Measuring Evaluation-Context Divergence in Open-Weight LLMs: A Paired-Prompt Protocol with Pilot Evidence of Alignment-Pipeline-Specific Heterogeneity","primary_cat":"cs.CL","submitted_at":"2026-05-07T14:23:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new paired-prompt protocol reveals alignment-pipeline-specific heterogeneity in how open-weight LLMs respond to evaluation versus deployment framings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22871","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AutoRISE: Agent-Driven Strategy Evolution for Red-Teaming Large Language Models","primary_cat":"cs.CR","submitted_at":"2026-04-23T19:37:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AutoRISE evolves red-teaming attack strategies as editable executable programs via an agent, yielding 17-point higher average attack success rates than baselines across 11 models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21255","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When Agents Look the Same: Quantifying Distillation-Induced Similarity in Tool-Use Behaviors","primary_cat":"cs.CL","submitted_at":"2026-04-23T03:48:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"New RPS and AGS metrics show within-family distilled LLM agents have 5.9 pp higher tool-use graph similarity than cross-family pairs, with some models exceeding their teachers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17771","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SPENCE: A Syntactic Probe for Detecting Contamination in NL2SQL Benchmarks","primary_cat":"cs.CL","submitted_at":"2026-04-20T03:50:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPENCE shows older NL2SQL benchmarks like Spider have high performance sensitivity to syntactic changes, indicating likely training contamination, while newer ones like BIRD show little sensitivity and appear largely clean.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15203","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MADE: A Living Benchmark for Multi-Label Text Classification with Uncertainty Quantification of Medical Device Adverse Events","primary_cat":"cs.CL","submitted_at":"2026-04-16T16:28:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MADE creates a contamination-resistant living benchmark for multi-label classification of medical device adverse events, with evaluations revealing model-specific trade-offs in accuracy and uncertainty quantification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.02996","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Large Reasoning Models Are (Not Yet) Multilingual Latent Reasoners","primary_cat":"cs.CL","submitted_at":"2026-01-06T13:20:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Large reasoning models exhibit multilingual latent reasoning that is uneven across languages but internally consistent and English-centered.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.15746","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLMs Judge Themselves: A Game-Theoretic Framework for Human-Aligned Evaluation","primary_cat":"cs.CL","submitted_at":"2025-10-17T15:34:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A mutual evaluation system for LLMs that uses game-theoretic aggregation of peer reviews and validates alignment with human voting on subjective outputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.06226","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GeoLaux: A Benchmark for Evaluating MLLMs' Geometry Performance on Long-Step Problems Requiring Auxiliary Lines","primary_cat":"cs.AI","submitted_at":"2025-08-08T11:11:37+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GeoLaux is a new benchmark of 2186 long-step geometry problems requiring auxiliary lines, used to evaluate 23 MLLMs and reveal major drops in performance on complex tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.07985","ref_index":72,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Omni-MATH: A Universal Olympiad Level Mathematic Benchmark For Large Language Models","primary_cat":"cs.CL","submitted_at":"2024-10-10T14:39:33+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Omni-MATH supplies 4428 human-verified Olympiad math problems that expose top LLMs achieving only 52.55% to 60.54% accuracy on the most difficult items.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}