{"total":14,"items":[{"citing_arxiv_id":"2605.13695","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RTLC -- Research, Teach-to-Learn, Critique: A three-stage prompting paradigm inspired by the Feynman Learning Technique that lifts LLM-as-judge accuracy on JudgeBench with no fine-tuning","primary_cat":"cs.CL","submitted_at":"2026-05-13T15:48:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RTLC prompting lifts Claude 3.7 Sonnet pairwise accuracy on 350 hard JudgeBench items from 64.6% to 78.6% via a Research-Teach-Critique scaffold that beats self-consistency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10805","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reasoning Is Not Free: Robust Adaptive Cost-Efficient Routing for LLM-as-a-Judge","primary_cat":"cs.AI","submitted_at":"2026-05-11T16:30:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RACER routes between reasoning and non-reasoning LLM judges via constrained distributionally robust optimization to achieve better accuracy-cost trade-offs under distribution shift.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07699","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DRIP-R: A Benchmark for Decision-Making and Reasoning Under Real-World Policy Ambiguity in the Retail Domain","primary_cat":"cs.CL","submitted_at":"2026-05-08T13:10:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DRIP-R is a new benchmark showing that frontier LLMs systematically disagree on how to resolve identical ambiguous retail policy scenarios, highlighting ambiguity as a core challenge for agent decision-making.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07461","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Think-with-Rubrics: From External Evaluator to Internal Reasoning Guidance","primary_cat":"cs.CL","submitted_at":"2026-05-08T09:08:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Think-with-Rubrics has LLMs generate rubrics internally before responding, outperforming external rubric-as-reward baselines by 3.87 points on average across benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27727","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLM-as-a-Judge for Human-AI Co-Creation: A Reliability-Aware Evaluation Framework for Coding","primary_cat":"cs.SE","submitted_at":"2026-04-30T11:20:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM judges for human-AI coding co-creation show moderate performance (ROC-AUC 0.59) and low agreement, with co-creation success concentrating early in interactions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26235","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LATTICE: Evaluating Decision Support Utility of Crypto Agents","primary_cat":"cs.CR","submitted_at":"2026-04-29T02:32:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LATTICE is a scalable LLM-judge benchmark for crypto agent decision support that reveals performance trade-offs among real-world copilots across dimensions and tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26020","ref_index":68,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Training Computer Use Agents to Assess the Usability of Graphical User Interfaces","primary_cat":"cs.CL","submitted_at":"2026-04-28T18:04:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"uxCUA is a trained computer use agent that assesses GUI usability more accurately than larger models by learning to prioritize and execute important user interactions on labeled interface datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24700","ref_index":61,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Green Shielding: A User-Centric Approach Towards Trustworthy AI","primary_cat":"cs.CL","submitted_at":"2026-04-27T17:04:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Green Shielding introduces CUE criteria and the HCM-Dx benchmark to demonstrate that routine prompt variations systematically alter LLM diagnostic behavior along clinically relevant dimensions, producing Pareto-like tradeoffs in plausibility versus coverage.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23178","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Judging the Judges: A Systematic Evaluation of Bias Mitigation Strategies in LLM-as-a-Judge Pipelines","primary_cat":"cs.AI","submitted_at":"2026-04-25T07:18:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Style bias dominates LLM-as-a-Judge systems far more than position bias, with debiasing strategies providing model-dependent gains and public tools released for replication.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22597","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rethinking Math Reasoning Evaluation: A Robust LLM-as-a-Judge Framework Beyond Symbolic Rigidity","primary_cat":"cs.AI","submitted_at":"2026-04-24T14:25:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"An LLM-as-a-judge evaluation framework for math reasoning outperforms symbolic methods by accurately assessing diverse answer representations and formats.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16790","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Bias in the Loop: Auditing LLM-as-a-Judge for Software Engineering","primary_cat":"cs.SE","submitted_at":"2026-04-18T02:35:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM judges for code tasks show high sensitivity to prompt biases that systematically favor certain options, changing accuracy and model rankings even when code is unchanged.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12312","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CompliBench: Benchmarking LLM Judges for Compliance Violation Detection in Dialogue Systems","primary_cat":"cs.CL","submitted_at":"2026-04-14T05:42:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CompliBench uses simulation and adversarial flaw injection to create labeled dialogue data showing that top proprietary LLMs perform poorly at spotting guideline violations while fine-tuned smaller models outperform them and generalize to new domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03742","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Structured Multi-Criteria Evaluation of Large Language Models with Fuzzy Analytic Hierarchy Process and DualJudge","primary_cat":"cs.AI","submitted_at":"2026-04-04T14:07:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fuzzy AHP and DualJudge deliver more stable and calibrated LLM evaluations than direct scoring by breaking assessments into explicit criteria and adaptively fusing intuitive and deliberative judgments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.05579","ref_index":220,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods","primary_cat":"cs.CL","submitted_at":"2024-12-07T08:07:24+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A survey that organizes LLMs-as-judges research into functionality, methodology, applications, meta-evaluation, and limitations.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[28], AttrScore [276], PHUDGE [47], ECT [229], SELF-J [266], SorryBench [249], TIGERScore [99],FENCE [252], ARES [188] Preference-basedLearning Meta-Rewarding [245], Con-J [270], JudgeLM [301], INSTRUCTSCORE [258], AUTO-J [130], Shepherd [232],X-EVAL [142], Themis [88], CritiqueLLM [106], FedEval-LLM [84], PandaLM [236], Self-Taught [231],FLAMe [226], Self-Rationalization [220], CompassJudger-1 [20], Zhou et al. [294], HALU-J [227],PROMETHEUS [109], PROMETHEUS 2 [110], PROMETHEUS-VISION [122], LLaVA-Critic [253] Post-processing(§4.1.3) ProbabilityCalibration Daynauth et al. [45], ProbDiff [247], PoE [150], CRISPR [264] Text ReprocessingSottana et al. [206], AUTO-J [130], Yan et al. [262], Tessler et al. [214], REVISEVAL [281]Ren et al."}],"limit":50,"offset":0}