{"total":13,"items":[{"citing_arxiv_id":"2605.23629","ref_index":15,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DDX-TRACE: A Benchmark for Medical Diagnostic Trajectories in VLMs","primary_cat":"cs.CV","submitted_at":"2026-05-22T13:41:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DDX-TRACE is a physician-adjudicated benchmark for evaluating VLMs on evidence-supported diagnostic trajectories rather than final answers alone in multimodal neuroradiology.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21949","ref_index":29,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Claim-Selective Certification for High-Risk Medical Retrieval-Augmented Generation","primary_cat":"cs.CL","submitted_at":"2026-05-21T03:29:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Claim-selective certification decomposes medical RAG responses into verifiable claims scored against retrieved evidence and mapped via an intent-aware selector to actions, reporting zero UCCR and action accuracy of 0.92 on dev and 0.90 on test.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21630","ref_index":17,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MindLoom: Composing Thought Modes for Frontier-Level Reasoning Data Synthesis","primary_cat":"cs.AI","submitted_at":"2026-05-20T18:40:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MindLoom synthesizes frontier-level reasoning data by decomposing solutions into thought mode chains, training a retrieval model for mode selection, composing new problems with distribution-aligned sampling, and applying rollout-based difficulty labeling for fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20176","ref_index":41,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"ClinSeekAgent: Automating Multimodal Evidence Seeking for Agentic Clinical Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-19T17:58:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ClinSeekAgent automates active multimodal evidence seeking for clinical reasoning, improving LLM performance on raw EHR and CXR tasks while enabling distillation into smaller models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16215","ref_index":28,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Fully Open Meditron: An Auditable Pipeline for Clinical LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-15T17:29:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Presents the first fully open pipeline for clinical LLMs by unifying eight public QA datasets with three clinician-vetted synthetic extensions and applying it to five base models to achieve benchmark gains while maintaining auditability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13936","ref_index":29,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Towards the Next Frontier of LLMs, Training on Private Data: A Cross-Domain Benchmark for Federated Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-13T16:20:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Federated PEFT on LLMs across healthcare and finance datasets performs close to centralized training and beats isolated local training under non-IID conditions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13542","ref_index":13,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"RealICU: Do LLM Agents Understand Long-Context ICU Data? A Benchmark Beyond Behavior Imitation","primary_cat":"cs.AI","submitted_at":"2026-05-13T13:52:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"RealICU is a new benchmark using physician hindsight labels on MIMIC-IV ICU data that exposes LLM failures in long-horizon clinical assessment, acute problem detection, action recommendation, and red-flag identification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18827","ref_index":20,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Code-Guided Reasoning for Small Language Models: Evaluating Executable MCQA Scaffolds","primary_cat":"cs.IR","submitted_at":"2026-05-12T20:20:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Code-Guided Reasoning protocol reports a 28 percentage-point macro accuracy gain for small language models on MCQA when using generated executable Python scaffolds versus direct answering on 20k+ items.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10064","ref_index":10,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MAGE: Multi-Agent Self-Evolution with Co-Evolutionary Knowledge Graphs","primary_cat":"cs.AI","submitted_at":"2026-05-11T06:39:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MAGE uses a four-subgraph co-evolutionary knowledge graph plus dual bandits to externalize and retrieve experience for stable self-evolution of frozen language-model agents, showing gains on nine diverse benchmarks.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"We evaluate MAGEon nine benchmarks drawn from six task families.GSM8K[ 7] andReal- Math[ 31] test mathematical reasoning.HotpotQA[ 26] andWebQA[ 3] test multi-hop and open-domain factoid QA.STBench[ 12] tests spatio-temporal analysis across twenty-seven task types.FinQA[ 6] tests table-grounded numerical reasoning over financial reports, andMedQA- USMLE[ 10] tests medical multiple-choice reasoning.Crafter[ 9] is an open-world sequential survival game, andWebShop[ 27] is a web-navigation task with parameterised actions. FinQA and MedQA-USMLE are used only as standardized reasoning benchmarks; MAGEis not evaluated or proposed as a financial-advice or clinical decision-support system. Evaluation protocol and leakage control."},{"citing_arxiv_id":"2605.02240","ref_index":15,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"PhysicianBench: Evaluating LLM Agents in Real-World EHR Environments","primary_cat":"cs.AI","submitted_at":"2026-05-04T05:32:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"PhysicianBench is a new benchmark of 100 physician-reviewed, execution-grounded tasks in live EHR environments where the best LLM agent reaches only 46% success and open-source models reach 19%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24700","ref_index":18,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Green Shielding: A User-Centric Approach Towards Trustworthy AI","primary_cat":"cs.CL","submitted_at":"2026-04-27T17:04:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Green Shielding introduces CUE criteria and the HCM-Dx benchmark to demonstrate that routine prompt variations systematically alter LLM diagnostic behavior along clinically relevant dimensions, producing Pareto-like tradeoffs in plausibility versus coverage.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23801","ref_index":4,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Domain Fine-Tuning vs. Retrieval-Augmented Generation for Medical Multiple-Choice Question Answering: A Controlled Comparison at the 4B-Parameter Scale","primary_cat":"cs.CL","submitted_at":"2026-04-26T16:49:39+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Domain fine-tuning of a 4B LLM yields a statistically significant 6.8 pp accuracy gain on MedQA-USMLE over a general baseline, while RAG over medical explanations produces no significant improvement.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02368","ref_index":7,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Xpertbench: Expert Level Tasks with Rubrics-Based Evaluation","primary_cat":"cs.AI","submitted_at":"2026-03-27T11:28:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"XpertBench provides 1,346 rubric-scored expert tasks showing leading LLMs achieve a maximum ~66% success rate and ~55% mean score across domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}