{"total":34,"items":[{"citing_arxiv_id":"2607.02007","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EduArt: An educational-level benchmark for evaluating art history knowledge in large language models","primary_cat":"cs.CL","submitted_at":"2026-07-02T10:43:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EduArt is a new benchmark of 871 educational questions that reveals multimodal LLMs perform near ceiling on multiple-choice art history items but drop sharply on open completion and error identification tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31432","ref_index":3,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Clinically Structured Rank-Gated LoRA for Cross-Benchmark Medical Question Answering","primary_cat":"cs.CL","submitted_at":"2026-06-30T09:59:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BiRG-LoRA achieves 69.31% macro-average accuracy across CMB, CMExam, MedQA, and MedMCQA, outperforming MoELoRA by 0.89 points with 28.1% fewer trainable parameters under a matched Qwen3-8B protocol.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29375","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TriageRA-CCF: Source-Side Clinical Confidence and Coverage Signals for Adaptive Rank Budgeting in Medical LLMs","primary_cat":"cs.CL","submitted_at":"2026-06-28T12:52:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TriageRA-CCF combines source-side confidence, coverage, and counterfactual signals to supervise an adaptive LoRA rank router, reporting modest average accuracy gains over LoRA/DoRA/MoELoRA baselines on two 8B models under matched training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21023","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Demystifying Numerical Instability in LLM Inference: Achieving Reproducible Inference for Mission-Critical Tasks with HEAL","primary_cat":"cs.LG","submitted_at":"2026-06-19T01:21:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HEAL restores FP32-level output reproducibility in 16-bit LLM inference using targeted INT16 quantization and algebraic compensation, cutting overhead by up to 7.1x versus full FP32 on the new MCR-Bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19396","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"BioHarness: Substrate-Aware Evidence Assembly for Biomedical Question Answering across Literature, Knowledge Bases, and Biological Atlases","primary_cat":"q-bio.QM","submitted_at":"2026-06-17T06:25:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BioHarness improves pooled biomedical QA score from 65.9 to 71.0 on 19,302 items by using staged, substrate-aware evidence assembly that escalates only when needed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10385","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Absolute Imitation: Anchored Residual Guidance for Privileged On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-09T03:51:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AR-OPD disentangles privileged supervision via anchored residual guidance to reduce hindsight leakage in on-policy distillation, reporting gains of 2.3 points over full privileged OPD and 7.9 over SFT on reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07853","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond English benchmarks: clinical llm evaluation in Brazilian Portuguese","primary_cat":"cs.CL","submitted_at":"2026-06-05T21:29:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Creates the first bilingual clinical benchmark from Brazilian cases and reports that English performance advantage exists only in diagnosis retrieval, disappearing in the other three tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03305","ref_index":14,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Reliability Gap in Benchmark Auditing: Distribution Shift and Scale as Failure Modes of Contamination Detection","primary_cat":"cs.AI","submitted_at":"2026-06-02T08:21:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical evaluation across 25 LLMs shows contamination detection methods achieve correct outcomes in only 201 of 335 cases, exposing failure modes from distribution shift and benchmark scale.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02245","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When Knowledge Is Not Free: Cost-Aware Evidence Selection in Retrieval-Augmented Generation","primary_cat":"cs.CL","submitted_at":"2026-06-01T13:39:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Defines cost-aware RAG with evidence cost tiers and shows static selectors are brittle while agentic LLM-based selection is promising but model-dependent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00123","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CardioLens: Revealing the Clinical Reality Gap of MLLMs via Multi-Sequence Cardiac MRI Evaluations","primary_cat":"cs.CV","submitted_at":"2026-05-28T11:03:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CardioLens is a leakage-resistant CMR testbed of 473k slices and 13k QA pairs showing current MLLMs exhibit a large clinical reality gap with category-collapse failures on real workflows.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27860","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"C-MIG: Multi-view Information Gain-based Retrieval-Augmented Generation for Clinical Diagnosis Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-27T02:20:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"C-MIG uses multi-view information gain from retrieved documents and refinements to supervise RAG-RL for clinical diagnosis, claiming top performance on four medical benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28332","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When Medical Safety Alignment Fails: A Benchmark for Evaluating LLMs on High-Risk Medical Queries","primary_cat":"cs.CY","submitted_at":"2026-05-26T14:39:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MedHarm benchmark shows aligned LLMs and guardrails can still produce unsafe responses on high-risk medical queries, indicating medical safety requires domain-specific testing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19028","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning When to Adapt","primary_cat":"cs.LG","submitted_at":"2026-05-18T18:51:24+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DISeL augments standard LoRA with per-input gates over rank-one updates to reduce catastrophic forgetting during fine-tuning while adding few parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18930","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OEP: Poisoning Self-Evolving LLM Agents via Locally Correct but Non-Transferable Experiences","primary_cat":"cs.CR","submitted_at":"2026-05-18T14:08:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OEP poisons self-evolving LLM agents by constructing clean edge-case experiences that appear locally valid yet cause harmful over-generalization during reflection, achieving over 50% attack success rate on GPT-4o agents across three domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16679","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CHI-Bench: Can AI Agents Automate End-to-End, Long-Horizon, Policy-Rich Healthcare Workflows?","primary_cat":"cs.CL","submitted_at":"2026-05-15T22:34:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CHI-Bench shows current AI agents achieve at most 28% success on long-horizon healthcare workflows that require dense policy adherence, multi-role handoffs, and multi-turn interactions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15000","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Quantifying and Mitigating Premature Closure in Frontier LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-14T16:02:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Frontier LLMs exhibit premature closure by selecting answers at high rates on medical tasks where the correct choice was removed and on open-ended queries, with safety prompting reducing but not eliminating the behavior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12882","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CiteVQA: Benchmarking Evidence Attribution for Trustworthy Document Intelligence","primary_cat":"cs.CL","submitted_at":"2026-05-13T01:54:42+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"CiteVQA requires models to cite specific document regions with bounding boxes alongside answers and finds that even the strongest MLLMs frequently cite the wrong region, with top SAA scores of only 76.0 for closed models and 22.5 for open-source ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11533","ref_index":9,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Checkup2Action: A Multimodal Clinical Check-up Report Dataset for Patient-Oriented Action Card Generation","primary_cat":"cs.CL","submitted_at":"2026-05-12T04:58:23+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Checkup2Action is a new multimodal dataset and benchmark for generating safe, prioritized action cards from real-world clinical check-up reports using large language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10025","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Medical Incident Causal Factors and Preventive Measures Generation Using Tag-based Example Selection in Few-shot Learning","primary_cat":"cs.CL","submitted_at":"2026-05-11T05:49:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Tag-based few-shot selection yields higher precision and stability than random or similarity-based methods when using LLMs to analyze medical incidents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07096","ref_index":16,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Query-efficient model evaluation using cached responses","primary_cat":"cs.LG","submitted_at":"2026-05-08T01:24:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DKPS-based methods predict new model benchmark scores using cached responses, matching baseline mean absolute error with substantially fewer queries and an offline query selection approach.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07058","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MedExAgent: Training LLM Agents to Ask, Examine, and Diagnose in Noisy Clinical Environments","primary_cat":"cs.CL","submitted_at":"2026-05-08T00:12:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MedExAgent models clinical diagnosis as a POMDP with patient and exam noise, then uses supervised fine-tuning followed by DAPO optimization to train an agent that matches larger models on diagnostic accuracy while controlling exam costs.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"diagnostic care remains uneven across populations and geographies [ 27, 1, 14]. These pressures have motivated longstanding interest in computational support for clinical reasoning [38], and recent advances in large language models (LLMs) have brought this goal within closer reach. Medical LLMs now show strong performance on knowledge-intensive benchmarks such as MedQA [ 17] and MedMCQA [29], and a growing line of medical specialists [34, 22, 36, 7] are trained for these tasks; more recent diagnostic evaluations move closer to clinical use by asking models to reason over complete clinical cases [45, 9, 13]. Across this landscape, the model is handed a complete case description and asked to reason over it; however, in clinical practice, the case must be constructed"},{"citing_arxiv_id":"2605.04180","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MedFabric and EtHER: A Data-Centric Framework for Word-Level Fabrication Generation and Detection in Medical LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-05T18:19:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MedFabric dataset and EtHER detector achieve over 15% better word-level fabrication detection in medical LLMs than prior methods by generating stylistically faithful errors and using decomposition-based checking.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01048","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Compared to What? Baselines and Metrics for Counterfactual Prompting","primary_cat":"cs.CL","submitted_at":"2026-05-01T19:23:33+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Counterfactual prompting effects on LLMs are often indistinguishable from those caused by meaning-preserving paraphrases, causing most previously reported demographic sensitivities to disappear under proper statistical comparison.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"on established tools and methods to handle these issues in the LLM setting. Several approaches have been proposed for causal inference under multiple versions of treatment (VanderWeele & Hern 'an, 2013): (1) Refine the intervention until it is sufficiently well- specified that treatment variation irrelevance holds; (2) Systematically vary the realization and show that the effect is stable across versions; or, (3) Establish conditions under which a meaningful average effect across versions can still be identified. We pursue a version of (3). Specifically, we aim to directly quantify the contribution of the linguistic carrier to the observed effect by determining if the targeted variable's effect exceeds what benign textual modification alone would produce."},{"citing_arxiv_id":"2604.26048","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"BioGraphletQA: Knowledge-Anchored Generation of Complex QA Datasets","primary_cat":"cs.CL","submitted_at":"2026-04-28T18:33:21+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A graphlet-anchored framework generates 119,856 factually grounded biomedical QA pairs that improve accuracy on PubMedQA and MedQA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17691","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SafeAnchor: Preventing Cumulative Safety Erosion in Continual Domain Adaptation of Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-04-20T01:13:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SafeAnchor preserves 93.2% of original safety alignment across sequential domain adaptations by anchoring low-rank safety subspaces and constraining orthogonal updates, while matching unconstrained fine-tuning performance within 1.5 points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16826","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Crowded in B-Space: Calibrating Shared Directions for LoRA Merging","primary_cat":"cs.CL","submitted_at":"2026-04-18T04:33:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Pico reduces LoRA merge interference by calibrating over-shared directions in the B matrix before merging, yielding 3.4-8.3 point accuracy gains and sometimes beating joint training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07274","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Systematic Study of Retrieval Pipeline Design for Retrieval-Augmented Medical Question Answering","primary_cat":"cs.CL","submitted_at":"2026-04-08T16:37:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Dense retrieval plus query reformulation and reranking reaches 60.49% accuracy on MedQA USMLE, outperforming other setups while domain-specialized models make better use of the retrieved evidence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06154","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Exclusive Unlearning","primary_cat":"cs.CL","submitted_at":"2026-04-07T17:54:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Exclusive Unlearning makes LLMs safe by forgetting all but retained domain knowledge, protecting against jailbreaks while preserving useful responses in areas like medicine and math.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06262","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Exposure to Internalization: Dual-Stream Calibration for In-context Clinical Reasoning","primary_cat":"q-bio.QM","submitted_at":"2026-04-07T01:59:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Dual-Stream Calibration uses entropy minimization and iterative meta-learning at test time to internalize clinical evidence and outperform standard in-context learning baselines on medical tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.08804","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MOSAIC: Multi-agent Orchestration for Task-Intelligent Scientific Coding","primary_cat":"cs.CL","submitted_at":"2025-10-09T20:35:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MOSAIC is a training-free multi-agent LLM framework with rationale, coding, reflection, and debugging agents plus a consolidated context window that outperforms prior methods on scientific coding benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.25346","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SynthPert: Enhancing LLM Biological Reasoning via Synthetic Reasoning Traces for Cellular Perturbation Prediction","primary_cat":"cs.AI","submitted_at":"2025-09-29T18:02:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SynthPert fine-tunes LLMs using synthetic reasoning traces to reach state-of-the-art on the PerturbQA benchmark for cellular perturbation prediction, surpassing the generating frontier model while generalizing to unseen cell types with only 2% of filtered data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.12778","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HeteroRAG: A Heterogeneous Retrieval-Augmented Generation Framework for Medical Vision Language Tasks","primary_cat":"cs.CL","submitted_at":"2025-08-18T09:54:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HeteroRAG integrates modality-specific retrieval from medical reports and multi-corpus text sources with preference tuning to improve factual accuracy in Med-LVLMs across 11 datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.16155","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PRIMETIME : Limits of LLMs in Temporal Primitives","primary_cat":"cs.NE","submitted_at":"2025-04-22T17:52:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PRIMETIME generator reveals that LLM datetime parsing and arithmetic primitives are individually unreliable but fully learnable via fine-tuning, enabling frontier-level accuracy on event planning with small LoRA models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2405.02079","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Argumentative Large Language Models for Explainable and Contestable Claim Verification","primary_cat":"cs.CL","submitted_at":"2024-05-03T13:12:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ArgLLMs build argumentation frameworks from LLMs to support explainable and contestable formal reasoning for claim verification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}