{"total":25,"items":[{"citing_arxiv_id":"2605.12384","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Scalable Token-Level Hallucination Detection in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-12T16:47:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TokenHD uses a scalable data synthesis engine and importance-weighted training to create token-level hallucination detectors that work on free-form text and scale from 0.6B to 8B parameters, outperforming larger reasoning models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11163","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Benchmarking LLM-Based Static Analysis for Secure Smart Contract Development: Reliability, Limitations, and Potential Hybrid Solutions","primary_cat":"cs.CR","submitted_at":"2026-05-11T19:10:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLMs for smart contract security analysis show lexical bias from identifier names causing high false positives, with prompting creating precision-recall trade-offs, positioning them as complements rather than replacements for static analysis tools.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05134","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Low-Cost Black-Box Detection of LLM Hallucinations via Dynamical System Prediction","primary_cat":"cs.LG","submitted_at":"2026-05-06T17:07:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A single-pass black-box method models LLM outputs as dynamical systems via Koopman operators to detect hallucinations with claimed state-of-the-art accuracy and lower cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04845","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Agentic Repository Mining: A Multi-Task Evaluation","primary_cat":"cs.SE","submitted_at":"2026-05-06T12:43:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM agents dynamically exploring repositories via bash commands achieve competitive accuracy to context-provided LLMs across four classification tasks, with superior robustness to artifact size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01047","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LLM Ghostbusters: Surgical Hallucination Suppression via Adaptive Unlearning","primary_cat":"cs.CR","submitted_at":"2026-05-01T19:20:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Adaptive Unlearning suppresses package hallucinations in code-generating LLMs by 81% while preserving benchmark performance, using model-generated data and no human 
labels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01011","ref_index":30,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CLEAR: Revealing How Noise and Ambiguity Degrade Reliability in LLMs for Medicine","primary_cat":"cs.CL","submitted_at":"2026-05-01T18:23:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CLEAR reveals that LLMs' accuracy on medical questions drops and their 'humility deficit' grows as the number of plausible answers increases and abstention options shift from assertive to uncertain phrasing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00468","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ReLay: Personalized LLM-Generated Plain-Language Summaries for Better Understanding, but at What Cost?","primary_cat":"cs.CL","submitted_at":"2026-05-01T07:11:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Personalized LLM-generated plain language summaries improve lay readers' comprehension and quality ratings but increase risks of reinforcing biases and introducing hallucinations compared to static expert summaries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27906","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Unstructured Recall to Schema-Grounded Memory: Reliable AI Memory via Iterative, Schema-Aware Extraction","primary_cat":"cs.AI","submitted_at":"2026-04-30T14:14:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Schema-aware iterative extraction turns AI memory into a verified system of record, reaching 90-97% accuracy on extraction and end-to-end memory benchmarks where retrieval baselines score 80-87%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26145","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Ceci n'est pas une explication: Evaluating Explanation Failures as Explainability Pitfalls in Language Learning Systems","primary_cat":"cs.HC","submitted_at":"2026-04-28T22:05:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"AI explanations in language learning often fail across six dimensions like diagnostic accuracy and self-regulation support, creating hidden risks that demand better evaluation frameworks such as L2-Bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25855","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SIEVES: Selective Prediction Generalizes through Visual Evidence Scoring","primary_cat":"cs.CV","submitted_at":"2026-04-28T16:57:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SIEVES improves selective prediction coverage up to 3x on OOD VQA benchmarks by training a selector on visual localization quality, generalizing across datasets and proprietary reasoners without specific 
adaptation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23505","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Uncertainty Propagation in LLM-Based Systems","primary_cat":"cs.SE","submitted_at":"2026-04-26T02:48:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"This paper introduces a systems-level conceptual framing and a three-level taxonomy (intra-model, system-level, socio-technical) for uncertainty propagation in compound LLM applications, along with engineering insights and open challenges.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23333","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Process Supervision of Confidence Margin for Calibrated LLM Reasoning","primary_cat":"cs.LG","submitted_at":"2026-04-25T14:40:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RLCM trains LLMs with a margin-enhanced process reward that widens the gap between correct and incorrect reasoning steps, improving calibration on math, code, logic, and science tasks without hurting accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21018","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Adaptive Test-Time Compute Allocation with Evolving In-Context Demonstrations","primary_cat":"cs.AI","submitted_at":"2026-04-22T19:07:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An adaptive test-time framework uses a warm-up phase on the test set to build evolving in-context examples, then concentrates compute on unresolved queries to outperform static baselines on math, coding, and reasoning tasks with lower total inference cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17284","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HalluClear: Diagnosing, Evaluating and Mitigating Hallucinations in GUI Agents","primary_cat":"cs.AI","submitted_at":"2026-04-19T06:55:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HalluClear supplies a taxonomy, calibrated evaluation, and lightweight post-training mitigation that reduces hallucinations in GUI agents using only 9K samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16672","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Subsumption to Satisfiability: LLM-Assisted Active Learning for OWL Ontologies","primary_cat":"cs.AI","submitted_at":"2026-04-17T20:05:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM-assisted active learning reformulates OWL subsumption checks as satisfiability queries, queries models for counter-concept examples, and ensures errors are only Type II delays rather than 
inconsistencies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15460","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The Crutch or the Ceiling? How Different Generations of LLMs Shape EFL Student Writings","primary_cat":"cs.HC","submitted_at":"2026-04-16T18:19:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Advanced LLMs improve EFL writing scores and diversity for lower-proficiency students but correlate with lower expert ratings on deep coherence, acting more as crutches than scaffolds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14829","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond Literal Summarization: Redefining Hallucination for Medical SOAP Note Evaluation","primary_cat":"cs.AI","submitted_at":"2026-04-16T10:04:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Redefining hallucination evaluation for medical SOAP notes to credit clinical reasoning reduces reported hallucination rates from 35% to 9%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13201","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"InfiniteScienceGym: An Unbounded, Procedurally-Generated Benchmark for Scientific Analysis","primary_cat":"cs.CL","submitted_at":"2026-04-14T18:23:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"InfiniteScienceGym procedurally generates unbounded scientific repositories with exact ground-truth QA pairs to benchmark LLMs on data reasoning, abstention, and tool use without static datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12632","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Calibration-Aware Policy Optimization for Reasoning LLMs","primary_cat":"cs.LG","submitted_at":"2026-04-14T12:03:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAPO improves LLM calibration by up to 15% while matching or exceeding GRPO accuracy through logistic AUC loss and noise masking, enabling better abstention and scaling performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12543","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Two-Stage LLM Framework for Accessible and Verified XAI Explanations","primary_cat":"cs.AI","submitted_at":"2026-04-14T10:15:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A two-stage LLM explainer-verifier framework with iterative refeed improves faithfulness and accessibility of XAI explanations, as shown in experiments across five techniques and three LLM families, with EPR analysis indicating progressive 
stabilization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02915","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When Should a Language Model Trust Itself? Same-Model Self-Verification as a Conditional Confidence Signal","primary_cat":"cs.CL","submitted_at":"2026-04-08T20:15:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Self-verification acts as a conditional confidence signal for language models rather than a reliable general-purpose uncertainty estimator.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03216","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BAS: A Decision-Theoretic Approach to Evaluating Large Language Model Confidence","primary_cat":"cs.CL","submitted_at":"2026-04-03T17:44:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BAS aggregates utility from an answer-or-abstain model across risk thresholds and is uniquely maximized by truthful confidence estimates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03045","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"STEAR: Layer-Aware Spatiotemporal Evidence Intervention for Hallucination Mitigation in Video Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-03T13:52:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"STEAR reduces spatial and temporal hallucinations in Video-LLMs via layer-aware evidence intervention from middle decoder layers in a single-encode pass.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02784","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EnsemHalDet: Robust VLM Hallucination Detection via Ensemble of Internal State Detectors","primary_cat":"cs.CV","submitted_at":"2026-04-03T06:48:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"EnsemHalDet improves hallucination detection in VLMs by ensembling independent detectors on diverse internal states, yielding higher AUC than single-detector baselines on VQA datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.27098","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Ensemble-Based Uncertainty Estimation for Code Correctness Estimation","primary_cat":"cs.SE","submitted_at":"2026-03-28T02:37:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Ensemble Semantic Entropy improves correlation with code correctness over single-model methods and powers a cascading scaling system that cuts FLOPs by 64.9% while preserving performance on LiveCodeBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}