{"total":12,"items":[{"citing_arxiv_id":"2606.09577","ref_index":20,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Code Is More Than Text: Uncertainty Estimation for Code Generation","primary_cat":"cs.CL","submitted_at":"2026-06-08T14:52:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Three code-specific uncertainty axes (lexical, algorithmic, functional) yield an ensemble that raises average AUROC from 0.696 to 0.776 across five code LLMs, with one single-pass signal matching multi-pass baselines at lower cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04262","ref_index":42,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Can I Take Another Dose? Evaluating LLM Decision-Making Under Temporal Uncertainty in OTC Dosing QA","primary_cat":"cs.CL","submitted_at":"2026-06-02T22:30:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces DOSEBENCH benchmark and shows four LLMs often fail at rolling 24-hour dose calculations and constraint adherence in OTC dosing decisions despite appearing confident.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09876","ref_index":6,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Calibrating Overconfidence Without Sacrificing Confidence: Probe-Conditioned Head Intervention for LLMs","primary_cat":"cs.LG","submitted_at":"2026-06-02T21:13:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PCHI uses a frozen probe to detect likely wrong-but-confident LLM responses and conditionally intervenes on attention heads during confidence generation, converting 82.2% of wrong high-confidence outputs to low while damaging only 5.1% of correct ones and lowering ECE from 21.9% to 9.2%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03969","ref_index":17,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Quantifying Faithful Confidence Expression in Large Reasoning Models","primary_cat":"cs.CL","submitted_at":"2026-06-02T17:53:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A new framework quantifies faithful confidence expression in large reasoning models by comparing linguistic decisiveness to token probabilities, hidden states, and response consistency, revealing it as a persistent challenge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00467","ref_index":11,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"On the Limits of LLM Adaptability: Impact of Model-Internalized Priors on Annotation Task Performance","primary_cat":"cs.CL","submitted_at":"2026-05-30T01:21:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs correct only 34.8% of zero-shot annotation errors via prompting, and Definition-Specific Familiarity correlates positively with performance (partial r = +0.41) while memorization metrics do not.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20490","ref_index":31,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ECUAS$_n$: A family of metrics for principled evaluation of uncertainty-augmented systems","primary_cat":"cs.AI","submitted_at":"2026-05-19T20:55:41+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01133","ref_index":37,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"When Embedding-Based Defenses Fail: Rethinking Safety in LLM-Based Multi-Agent Systems","primary_cat":"cs.CR","submitted_at":"2026-05-01T22:15:11+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24076","ref_index":4,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"An Information-Geometric Framework for Stability Analysis of Large Language Models under Entropic Stress","primary_cat":"cs.AI","submitted_at":"2026-04-27T06:00:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A thermodynamic-inspired information-geometric framework defines a composite LLM stability score that outperforms a utility-entropy baseline by 0.0299 on average across 80 observations, with gains increasing at higher entropy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17304","ref_index":53,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Efficient Test-Time Scaling via Temporal Reasoning Aggregation","primary_cat":"cs.AI","submitted_at":"2026-04-19T07:39:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TRACE aggregates answer consistency and confidence trajectory over multiple reasoning steps to decide when to halt inference, reducing token usage by 25-30% while keeping accuracy within 1-2% of full reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19781","ref_index":13,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Do Small Language Models Know When They're Wrong? Confidence-Based Cascade Scoring for Educational Assessment","primary_cat":"cs.CY","submitted_at":"2026-03-29T20:28:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Verbalized confidence from small LMs enables cost-effective cascade routing for automated educational scoring, matching large-model accuracy at 76% lower cost when discrimination is strong.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.00439","ref_index":14,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Improving the Distributional Alignment of LLMs using Supervision","primary_cat":"cs.CL","submitted_at":"2025-07-01T05:46:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Simple supervision improves LLM distributional alignment with diverse population groups on three datasets, with evaluation across multiple models and prompts providing a benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.14427","ref_index":20,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Token-Level Density-Based Uncertainty Quantification Methods for Eliciting Truthfulness of Large Language Models","primary_cat":"cs.CL","submitted_at":"2025-02-20T10:25:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Adapts multi-layer token-level Mahalanobis distance with supervised linear regression to yield improved uncertainty scores for LLM truthfulness tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}