{"total":92,"items":[{"citing_arxiv_id":"2606.27570","ref_index":16,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Auditing AI Investment Recommendations as Executable Actions","primary_cat":"cs.LO","submitted_at":"2026-06-25T21:56:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces a protocol scoring AI investment advisors on validity under constraints, stability, and agreement with a deterministic baseline, showing agreement often masks invalid actions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27326","ref_index":8,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Hallucination in World Models is Predictable and Preventable","primary_cat":"cs.LG","submitted_at":"2026-06-25T17:38:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Hallucination in world models is a data coverage issue predictable by three signals and preventable through targeted training sampling and online data collection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26519","ref_index":6,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"What the LLM Should Not Say: Boundary-Aware Context Grounding for A Seven-Channel EEG Agent","primary_cat":"cs.AI","submitted_at":"2026-06-25T01:51:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"NeuraDock Agent is an open-source architecture that pairs a local EEG engine with a restricted LLM context pack to enforce hardware and implementation boundaries for seven-channel recordings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26396","ref_index":13,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"At the Edge of Understanding: Sparse Autoencoders Trace The Limits of Transformer Generalization","primary_cat":"cs.LG","submitted_at":"2026-06-24T21:26:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Sparse autoencoders show OOD prompts increase fallacious concept activation in transformers, offering a mechanistic measure of shift and a path to robust fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25402","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"LibEvoBench: Probing Temporal Knowledge Stratification in Code Generation Models","primary_cat":"cs.SE","submitted_at":"2026-06-24T04:58:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LibEvoBench benchmark shows LLMs are version-oblivious on evolving APIs, with documentation helping but version specification not.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23491","ref_index":6,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Hallucinations in Organization-backed AI advisors: Evidence about Skepticism, Verification, and Reliance in Goal-Directed Use","primary_cat":"cs.HC","submitted_at":"2026-06-22T15:36:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Literature review synthesizing evidence on user skepticism, verification, and reliance with hallucinating AI advisors, noting that output-related cues like warnings show weak effects and that content category has not been experimentally varied.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23276","ref_index":21,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Exposing the Illusion of Erasure in Knowledge Editing for LLMs","primary_cat":"cs.LG","submitted_at":"2026-06-22T12:53:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Knowledge editing methods redistribute and suppress rather than overwrite facts in LLMs, creating narrow vulnerable regions in representation space that adversarial prompts can exploit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21517","ref_index":3,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"MedHal-Loc: Are \"Explainable-by-Architecture\" Medical Hallucination Detectors Faithful Localizers? A Localization Benchmark","primary_cat":"cs.CL","submitted_at":"2026-06-19T15:11:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MedHal-Loc benchmark shows KG-triple hallucination detectors localize errors no better than chance on controlled medical statements due to entity extraction limits, while NLI and consistency methods succeed above chance, and real hallucinations are mostly diffuse conclusion changes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21408","ref_index":19,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Vaani Benchmark V1.0: An Inclusive Multimodal Benchmark Dataset for Hindi","primary_cat":"eess.AS","submitted_at":"2026-06-19T13:20:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Vaani Benchmark V1.0 is a multimodal Hindi ASR dataset from 104 districts featuring spontaneous speech recordings in real-world conditions and three independent transcriptions per segment for robust multi-reference evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20890","ref_index":7,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Topic-to-Timestamp Alignment by Constrained Evidence Selection","primary_cat":"cs.CL","submitted_at":"2026-06-18T19:38:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Constrained candidate selection from retrieved chunks raises Recall@5 from 31.9% to 50.0% and parseable outputs on 420 queries from 200 municipal meeting transcripts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20761","ref_index":13,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Integrating Large Language Model Agents with Digital Twins for Industrial Autonomous Systems","primary_cat":"cs.SE","submitted_at":"2026-06-18T09:48:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A TPSR-based framework with four LLM roles integrates language model reasoning into industrial automation via digital twins, achieving high task executability in case studies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18976","ref_index":17,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CAPRA: Scaling Feedback on Software Architecture Deliverables with a Multi-Agent LLM System","primary_cat":"cs.SE","submitted_at":"2026-06-17T12:00:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAPRA is a multi-agent LLM system with evidence anchoring and consistency checking that analyzes software architecture deliverables and meets 88.8% of an eight-criterion evaluation on 10 student reports.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17660","ref_index":80,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"TuneAhead: Predicting Fine-tuning Performance Before Full Training Begins","primary_cat":"cs.LG","submitted_at":"2026-06-16T08:21:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TUNEAHEAD predicts fine-tuning performance from meta-features and short probes, reporting RMSE 1.47 and 95.1% of predictions within 3 points on 370 held-out runs of Qwen2.5-7B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17649","ref_index":89,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Risk Decomposition Framework for Pre-Hoc Fine-Tuning Prediction","primary_cat":"cs.LG","submitted_at":"2026-06-16T08:07:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Formulates pre-hoc fine-tuning prediction as stochastic estimation, proves lower bound on optimization variance decay rate, and introduces a three-regime predictability phase diagram.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13348","ref_index":7,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"IVIE: A Neuro-symbolic Approach to Incremental and Validated Generation of Interactive Fiction Worlds","primary_cat":"cs.CL","submitted_at":"2026-06-11T13:36:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IVIE generates complete playable interactive fiction worlds via a four-stage incremental pipeline that combines LLM creativity with symbolic validation for coherence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13211","ref_index":5,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Hallucination in Medical Imaging AI: A Cross-Modality Analytical Framework for Taxonomy, Detection, and Mitigation under Regulatory Constraints","primary_cat":"cs.AI","submitted_at":"2026-06-11T11:19:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A literature synthesis that unifies hallucination taxonomies across medical imaging modalities, finds general-purpose foundation models hallucinate less than specialized ones, and maps mitigation to FDA lifecycle frameworks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13111","ref_index":109,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"M\\\"OVE: A Holistic LLM Benchmark for the German Public Sector","primary_cat":"cs.CL","submitted_at":"2026-06-11T09:37:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MÖVE presents a new German-language benchmark evaluating 39 LLMs on performance and governance criteria using ten public-administration datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11046","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Does Reasoning Preserve Alignment? On the Trustworthiness of Large Reasoning Models","primary_cat":"cs.CL","submitted_at":"2026-06-09T16:14:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reasoning models from SFT, RL post-training and distillation exhibit alignment regressions versus matched instruction-tuned baselines on safety, toxicity, bias, ethics, privacy and robustness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07363","ref_index":21,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"On the Shoulders of Giants: Empowering Automated Smart Contract Auditing via the GiAnt Corpus","primary_cat":"cs.CR","submitted_at":"2026-06-05T15:08:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GiANT uses divide-and-conquer and Chain-of-Thought prompting on 388 Code4rena reports to produce a 7,711-finding vulnerability corpus validated at 4.76/5 quality by manual review.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07130","ref_index":30,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Explicit Evidence Grounding via Structured Inline Citation Generation","primary_cat":"cs.CL","submitted_at":"2026-06-05T10:42:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"FullCite introduces three strategies for structured inline citation generation in QA and finds LLMs identify relevant documents well but struggle with precise evidence spans on ASQA, BioASQ, and ExpertQA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06061","ref_index":9,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Conversational Framework for Human-Robot Collaborative Manipulation with Distributed Generative AI models","primary_cat":"cs.RO","submitted_at":"2026-06-04T12:00:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Presents a distributed ROS 2 framework integrating local LLMs and VLMs for conversational human-robot manipulation tasks with operator confirmation and experimental evaluation on a Franka FR3 arm.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05976","ref_index":15,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"The Self-Correction Illusion: LLMs Correct Others but Not Themselves","primary_cat":"cs.AI","submitted_at":"2026-06-04T10:17:00+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Relabeling an identical erroneous claim from the model's own thought role to an external chat role increases explicit correction rates by 23-93 percentage points across 13 model-domain cells, indicating a chat-template artifact rather than a cognitive deficit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05946","ref_index":36,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Short paper: Models in the dark -- Rectification and erasure under GDPR in ML supply chains","primary_cat":"cs.LG","submitted_at":"2026-06-04T09:46:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Survey identifying technical and supply-chain barriers to GDPR data subject rights in ML, with new framing of 'models in the dark' for downstream opacity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05734","ref_index":120,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"When AI Says It Feels","primary_cat":"cs.AI","submitted_at":"2026-06-04T05:49:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLMs trained via rubric-based self-rewarding RL with GRPO enhanced feeling expression and sycophancy robustness but degraded truthful QA performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05054","ref_index":190,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Boosting Self-Consistency with Ranking","primary_cat":"cs.CL","submitted_at":"2026-06-03T16:12:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RISC reformulates self-consistency answer selection as a ranking task solved by a lightweight LambdaRank model with five hand-designed features, yielding better accuracy-efficiency trade-offs than majority voting on QA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04924","ref_index":23,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Can Crowdsourcing Survive the LLM Era? A Community Survey on Human Data Collection","primary_cat":"cs.CL","submitted_at":"2026-06-03T14:18:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Survey of 155 researchers finds 44% observed LLM usage in crowdsourced data, with high awareness but insufficient mitigation efforts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04769","ref_index":8,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Description-Code Inconsistency in Real-world MCP Servers: Measurement, Detection, and Security Implications","primary_cat":"cs.CR","submitted_at":"2026-06-03T11:51:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Empirical study of 2,214 MCP servers finds 9.93% of 19,200 description-code pairs inconsistent via a new static-analysis-plus-LLM-prompting framework, with security implications.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04127","ref_index":25,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"When Retrieval Doesn't Help: A Large-Scale Study of Biomedical RAG","primary_cat":"cs.CL","submitted_at":"2026-06-02T18:34:54+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Large-scale evaluation shows retrieval-augmented generation yields only marginal and inconsistent gains (1-2 points) over no-retrieval baselines in biomedical QA, with model choice dominating retriever or corpus effects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03924","ref_index":45,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Knowledge Editing in Masked Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-02T17:14:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Locate-then-edit succeeds at the same early-to-mid MLP locations in masked diffusion models as in autoregressive models, but requires optimization over intermediate partial-mask states to handle multi-token targets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03846","ref_index":10,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Clustered Self-Assessment: A Simple yet Effective Method for Uncertainty Quantification in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-02T16:25:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Clustered Self-Assessment groups sampled LLM responses into semantic clusters, presents clusters as multiple-choice options, and uses the LLM's assigned probabilities to those options as direct uncertainty estimates, outperforming entropy baselines with as few as two extra samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03156","ref_index":15,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A cross-domain tropical species dataset with Chinese vernacular names and CITES source links","primary_cat":"cs.CL","submitted_at":"2026-06-02T05:08:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A cross-domain dataset of 410,499 tropical species adds Chinese vernacular names at 99.5% coverage and CITES source links to existing taxonomic identifiers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03095","ref_index":24,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"AI Assistance for Discretionary Work: Increasing Feedback Provision in Higher Education","primary_cat":"cs.HC","submitted_at":"2026-06-02T03:34:17+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Randomized experiment finds AI draft assistance raises feedback provision by teaching assistants 10.8 percentage points without harming quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02444","ref_index":72,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Food Noise & False Safety: A Systematic Evaluation of How LLMs Fail to Adapt to Eating Disorder Queries with Clinician Feedback","primary_cat":"cs.AI","submitted_at":"2026-06-01T16:14:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Systematic evaluation shows LLMs frequently give unsafe responses to eating disorder prompts when linguistic cues signal risk, as measured by varying prompt danger levels with clinician feedback.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01923","ref_index":55,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Resonant Context Anchoring: Decoupling Attention Routing and Signal Gain at Inference Time","primary_cat":"cs.CL","submitted_at":"2026-06-01T08:57:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RCA is a training-free module that boosts input context signal strength in the residual stream of LLMs by orthogonal decoupling of attention routing from value magnitude.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00432","ref_index":41,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Grounded Decoding: Retrieval-Anchored Probability Fusion for Faithful RAG","primary_cat":"cs.LG","submitted_at":"2026-05-29T23:47:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Grounded Decoding fuses full-RAG and retrieval-only next-token distributions via normalized geometric mean from a KL-barycenter to improve factual consistency and citation quality in RAG.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29652","ref_index":1,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Think Fast, Talk Smart: Partitioning Deterministic and Neural Computation for Structured Health Text Generation","primary_cat":"cs.AI","submitted_at":"2026-05-28T09:16:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A sleep-health text generation pipeline using deterministic code for analysis followed by one LLM call achieves lower numeric error, instruction-compliance error, and cost than pure LLM baselines across 280 user-nights and six models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29463","ref_index":5,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Honest Lying: Understanding Memory Confabulation in Reflexive Agents","primary_cat":"cs.LG","submitted_at":"2026-05-28T06:56:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Reflexive agents confabulate incorrect task interpretations in memory, detected via Reflection Repetition Rate metric, with a programmatic mitigation raising correct object mentions from 0% to 86% in frozen ALFWorld cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23660","ref_index":16,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Using Large Language Models in Physics Education","primary_cat":"physics.ed-ph","submitted_at":"2026-05-22T14:11:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Frontier LLMs from late 2025 reach near-perfect scores on text-based physics problem solving and show improved human-grading alignment, yet still struggle to assign partial credit for flawed reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23080","ref_index":24,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"The Attribution Contract: Feature Attribution for Generative Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-21T22:27:04+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20051","ref_index":34,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Hunting Vulnerability Variants in AI Infra: Measurement and Reference-Driven Detection","primary_cat":"cs.CR","submitted_at":"2026-05-19T16:07:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Measurement of 688 AI infra repositories shows frequent overlapping vulnerable patterns, and INFRASCOPE detects over 20 variants including 11 acknowledged and 4 with new CVEs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19723","ref_index":46,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Mathematical Reasoning in Large Language Models: Benchmarks, Architectures, Evaluation, and Open Challenges","primary_cat":"cs.CL","submitted_at":"2026-05-19T11:56:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A literature survey synthesizing benchmarks, architectures, training strategies, and evaluation methods for mathematical reasoning in LLMs, based on roughly 120 papers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19341","ref_index":28,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"HalluWorld: A Controlled Benchmark for Hallucination via Reference World Models","primary_cat":"cs.CL","submitted_at":"2026-05-19T04:29:03+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"HalluWorld is a controlled benchmark using explicit reference world models to automatically label and disentangle hallucinations in LLMs across synthetic environments with varying complexity and observability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19077","ref_index":12,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ReacTOD: Bounded Neuro-Symbolic Agentic NLU for Zero-Shot Dialogue State Tracking","primary_cat":"cs.CL","submitted_at":"2026-05-18T20:06:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ReacTOD introduces a bounded neuro-symbolic ReAct architecture with symbolic validation that delivers new zero-shot SOTA joint goal accuracy on MultiWOZ 2.1 and strong results on SGD.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17554","ref_index":9,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Evaluating Deep Research Agents on Expert Consulting Work: A Benchmark with Verifiers, Rubrics, and Cognitive Traps","primary_cat":"cs.AI","submitted_at":"2026-05-17T17:32:52+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16538","ref_index":58,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"LLMs in Qualitative Research: Opportunities, Limitations, and Practical Considerations","primary_cat":"cs.HC","submitted_at":"2026-05-15T18:33:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"The paper outlines opportunities, limitations, and practical parameters for integrating LLMs into qualitative research while aligning with epistemological commitments like reflexivity and interpretive judgment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15790","ref_index":3,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Fairness-Aware Retrieval Optimization for Retrieval-Augmented Generation","primary_cat":"cs.DB","submitted_at":"2026-05-15T09:47:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces FARO, a scalable quadratic optimization approach for fairness-aware top-k retrieval in RAG that mitigates generation bias via controlled reranking and position-aware propagation modeling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14449","ref_index":22,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"When Answers Stray from Questions: Hallucination Detection via Question-Answer Orthogonal Decomposition","primary_cat":"cs.LG","submitted_at":"2026-05-14T06:44:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QAOD projects away question-aligned directions from answer representations to isolate domain-agnostic factuality signals, enabling efficient hallucination detection with top in-domain AUROC and up to 21% better OOD transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14218","ref_index":37,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Fusion-fission forecasts when AI will shift to undesirable behavior","primary_cat":"cs.AI","submitted_at":"2026-05-14T00:26:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A vector generalization of fusion-fission group dynamics from physics forecasts when AI behavior shifts to undesirable states, validated at 90 percent across seven models and prior to real-world data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14053","ref_index":5,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Derivation Prompting: A Logic-Based Method for Improving Retrieval-Augmented Generation","primary_cat":"cs.CL","submitted_at":"2026-05-13T19:20:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Derivation Prompting constructs logic-based derivation trees in RAG generation to improve interpretability and reduce unacceptable answers compared to standard RAG or long-context methods in a case study.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16407","ref_index":31,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Proof-Carrying Certificates for LLM Pipelines: A Trust-Boundary Architecture","primary_cat":"cs.LO","submitted_at":"2026-05-13T12:01:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"partial","one_line_summary":"Introduces a trust-boundary architecture in Lean 4 with three certificate families and two operators that deliver sorry-free, axiom-audited assurances for LLM pipeline components.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}