{"total":26,"items":[{"citing_arxiv_id":"2605.13595","ref_index":33,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Inducing Artificial Uncertainty in Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-13T14:30:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Inducing artificial uncertainty on trivial tasks allows training probes that achieve higher calibration on hard data than standard approaches while retaining performance on easy data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12918","ref_index":51,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CommonWhy: A Dataset for Evaluating Entity-Based Causal Commonsense Reasoning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-13T02:47:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CommonWhy is a new dataset of 15,000 why-questions for evaluating LLMs on entity-based causal commonsense reasoning grounded in Wikidata.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12370","ref_index":32,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Context Convergence Improves Answering Inferential Questions","primary_cat":"cs.CL","submitted_at":"2026-05-12T16:39:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Passages made from high-convergence sentences improve LLM performance on inferential questions compared to cosine similarity selection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08070","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"VecCISC: Improving Confidence-Informed Self-Consistency with Reasoning Trace Clustering and Candidate Answer Selection","primary_cat":"cs.AI","submitted_at":"2026-05-08T17:54:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VecCISC filters equivalent, degenerate, or hallucinated reasoning traces via semantic clustering before critic evaluation, reducing token use by 47% with no loss in accuracy versus standard CISC.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06856","ref_index":206,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Benchmarked Yet Not Measured -- Generative AI Should be Evaluated Against Real-World Utility","primary_cat":"cs.LG","submitted_at":"2026-05-07T18:56:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Generative AI evaluation must shift from static benchmark scores to measuring sustained improvements in human capabilities within specific deployment contexts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25578","ref_index":8,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Marco-MoE: Open Multilingual Mixture-of-Expert Language Models with Efficient Upcycling","primary_cat":"cs.CL","submitted_at":"2026-04-28T12:45:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Marco-MoE delivers open multilingual MoE models with 5% activation sparsity that outperform similarly sized dense models on English and multilingual benchmarks through efficient upcycling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19921","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Commonsense Knowledge with Negation: A Resource to Enhance Negation Understanding","primary_cat":"cs.CL","submitted_at":"2026-04-21T19:00:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Augmenting commonsense knowledge corpora with negation produces over 2M new triples that benefit LLM negation understanding when used for pre-training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19089","ref_index":152,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Towards Scalable Lifelong Knowledge Editing with Selective Knowledge Suppression","primary_cat":"cs.AI","submitted_at":"2026-04-21T05:02:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LightEdit enables scalable lifelong knowledge editing in LLMs via selective knowledge retrieval and probability suppression during decoding, outperforming prior methods on ZSRE, Counterfact, and RIPE while reducing training costs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19048","ref_index":43,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SAMoRA: Semantic-Aware Mixture of LoRA Experts for Task-Adaptive Learning","primary_cat":"cs.CL","submitted_at":"2026-04-21T03:55:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SAMoRA is a parameter-efficient fine-tuning framework that uses semantic-aware routing and task-adaptive scaling within a Mixture of LoRA Experts to improve multi-task performance and generalization over prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18389","ref_index":39,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Understanding the Prompt Sensitivity","primary_cat":"cs.CL","submitted_at":"2026-04-20T15:13:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLMs disperse meaning-preserving prompts internally instead of clustering them, which produces an excessively high upper bound on output log-probability differences via Taylor expansion and Cauchy-Schwarz.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17928","ref_index":76,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"HEALing Entropy Collapse: Enhancing Exploration in Few-Shot RLVR via Hybrid-Domain Entropy Dynamics Alignment","primary_cat":"cs.LG","submitted_at":"2026-04-20T08:09:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HEAL mitigates entropy collapse in few-shot RLVR by selectively adding general-domain data and aligning trajectory-level entropy dynamics, matching full-shot performance with 32 target samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17621","ref_index":29,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"KnowledgeBerg: Evaluating Systematic Knowledge Coverage and Compositional Reasoning in Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-04-19T21:18:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"KnowledgeBerg benchmark shows open-source LLMs achieve only 5.26-36.88 F1 on universe enumeration and 16-44% accuracy on knowledge-grounded compositional reasoning, with persistent failures in completeness, awareness, and application.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15741","ref_index":47,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Learning Uncertainty from Sequential Internal Dispersion in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-17T06:31:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SIVR detects LLM hallucinations by learning from token-wise and layer-wise variance patterns in internal hidden states, outperforming baselines with better generalization and less training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02176","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Adam's Law: Textual Frequency Law on Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-02T15:39:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Frequent sentence-level text improves LLM prompting and fine-tuning performance across math, translation, commonsense, and tool-use tasks via a proposed frequency law and curriculum ordering.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.01624","ref_index":4,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"OSCAR: Orchestrated Self-verification and Cross-path Refinement","primary_cat":"cs.AI","submitted_at":"2026-04-02T05:02:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OSCAR reduces hallucinations in diffusion language models by localizing commitment uncertainty with cross-chain entropy on parallel trajectories and applying evidence-guided remasking.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.02737","ref_index":226,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model","primary_cat":"cs.CL","submitted_at":"2025-02-04T21:43:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SmolLM2 is a 1.7B-parameter language model that outperforms Qwen2.5-1.5B and Llama3.2-1B after overtraining on 11 trillion tokens using custom FineMath, Stack-Edu, and SmolTalk datasets in a multi-stage pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.06292","ref_index":98,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery","primary_cat":"cs.AI","submitted_at":"2024-08-12T16:58:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"The AI Scientist framework enables LLMs to independently conduct the full scientific process from idea generation to paper writing and review, demonstrated across three ML subfields with papers costing under $15 each.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.17557","ref_index":43,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale","primary_cat":"cs.CL","submitted_at":"2024-06-25T13:50:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FineWeb is a curated 15T-token web dataset that produces stronger LLMs than prior open collections, while its educational subset sharply improves performance on MMLU and ARC benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2405.04434","ref_index":134,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model","primary_cat":"cs.CL","submitted_at":"2024-05-07T15:56:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DeepSeek-V2 delivers top-tier open-source LLM performance using only 21B active parameters by compressing the KV cache 93.3% and cutting training costs 42.5% via MLA and DeepSeekMoE.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.02954","ref_index":138,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"DeepSeek LLM: Scaling Open-Source Language Models with Longtermism","primary_cat":"cs.CL","submitted_at":"2024-01-05T18:59:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DeepSeek LLM 67B exceeds LLaMA-2 70B on code, mathematics and reasoning benchmarks after pre-training on 2 trillion tokens and alignment via SFT and DPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2311.12983","ref_index":139,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"GAIA: a benchmark for General AI Assistants","primary_cat":"cs.CL","submitted_at":"2023-11-21T20:34:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GAIA benchmark shows humans at 92% accuracy on simple real-world questions far outperform current AI systems at 15%, proposing this gap as a key milestone for general AI.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.01405","ref_index":3,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Representation Engineering: A Top-Down Approach to AI Transparency","primary_cat":"cs.LG","submitted_at":"2023-10-02T17:59:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Representation engineering uses population-level representations in deep neural networks to monitor and manipulate cognitive phenomena like honesty and harmlessness, providing simple effective baselines for LLM safety.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.10403","ref_index":143,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"PaLM 2 Technical Report","primary_cat":"cs.CL","submitted_at":"2023-05-17T17:46:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PaLM 2 reports state-of-the-art results on language, reasoning, and multilingual tasks with improved efficiency over PaLM.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2303.17564","ref_index":113,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"BloombergGPT: A Large Language Model for Finance","primary_cat":"cs.LG","submitted_at":"2023-03-30T17:30:36+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BloombergGPT is a 50B parameter LLM trained on a 708B token mixed financial and general dataset that outperforms prior models on financial benchmarks while preserving general LLM performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2204.02311","ref_index":152,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"PaLM: Scaling Language Modeling with Pathways","primary_cat":"cs.CL","submitted_at":"2022-04-05T16:11:45+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PaLM 540B demonstrates continued scaling benefits by setting new few-shot SOTA results on hundreds of benchmarks and outperforming humans on BIG-bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2201.11903","ref_index":64,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Chain-of-Thought Prompting Elicits Reasoning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2022-01-28T02:33:07+00:00","verdict":"ACCEPT","verdict_confidence":"HIGH","novelty_score":9.0,"formal_verification":"none","one_line_summary":"Chain-of-thought prompting, by including intermediate reasoning steps in few-shot examples, elicits strong reasoning abilities in large language models on arithmetic, commonsense, and symbolic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}