{"total":66,"items":[{"citing_arxiv_id":"2606.28615","ref_index":29,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"What LLMs explain is not what they believe: Evaluating explanation sufficiency under models' own input beliefs","primary_cat":"cs.LG","submitted_at":"2026-06-26T21:14:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes SCSuff metric for evaluating LLM explanation sufficiency via model-generated alternative inputs, showing explanations are typically insufficient and predictable from hidden states.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26396","ref_index":3,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"At the Edge of Understanding: Sparse Autoencoders Trace The Limits of Transformer Generalization","primary_cat":"cs.LG","submitted_at":"2026-06-24T21:26:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Sparse autoencoders show OOD prompts increase fallacious concept activation in transformers, offering a mechanistic measure of shift and a path to robust fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25476","ref_index":59,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Red Teaming Framework for Large Language Models: A Case Study on Faithfulness Evaluation","primary_cat":"cs.CL","submitted_at":"2026-06-24T07:00:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Introduces a multi-role red teaming framework using attacker and jury models that increases attack success rates by up to 7.9% on LLM faithfulness in question-answering tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19079","ref_index":42,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ARIADNE: Agnostic Routing for Inference-time Adapter DyNamic sElection","primary_cat":"cs.AI","submitted_at":"2026-06-17T13:50:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ARIADNE routes queries to the best adapter via embedding-space centroid proximity, recovering 97.44% of upper-bound performance on 23 NLP tasks and 89.7% selection accuracy on 44 tasks without training or internal access.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12397","ref_index":40,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Redesign Mixture-of-Experts Routers with Manifold Power Iteration","primary_cat":"cs.LG","submitted_at":"2026-06-10T17:57:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Manifold Power Iteration aligns MoE router rows with principal singular directions of experts via a power-then-retract process, with theory showing convergence and experiments on 1B-11B models showing gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12243","ref_index":70,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"VIA-SD: Verification via Intra-Model Routing for Speculative Decoding","primary_cat":"cs.CL","submitted_at":"2026-06-10T15:45:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VIA-SD adds a routed slim-verifier tier between direct acceptance and full-model verification in speculative decoding, cutting rejection rates 0.10-0.22 and yielding 10-20% speedups over prior SD methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12117","ref_index":41,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Soft-Prompt Tuning for Fair and Efficient LLM Benchmark Evaluation","primary_cat":"cs.CL","submitted_at":"2026-06-10T14:12:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Soft-prompt tuning with 10 vectors improves format compliance on LLM benchmarks and provides a low-cost proxy for comparing base models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08347","ref_index":30,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Tensorizing Engram: Sharing Latents Across N-Gram Embeddings is Beneficial in LLMs","primary_cat":"cs.CL","submitted_at":"2026-06-06T21:36:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TN-gram replaces per-order hash tables in n-gram memory modules with a CP tensor factorization that shares token-position factors and uses order-absorption vectors, achieving comparable or better performance with fewer parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06758","ref_index":14,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Diagnosing Evidence Utilization in Long-Context and Retrieval-Augmented Language Models under Matched Evidence Conditions","primary_cat":"cs.CL","submitted_at":"2026-06-04T22:44:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces a matched four-condition protocol and ONCU metric to diagnose evidence utilization in long-context and RAG models across synthetic and multi-hop QA tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05875","ref_index":49,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"QCFuse: Query-Aware Cache Fusion via Compressed View for Efficient RAG Serving","primary_cat":"cs.AI","submitted_at":"2026-06-04T08:47:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QCFuse achieves full-prefill quality in RAG with 1.7x average prefill speedup over full prefill and 1.5x over ProphetKV via compressed query-aware cache fusion.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05054","ref_index":160,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Boosting Self-Consistency with Ranking","primary_cat":"cs.CL","submitted_at":"2026-06-03T16:12:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RISC reformulates self-consistency answer selection as a ranking task solved by a lightweight LambdaRank model with five hand-designed features, yielding better accuracy-efficiency trade-offs than majority voting on QA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04924","ref_index":51,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Can Crowdsourcing Survive the LLM Era? A Community Survey on Human Data Collection","primary_cat":"cs.CL","submitted_at":"2026-06-03T14:18:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Survey of 155 researchers finds 44% observed LLM usage in crowdsourced data, with high awareness but insufficient mitigation efforts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03329","ref_index":6,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"InfoMem: Training Long-Context Memory Agents with Answer-Conditioned Information Gain","primary_cat":"cs.AI","submitted_at":"2026-06-02T08:39:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InfoMem is an answer-conditioned information gain reward for RL training of long-context memory agents that improves performance when applied to successful trajectories and normalized.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00683","ref_index":44,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"OCC-RAG: Optimal Cognitive Core for Faithful Question Answering","primary_cat":"cs.CL","submitted_at":"2026-05-30T11:42:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OCC-RAG develops task-specialized SLMs (0.6B and 1.7B) via a new synthetic data pipeline for multi-hop reasoning and context faithfulness, claiming to match or exceed 2-6x larger general models on HotpotQA, MuSiQue, TAT-QA, ConFiQA, and MuSiQue-Un.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28337","ref_index":8,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Systems-Level Analysis of Sensitivity, Robustness, and Stability in Retrieval-Augmented Generation","primary_cat":"cs.IR","submitted_at":"2026-05-29T17:24:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Empirical runs across 56 settings on a fixed 500-question set show non-monotonic downstream scores and preprocessing losses, leading to a call for multi-stage RAG evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31354","ref_index":3,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Diagnosing Failure Modes of Shared-State Collaboration in Resource-Constrained Visual Agents","primary_cat":"cs.AI","submitted_at":"2026-05-29T14:29:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces CoSee auditing framework and identifies Noise Reinforcement and Policy Collapse as dominant failure modes when weak 4B-8B models use shared state for multi-page visual QA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07597","ref_index":37,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Repetition Mismatch: Why Data Mixture Experiments Don't Scale and How to Fix Them","primary_cat":"cs.LG","submitted_at":"2026-05-29T06:08:57+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Repetition rate mismatch between small-scale proxies and target budgets is the main reason data mixture experiments do not scale; a subsampling procedure that equalizes repetition rates recovers optimal mixtures from 1/16-scale experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30022","ref_index":24,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Give it Space! Explicit Disentangling of Positional and Semantic Representations in Encoders","primary_cat":"cs.CL","submitted_at":"2026-05-28T14:42:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Explicitly disentangling semantic and positional streams in a Transformer encoder reveals that absolute positional representations collapse to a 2D document-structure manifold, attention heads specialize by role, and the approach improves linguistic probing performance on 49 of 65 phenomena.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26366","ref_index":34,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Automatic Layer Selection for Hallucination Detection","primary_cat":"cs.AI","submitted_at":"2026-05-25T22:28:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FEPoID automatically selects optimal or near-optimal intermediate layers for hallucination detection across LLM architectures and tasks, outperforming prior criteria and baselines, with an added truncation step that further improves performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26293","ref_index":39,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CroCo: Cross-Lingual Contrastive Preference Tuning on Self-Generations","primary_cat":"cs.CL","submitted_at":"2026-05-25T19:30:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CroCo applies English-reward-ranked self-generations for contrastive preference tuning that improves two LLMs on structured and open-ended tasks across 14 languages without language-specific annotations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25263","ref_index":20,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Mimir: Large-scale Multilingual Concept Modeling","primary_cat":"cs.CL","submitted_at":"2026-05-24T21:26:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Mimir is a 1.6B multilingual concept model pretrained on 38.9 billion sentences across 46 languages and instruction-tuned on 66.8 million sentences across 35 languages, then compared to a token-based LM of similar size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.24299","ref_index":34,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"LLMs Show No Signs Of Individuated Metacognition","primary_cat":"cs.LG","submitted_at":"2026-05-22T23:54:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM confidence judgments are dominated by a shared difficulty factor across models, with the confidence-performance link collapsing after removing agreed items, yielding no evidence for individuated metacognition.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22963","ref_index":31,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Graph Alignment Topology as an Inductive Bias for Grounding Detection","primary_cat":"cs.CL","submitted_at":"2026-05-21T18:49:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A GNN trained on bipartite alignment graphs between references and LLM generations reports state-of-the-art hallucination detection across four datasets, beating prior methods and GPT-4o.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22769","ref_index":19,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Understanding Data Temporality Impact on Large Language Models Pre-training","primary_cat":"cs.CL","submitted_at":"2026-05-21T17:31:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Pre-training 6B LLMs on temporally ordered Common Crawl snapshots yields models with improved factual freshness and temporal precision over shuffled baselines while matching on general language understanding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17989","ref_index":40,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Predictive Prefetching for Retrieval-Augmented Generation","primary_cat":"cs.CL","submitted_at":"2026-05-18T07:45:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces predictive prefetching for RAG that anticipates retrieval needs several tokens ahead via three components, reporting up to 43.5% latency reduction and 62.4% TTFT improvement while preserving answer quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12227","ref_index":1,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Recipe for Long-Context Reasoning in Large Language Models via On-Policy Optimization and Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-12T15:04:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Combines GRPO with teacher-guided on-policy distillation and introduces LongBlocks dataset to yield more stable long-context reasoning than either method alone.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Together, these components yield consistent improvements in long-context performance while preserving the existing short-context capabilities of the starting model. 2 Preliminary 2.1 Token-level MDP for Language Generation We formulate autoregressive language generation as a token-level MDPM= (S,A,r,T) . At each generation step t, the state st ∈S consists of the prompt p concatenated with all previously generated tokens: st = [p 1, . . ., pM, o1, . . ., ot−1]. The action at time t is the next token ot ∈A selected from the model vocabulary V. Transitions are deterministic, with each new state formed by appending the new token: st+1 = [s t; ot]. The initial state s1 is induced by the prompt p∼ P , where P denotes the distribution over prompts. An episode ends when the policy outputs an end-of-sequence token [eos] or when the total token budget is"},{"citing_arxiv_id":"2605.11608","ref_index":31,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"PRISM: A Geometric Risk Bound that Decomposes Drift into Scale, Shape, and Head","primary_cat":"cs.CL","submitted_at":"2026-05-12T06:40:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PRISM supplies a geometric upper bound on LLM variant risk that splits drift into scale, shape, and head axes and doubles as a differentiable regularizer against forgetting.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"2226 17.8641 266.0867 0.3658 BnB INT8 138.96 139.09 0.9880 56.3886 056.3886 0.0265 NF4 138.96 144.16 0.9124 155.4506 0 155.4506 0.0750 FP4 138.96 138.10 0.9196 145.1767 0 145.1767 0.1306 GPTQ GPTQ-4bit 138.96 140.37 0.9298 136.7867 0 136.7867 0.1422 Benchmarks and scoring.Five benchmarks:MMLU[ 28],ARC[ 29] (multiple-choice knowledge), TriviaQA[ 30],SQuAD[ 31] (short-horizon QA), andGSM8K[ 32] (multi-step reasoning). All risks are computed teacher-forced (prompt c and targets y scored in a single forward pass over the gold span), producing a deterministic per-sample CE loss whose expectation gives the model's riskRM , and|∆R|is the target-vs-proxy gap we report. Calibration and hyperparameters.PRISM and |∆R| are evaluated on fixed held-out subsets"},{"citing_arxiv_id":"2605.10296","ref_index":55,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Qwen Goes Brrr: Off-the-Shelf RAG for Ukrainian Multi-Domain Document Understanding","primary_cat":"cs.CL","submitted_at":"2026-05-11T09:55:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A RAG pipeline with contextual PDF chunking, question-and-answer-aware retrieval and reranking using Qwen3 models reaches 0.96 accuracy on a Ukrainian multi-domain document QA shared task.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08636","ref_index":20,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"EdgeFlowerTune: Evaluating Federated LLM Fine-Tuning Under Realistic Edge System Constraints","primary_cat":"cs.CL","submitted_at":"2026-05-09T03:02:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EdgeFlowerTune is a real-device benchmark that jointly assesses model quality and system costs for federated LLM fine-tuning on edge hardware using three protocols: Quality-under-Budget, Cost-to-Target, and Robustness.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Brendan McMahan, Eider Moore, Daniel Ramage, Seth Hampson, and Blaise Agüera y Arcas. Communication-efficient learning of deep networks from decentralized data. InProceedings of the 20th International Conference on Artificial Intelligence and Statistics (AISTATS), pages 1273-1282, 2017. [19] OpenAI. Gpt-4 technical report.arXiv preprint arXiv:2303.08774, 2023. [20] Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. SQuAD: 100,000+ questions for machine comprehension of text. InProceedings of the 2016 Conference on Empirical Methods in Natural Language Processing, pages 2383-2392, Austin, Texas, 2016. Association for Computational Linguistics. doi: 10.18653/v1/D16-1264. [21] Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi."},{"citing_arxiv_id":"2605.06856","ref_index":204,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Benchmarked Yet Not Measured -- Generative AI Should be Evaluated Against Real-World Utility","primary_cat":"cs.LG","submitted_at":"2026-05-07T18:56:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Generative AI evaluation must shift from static benchmark scores to measuring sustained improvements in human capabilities within specific deployment contexts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06505","ref_index":33,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"PACZero: PAC-Private Fine-Tuning of Language Models via Sign Quantization","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:20:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PACZero achieves zero mutual information privacy in LLM fine-tuning via sign-quantized subset-aggregated ZO gradients, delivering near non-private accuracy on SST-2 at I=0.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"agreed sign is uninformative aboutwhichcandidate is the secret yet still drives optimization; on disagreement, the coin flip is independent of S∗ and contributes no progress. Useful progress thus comes from unanimity steps alone. 6 4 Experiments We evaluate PACZEROon the two LLM tasks reported by the strongest DP zeroth-order baseline, DP-AggZO [5]:SST-2(binary sentiment classification) [ 34] andSQuAD(extractive QA, F1) [ 31]. Each task is run onOPT-1.3BandOPT-6.7B[ 47] across two parameter tracks,LoRA r=8 [17] andfull-parameter fine-tuning(FT). The section is structured around three claims:(i)PACZERO- MI matches DP-ZO at the matched-MIA comparison points used by prior DP-ZO work, while PACZERO-ZPL reaches I(S ∗;Y 1:T )=0 , with the strongest PACZERO-ZPL cell on SST-2 6."},{"citing_arxiv_id":"2605.05392","ref_index":32,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Generating Query-Focused Summarization Datasets from Query-Free Summarization Datasets","primary_cat":"cs.CL","submitted_at":"2026-05-06T19:25:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"An evidence-based model generates queries from query-free datasets, yielding summaries with competitive ROUGE scores to those using original queries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04901","ref_index":65,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"On the (In-)Security of the Shuffling Defense in the Transformer Secure Inference","primary_cat":"cs.CR","submitted_at":"2026-05-06T13:31:15+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An attack aligns differently shuffled intermediate activations from secure Transformer inference queries to recover model weights with low error using roughly one dollar of queries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03379","ref_index":21,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Two Calls, Two Moments, and the Vote-Accuracy Curve of Repeated LLM Inference","primary_cat":"cs.LG","submitted_at":"2026-05-05T05:40:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Two calls per example identify the first two moments of latent correctness probability, enabling exact bounds on the vote-accuracy curve for any majority-vote budget under conditional i.i.d. assumptions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03045","ref_index":112,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"TCD-Arena: Assessing Robustness of Time Series Causal Discovery Methods Against Assumption Violations","primary_cat":"cs.LG","submitted_at":"2026-05-04T18:12:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TCD-Arena is a new customizable testing framework that runs millions of experiments to map how 33 different assumption violations affect time series causal discovery methods and shows ensembles can boost overall robustness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00668","ref_index":69,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SENECA: Small-Sample Discrete Entropy Estimation via Self-Consistent Missing Mass","primary_cat":"cs.IT","submitted_at":"2026-05-01T13:50:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SENECA uses a novel self-consistent missing mass calculation to improve discrete entropy estimates in small-sample regimes and outperforms alternatives in numerical tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00939","ref_index":7,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"From Flat Facts to Sharp Hallucinations: Detecting Stubborn Errors via Gradient Sensitivity","primary_cat":"cs.LG","submitted_at":"2026-05-01T04:11:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EPGS detects high-confidence factual errors in LLMs by using embedding perturbations to measure gradient sensitivity as a proxy for sharp versus flat minima.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27115","ref_index":11,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Exploring the Limits of Pruning: Task-Specific Neurons, Model Collapse, and Recovery in Task-Specific Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-29T19:08:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Selective pruning of low-activation neurons in task-specific LLMs preserves accuracy better than random pruning, but removing roughly 10% of highly selective neurons triggers total collapse, with fine-tuning recovering much of the lost performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23267","ref_index":55,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Fine-tuning vs. In-context Learning in Large Language Models: A Formal Language Learning Perspective","primary_cat":"cs.CL","submitted_at":"2026-04-25T12:19:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A controlled formal language task reveals fine-tuning outperforms in-context learning on in-distribution generalization but equals it on out-of-distribution, with ICL showing greater sensitivity to model size and tokenization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17621","ref_index":26,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"KnowledgeBerg: Evaluating Systematic Knowledge Coverage and Compositional Reasoning in Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-04-19T21:18:42+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"C: Instruments D: Abstract concepts (Tip) Question (Iceberg) Knowledge and Reasoning To determine which category is most prevalent, we must carefully classify all 88 IAU constellations. Category A (Biological): Apus, Aquila, Aries, Camelopardalis, Cancer, … (36). Category B (Mythological): Andromeda, Aquarius, Auriga, Boötes, Cassiopeia, Centaurus, … (26). Category C (Instruments): Antlia, Caelum, Circinus, Fornax, Horologium, Microscopium, … (12). Category D (abstract concepts): Ara, Carina, Coma Berenices, Crater, Crux, Eridanus, … (14). A=36, B=26, C=12, D=14 So, the answer is A Figure 1: Illustration of the tip-of-the-iceberg phe- nomenon. A surface-simple question (tip) implicitly depends on abounded universeandcompositional set-"},{"citing_arxiv_id":"2604.15009","ref_index":28,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Towards Faster Language Model Inference Using Mixture-of-Experts Flow Matching","primary_cat":"cs.AI","submitted_at":"2026-04-16T13:36:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mixture-of-experts flow matching enables non-autoregressive language models to achieve autoregressive-level quality in three sampling steps, delivering up to 1000x faster inference than diffusion models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12469","ref_index":15,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Analyzing the Effect of Noise in LLM Fine-tuning","primary_cat":"cs.LG","submitted_at":"2026-04-14T08:54:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Label noise hurts fine-tuning performance most while grammatical and typographical noise sometimes act as mild regularizers, with changes concentrated in task-specific layers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08974","ref_index":30,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Confident in a Confidence Score: Investigating the Sensitivity of Confidence Scores to Supervised Fine-Tuning","primary_cat":"cs.CL","submitted_at":"2026-04-10T05:27:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Supervised fine-tuning degrades the correlation between confidence scores and output quality in language models, driven by factors like training distribution similarity rather than true quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.17314","ref_index":269,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Open Datasets in Learning Analytics: Trends, Challenges, and Best PRACTICE","primary_cat":"cs.CY","submitted_at":"2026-02-19T12:23:25+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"A survey of 172 open educational datasets from 204 papers across LAK, EDM, and AIED conferences reveals trends, 143 previously uncatalogued datasets, field gaps, and an 8-item PRACTICE checklist for better data publication.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"The Association for Computational Linguistics, Austin, TX, USA, 2383-2392. doi:10.18653/V1/D16-1264 [268] Martina A. Rau, Vincent Aleven, Nikol Rummel, and Zachary A. Pardos. 2014. How Should Intelligent Tutoring Systems Sequence Multiple Graphical Representations of Fractions? A Multi-Methods Study.Int. J. Artif. Intell. Educ.24, 2 (2014), 125-161. doi:10.1007/S40593-013-0011-7 [269] Siyu Ren and Kenny Q. Zhu. 2021. Knowledge-Driven Distractor Generation for Cloze-Style Multiple Choice Questions. InThirty-Fifth AAAI Conference on Artificial Intelligence, AAAI 2021, Thirty-Third Conference on Innovative Applications of Artificial Intelligence, IAAI 2021, The Eleventh Symposium on Educational Advances in Artificial Intelligence, EAAI 2021, Virtual Event, February 2-9, 2021."},{"citing_arxiv_id":"2507.14913","ref_index":20,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"PromptSuite: A Task-Agnostic Framework for Multi-Prompt Generation","primary_cat":"cs.CL","submitted_at":"2025-07-20T10:55:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PromptSuite is a modular, extensible, task-agnostic framework for automatically generating diverse prompt variations to support robust multi-prompt LLM evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.00994","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Should We Still Pretrain Encoders with Masked Language Modeling?","primary_cat":"cs.CL","submitted_at":"2025-07-01T17:45:48+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Controlled ablations of 38 models find MLM superior to CLM on representation benchmarks while CLM offers better data efficiency and stability; a biphasic CLM-then-MLM schedule is optimal under fixed compute and improves when initialized from pretrained CLM models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.14123","ref_index":57,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Sampling from Your Language Model One Byte at a Time","primary_cat":"cs.CL","submitted_at":"2025-06-17T02:37:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"An inference-time technique turns BPE-based LMs into byte- or character-level models, solving the prompt boundary problem while unifying vocabularies across different tokenizers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.03387","ref_index":168,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"LIMO: Less is More for Reasoning","primary_cat":"cs.CL","submitted_at":"2025-02-05T17:23:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LIMO achieves 63.3% on AIME24 and 95.6% on MATH500 via supervised fine-tuning on roughly 1% of the data used by prior models, supporting the claim that minimal strategic examples suffice when pre-training has already encoded domain knowledge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.19098","ref_index":32,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SyMerge: From Non-Interference to Synergistic Merging via Single-Layer Adaptation","primary_cat":"cs.LG","submitted_at":"2024-12-26T07:42:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SyMerge merges models via single-layer adaptation and expert-guided self-labeling to achieve task synergy, reporting SOTA results on vision, dense prediction, and NLP tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2411.05527","ref_index":53,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"How Good is Your Wikipedia? Auditing Data Quality for Low-resource and Multilingual NLP","primary_cat":"cs.CL","submitted_at":"2024-11-08T12:35:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The study filters non-English Wikipedia, reveals quality problems, proposes a 4-level ranking, and shows filtered data matches or beats raw data in language modeling with largest gains for lower-quality editions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}