{"total":22,"items":[{"citing_arxiv_id":"2606.08068","ref_index":55,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DICE: Entropy-Regularized Equilibrium Selection for Stable Multi-Agent LLM Coordination","primary_cat":"cs.LG","submitted_at":"2026-06-06T09:33:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DICE formalizes multi-agent LLM coordination as discounted incomplete-information Markov games and introduces Heterogeneous Quantal Response Equilibrium (HQRE) to achieve unique stable equilibria with bounded regret, demonstrated via prompt-control and fine-tuning algorithms on eleven benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04320","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OpenRFM: Dissecting Relational In-Context Learning","primary_cat":"cs.LG","submitted_at":"2026-06-03T00:48:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OpenRFM combines a relational transformer backbone with a batch-level ICL layer and homophily-aware synthetic-plus-real pre-training to improve relational in-context learning by ~30% over prior open models and surpass KumoRFMv1.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03089","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Constitutional On-Policy Safe Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-02T03:17:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"COPSD uses a Cross-SFT cold-start followed by constitution-conditioned distillation to achieve stronger safety-helpfulness balance and lower safety tax on reasoning than prior on-policy self-distillation methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02953","ref_index":263,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Linguistic Productivity in Large Language Models: Models Coerce, but do not Preempt","primary_cat":"cs.CL","submitted_at":"2026-06-01T23:11:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Larger LLMs reproduce constructional productivity via entrenchment in coercion cases with nonce words but fail to use statistical preemption to avoid overgeneralizing semantically plausible but unobserved patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29548","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Why Larger Models Learn More: Effects of Capacity, Interference, and Rare-Task Retention","primary_cat":"cs.LG","submitted_at":"2026-05-28T08:02:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Larger models succeed on rare and complex tasks by reducing gradient interference from common tasks, allowing rare-task features to accumulate, as shown via synthetic task mixtures and OLMo pretraining from 4M to 4B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26350","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When Correct Demonstrations Hurt: Rethinking the Role of Exemplars in In-Context Learning","primary_cat":"cs.LG","submitted_at":"2026-05-25T21:52:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Task-preserving perturbations of correct exemplars can degrade ICL performance by changing the effective evidence mixture used for inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07555","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Priors Persist Through Suppression: A Stroop Paradigm for Lexical Override","primary_cat":"cs.CL","submitted_at":"2026-05-25T05:53:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Pretrained lexical priors in language models persist despite explicit remapping rules, as shown by a Stroop paradigm where prior strength predicts interference and activation patching localizes the repair mechanism.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21974","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Format-Constraint Coupling in Knowledge Graph Construction from Statistical Tables","primary_cat":"cs.AI","submitted_at":"2026-05-21T04:08:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical 2x2 factorial study on 6 statistical datasets shows format and schema constraints in LLM-based KG construction from CSV tables produce super-additive fidelity loss up to +1.180, with mismatched pairs falling below baseline, plus release of CSVFidelity-Bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18830","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"In-Context Learning Operates as Concept Subspace Learning","primary_cat":"cs.LG","submitted_at":"2026-05-12T21:43:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"In-context learning decomposes into concept-coordinate regression plus off-subspace leakage, with recoverable task information concentrating in a 68-73 dimensional task-aligned subspace of the residual stream that restores 78.8% of the accuracy gap in Llama-3-8B experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08904","ref_index":88,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OPT-BENCH: Evaluating the Iterative Self-Optimization of LLM Agents in Large-Scale Search Spaces","primary_cat":"cs.AI","submitted_at":"2026-05-09T11:51:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OPT-BENCH and OPT-Agent evaluate LLM self-optimization in large search spaces, showing stronger models improve via feedback but stay constrained by base capacity and below human performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08295","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"In-Context Fixation: When Demonstrated Labels Override Semantics in Few-Shot Classification","primary_cat":"cs.LG","submitted_at":"2026-05-08T10:20:39+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"In-context learning binds model outputs to the demonstrated label tokens as an exhaustive vocabulary, overriding semantic plausibility and causing fixation even with homogeneous or nonsense labels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03780","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Task Vector Geometry Underlies Dual Modes of Task Inference in Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-05T14:07:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"In a controlled synthetic setting, transformers implement in-distribution task inference via convex combinations of task vectors and out-of-distribution inference via nearly orthogonal extrapolative representations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23267","ref_index":69,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Fine-tuning vs. In-context Learning in Large Language Models: A Formal Language Learning Perspective","primary_cat":"cs.CL","submitted_at":"2026-04-25T12:19:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A controlled formal language task reveals fine-tuning outperforms in-context learning on in-distribution generalization but equals it on out-of-distribution, with ICL showing greater sensitivity to model size and tokenization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15789","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Systematic Study of Training-Free Methods for Trustworthy Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-17T07:50:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Training-free methods for LLM trustworthiness show inconsistent results across dimensions, with clear trade-offs in utility, robustness, and overhead depending on where they intervene during inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13275","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Better and Worse with Scale: How Contextual Entrainment Diverges with Model Size","primary_cat":"cs.CL","submitted_at":"2026-04-14T20:12:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Contextual entrainment decreases for semantic contexts but increases for non-semantic ones as LLMs scale, following power-law trends with 4x better resistance to misinformation but 2x more copying of arbitrary tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.24164","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Localizing Task Recognition and Task Learning in In-Context Learning via Attention Head Analysis","primary_cat":"cs.CL","submitted_at":"2025-09-29T01:28:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A new framework using Task Subspace Logit Attribution localizes attention heads specialized for task recognition and task learning in in-context learning, showing they align and rotate hidden states within a task subspace.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.11295","ref_index":133,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Prompt Engineering Report Distilled: Quick Start Guide for Life Sciences","primary_cat":"cs.CL","submitted_at":"2025-09-14T14:39:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The paper reduces a broad set of prompt engineering techniques to six core approaches and applies them to life sciences use cases while addressing common LLM pitfalls.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.07298","ref_index":55,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Pre-trained Large Language Models Learn Hidden Markov Models In-context","primary_cat":"cs.LG","submitted_at":"2025-06-08T21:49:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Pre-trained LLMs learn to predict HMM-generated sequences via in-context learning, approaching theoretical optimum on synthetic HMMs and matching expert models on real animal decision data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.07974","ref_index":199,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","primary_cat":"cs.SE","submitted_at":"2024-03-12T17:58:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LiveCodeBench collects 400 recent contest problems to create a contamination-free benchmark evaluating LLMs on code generation and related capabilities like self-repair and execution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.14566","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HallusionBench: An Advanced Diagnostic Suite for Entangled Language Hallucination and Visual Illusion in Large Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2023-10-23T04:49:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HallusionBench shows GPT-4V reaches only 31.42% accuracy on paired questions testing language hallucination and visual illusion in LVLMs, with other models below 16%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.03409","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Large Language Models as Optimizers","primary_cat":"cs.LG","submitted_at":"2023-09-07T00:07:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Large language models can optimize by being prompted with histories of past solutions and scores to propose better ones, producing prompts that raise accuracy up to 8% on GSM8K and 50% on Big-Bench Hard over human-designed baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.18565","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PaLI-X: On Scaling up a Multilingual Vision and Language Model","primary_cat":"cs.CV","submitted_at":"2023-05-29T18:58:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Scaling a multilingual vision-language model in size and training breadth yields new state-of-the-art results on over 25 benchmarks plus emerging abilities in counting and multilingual detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}