{"total":29,"items":[{"citing_arxiv_id":"2605.23262","ref_index":50,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Design and Report Benchmarks for Knowledge Work","primary_cat":"cs.AI","submitted_at":"2026-05-22T06:03:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes a three-step benchmark design method (define work activity, specify tested setting, score work product) derived from work studies and O*NET, demonstrated via three case analyses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22769","ref_index":11,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Understanding Data Temporality Impact on Large Language Models Pre-training","primary_cat":"cs.CL","submitted_at":"2026-05-21T17:31:17+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22389","ref_index":56,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Unified Data Selection for LLM Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-21T12:21:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"High-Entropy Sum (HES) selects high-quality reasoning data for LLMs by summing entropy of the top highest-entropy tokens, matching full-dataset performance with top 20% in SFT and outperforming baselines in RFT and RL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21856","ref_index":28,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"The Illusion of Reasoning: Exposing Evasive Data Contamination in LLMs via Zero-CoT Truncation","primary_cat":"cs.LG","submitted_at":"2026-05-21T01:06:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ZCP detects direct and evasive data contamination in LLMs by truncating CoT reasoning and contrasting zero-CoT accuracy on original versus perturbed isomorphic datasets, plus a Contamination Confidence metric.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14537","ref_index":5,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Cattle Trade: A Multi-Agent Benchmark for LLM Bluffing, Bidding, and Bargaining","primary_cat":"cs.AI","submitted_at":"2026-05-14T08:20:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Cattle Trade benchmark shows heuristic code agents outperforming most LLMs in integrated strategic tasks like bidding, bluffing, and resource allocation across 242 games, with strategic coherence predicting rank better than spending volume.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11712","ref_index":89,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Toward Stable Value Alignment: Introducing Independent Modules for Consistent Value Guidance","primary_cat":"cs.AI","submitted_at":"2026-05-12T08:02:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SVGT adds independent value modules and Bridge Tokens to LLMs to maintain consistent value guidance, cutting harmful outputs by over 70% in tests while preserving fluency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11632","ref_index":19,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Macro: Enhancing Multilingual Counterfactual Explanations through Alignment-as-Preference Optimization","primary_cat":"cs.CL","submitted_at":"2026-05-12T06:56:18+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10889","ref_index":9,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Unmasking On-Policy Distillation: Where It Helps, Where It Hurts, and Why","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:33:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Distillation signals align better with ideal updates on incorrect student rollouts than correct ones, with optimal teacher context depending on student capacity and task.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09907","ref_index":28,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"RADAR: Redundancy-Aware Diffusion for Multi-Agent Communication Structure Generation","primary_cat":"cs.AI","submitted_at":"2026-05-11T02:50:40+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09751","ref_index":20,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Language Models Without a Trainable Input Embedding Table: Learning from Fixed Minimal Binary Token Codes","primary_cat":"cs.CL","submitted_at":"2026-05-10T21:00:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fixed 16-bit binary token codes can replace trainable input embeddings in 32-layer decoder-only models while maintaining comparable held-out perplexity on 17B tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08842","ref_index":47,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"XPERT: Expert Knowledge Transfer for Effective Training of Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-09T09:53:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"XPERT extracts and reuses cross-domain expert knowledge from pre-trained MoE LLMs via inference analysis and tensor decomposition to improve performance and convergence in downstream language model training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13875","ref_index":2,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Common-agency Games for Multi-Objective Test-Time Alignment","primary_cat":"cs.GT","submitted_at":"2026-05-08T06:56:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAGE uses common-agency games and an EPEC algorithm to compute equilibrium policies that balance multiple conflicting objectives for test-time LLM alignment.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"We start with the existence of the equilibrium. Lemma 2 Consider the common-agency game with a finite action spaceA where |A| = N≥ 2. Assume the agent's policy is derived from a KL-regularized objective with a temperature parameter τ > 0and a full-support base policyπ0 ∈int (∆N−1). Then, the game admits at least one pure-strategy Nash equilibrium {yj⋆}J j=1, π⋆\u0001 . 21 Proof[Proof of Lemma 2] The proof proceeds by transforming the principals' joint optimiza- tion problem from the individual transfer space into the shared policy space (the probability simplex)π∈∆ N−1. Transformation to the Policy Space.Following the standard characterization of common-agency equilibria (Bernheim and Whinston, 1986), finding a pure-strategy equi- librium is equivalent to finding an aggregate transferY ⋆ and a policy π⋆ that maximize"},{"citing_arxiv_id":"2605.16350","ref_index":26,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Federated Nested Learning: Collaborative Training of Self-Referential Memories for Test-Time Adaptation","primary_cat":"cs.LG","submitted_at":"2026-05-08T04:31:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FedNL reformulates federated learning as nested optimization with linear attention for collaborative test-time adaptation on non-IID data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05415","ref_index":39,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Information Theoretic Adversarial Training of Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-06T20:20:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WARDEN is a new adversarial training framework for large language models that minimizes worst-case loss over an f-divergence ambiguity set, reducing attack success rates while keeping utility comparable to recent baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03971","ref_index":71,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Logical Consistency as a Bridge: Improving LLM Hallucination Detection via Label Constraint Modeling between Responses and Self-Judgments","primary_cat":"cs.CL","submitted_at":"2026-05-05T16:53:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LaaB improves LLM hallucination detection by mapping self-judgment labels back into neural feature space and using mutual learning under logical consistency constraints between responses and meta-judgments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01147","ref_index":1,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Position: Safety and Fairness in Agentic AI Depend on Interaction Topology, Not on Model Scale or Alignment","primary_cat":"cs.AI","submitted_at":"2026-05-01T22:49:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"In agentic AI, safety and fairness are governed by interaction topology rather than model scale or alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21100","ref_index":120,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Preconditioned DeltaNet: Curvature-aware Sequence Modeling for Linear Recurrences","primary_cat":"cs.LG","submitted_at":"2026-04-22T21:38:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Preconditioned delta-rule models with a diagonal curvature approximation improve upon standard DeltaNet, GDN, and KDA by better approximating the test-time regression objective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19398","ref_index":63,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"GRASPrune: Global Gating for Budgeted Structured Pruning of Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-04-21T12:26:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GRASPrune removes 50% of parameters from LLaMA-2-7B via global gating and projected straight-through estimation, reaching 12.18 WikiText-2 perplexity and competitive zero-shot accuracy after four epochs on 512 calibration sequences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18389","ref_index":40,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Understanding the Prompt Sensitivity","primary_cat":"cs.CL","submitted_at":"2026-04-20T15:13:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLMs disperse meaning-preserving prompts internally instead of clustering them, which produces an excessively high upper bound on output log-probability differences via Taylor expansion and Cauchy-Schwarz.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18660","ref_index":51,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Evaluating Answer Leakage Robustness of LLM Tutors against Adversarial Student Attacks","primary_cat":"cs.CR","submitted_at":"2026-04-20T11:29:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLM tutors leak answers under adversarial student attacks, but a fine-tuned jailbreak agent and simple defenses can benchmark and improve robustness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17785","ref_index":27,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Forget What Matters, Keep the Rest: Selective Unlearning of Informative Tokens","primary_cat":"cs.CL","submitted_at":"2026-04-20T04:20:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ETW uses predictive entropy as a proxy for token informativeness to improve selective unlearning in LLMs, achieving better forgetting with less utility loss than prior token-level methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.18449","ref_index":190,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"SWE-RL: Advancing LLM Reasoning via Reinforcement Learning on Open Software Evolution","primary_cat":"cs.SE","submitted_at":"2025-02-25T18:45:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SWE-RL uses RL on software evolution data to train LLMs achieving 41% on SWE-bench Verified with generalization to other reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.08435","ref_index":60,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Automated Design of Agentic Systems","primary_cat":"cs.AI","submitted_at":"2024-08-15T21:59:23+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Meta Agent Search uses a meta-agent to iteratively program novel agentic systems in code, producing agents that outperform state-of-the-art hand-designed ones across coding, science, and math while transferring across domains and models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.10162","ref_index":174,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Sycophancy to Subterfuge: Investigating Reward-Tampering in Large Language Models","primary_cat":"cs.AI","submitted_at":"2024-06-14T16:26:20+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLMs trained on simple specification gaming generalize to zero-shot reward tampering including rewriting their own reward function.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2405.14782","ref_index":37,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Lessons from the Trenches on Reproducible Evaluation of Language Models","primary_cat":"cs.CL","submitted_at":"2024-05-23T16:50:49+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.18796","ref_index":50,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Replacing Judges with Juries: Evaluating LLM Generations with a Panel of Diverse Models","primary_cat":"cs.CL","submitted_at":"2024-04-29T15:33:23+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A panel of smaller diverse LLMs outperforms a single large model as an evaluator of generations, showing less intra-model bias and over 7x lower cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2311.16867","ref_index":205,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"The Falcon Series of Open Language Models","primary_cat":"cs.CL","submitted_at":"2023-11-28T15:12:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Falcon-180B is a 180B-parameter open decoder-only model trained on 3.5 trillion tokens that approaches PaLM-2-Large performance at lower cost and is released with dataset extracts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2308.14508","ref_index":13,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"LongBench: A Bilingual, Multitask Benchmark for Long Context Understanding","primary_cat":"cs.CL","submitted_at":"2023-08-28T11:53:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"LongBench is the first bilingual multi-task benchmark for long context understanding in LLMs, containing 21 datasets in 6 categories with average lengths of 6711 words (English) and 13386 characters (Chinese).","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2301.12652","ref_index":94,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"REPLUG: Retrieval-Augmented Black-Box Language Models","primary_cat":"cs.CL","submitted_at":"2023-01-30T04:18:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"REPLUG improves frozen black-box LMs by prepending LM-supervised retrieved documents, delivering 6.3% better language modeling on GPT-3 and 5.1% better five-shot MMLU on Codex.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}