{"total":101,"items":[{"citing_arxiv_id":"2606.31156","ref_index":8,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"One Retrieval to Cover Them All: Co-occurrence-Aware Knowledge Base Reorganization for Session-Level RAG","primary_cat":"cs.IR","submitted_at":"2026-06-30T05:35:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Co-occurrence-aware KB clustering raises session-level RAG coverage from 41% to 58% on WixQA while cutting retrieval calls and compressing the KB to 20% of original size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30306","ref_index":48,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Always-OnAgents:A Survey of Persistent Memory, State, and Governance in LLMAgents","primary_cat":"cs.MA","submitted_at":"2026-06-29T13:47:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Survey mapping persistent state in LLM agents along six axes and proposing the AOEP-v0 protocol to evaluate governance and recovery obligations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30175","ref_index":109,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CORTEX: High-Quality Cross-Domain Organization of Web-Scale Corpora through Ontological Corpus Graph","primary_cat":"cs.CL","submitted_at":"2026-06-29T11:51:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Cortex uses an Ontological Corpus Graph to structure web-scale corpora, creating a refined 24.14B-token corpus and a new benchmark validated on eight LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28044","ref_index":260,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Tree-of-Thoughts Inspired Hybrid Approach for Legal Case Judgement Summarization using LLMs","primary_cat":"cs.CL","submitted_at":"2026-06-26T12:46:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A tree-of-thoughts inspired hybrid extractive-abstractive LLM prompt yields better legal case judgment summaries than standard extractive or abstractive prompts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27786","ref_index":28,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SHIFT: Gate-Modulated Activation Steering for Knowledge Conflict Mitigation in Retrieval-Augmented Generation","primary_cat":"cs.CL","submitted_at":"2026-06-26T07:17:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SHIFT reformulates neuron editing as learnable gate modulation on under 0.01% parameters to let LLMs adaptively balance contextual and parametric knowledge during RAG generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25721","ref_index":56,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Tracing Target Answers in Poisoned Retrieval Corpora via Token Influence Attribution","primary_cat":"cs.CR","submitted_at":"2026-06-24T11:39:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TRACE detects corpus poisoning in RAG via token influence attribution to find recurrent keywords tied to target answers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22645","ref_index":31,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"All Relations Lead to Rome: Automated Knowledge Graph Creation and Question Generation","primary_cat":"cs.IR","submitted_at":"2026-06-21T19:09:54+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21409","ref_index":56,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Don't Blindly Trust It: How Unreliable Feedback Breaks Tool-Using LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-06-19T13:21:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Misleading tool feedback produces value inversion in LLM agents, with performance dropping below matched no-feedback baselines on HotpotQA and similar tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20475","ref_index":37,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Marginal Advantage Accumulation for Memory-Driven Agent Self-Evolution","primary_cat":"cs.LG","submitted_at":"2026-06-18T16:54:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MAA formalizes alignability and comparability conditions and uses differential signals, EMA accumulation, and semantic identity merging to enable cross-batch operation-level evidence accumulation, outperforming batch-level baselines in 14 of 16 settings while matching online methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19667","ref_index":31,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CacheWeaver: Cache-Aware Evidence Ordering for Efficient Grounded RAG Inference","primary_cat":"cs.CL","submitted_at":"2026-06-18T00:38:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"CacheWeaver is a lightweight scheduling layer that orders evidence to exploit prefix caching, reducing median TTFT by 20-33% across vLLM setups while preserving answer quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19605","ref_index":34,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"FAPO: Fully Automated Prompt Optimization of Multi-Step LLM Pipelines","primary_cat":"cs.SE","submitted_at":"2026-06-17T21:16:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FAPO automates LLM pipeline optimization via iterative diagnosis and prompt-or-structure edits, beating GEPA baseline by +14.1 pp mean across 18 comparisons and +33.8 pp when structural changes occur.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18508","ref_index":39,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"MCompassRAG: Topic Metadata as a Semantic Compass for Paragraph-Level Retrieval","primary_cat":"cs.CL","submitted_at":"2026-06-16T21:50:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MCompassRAG adds topic metadata to chunk representations and uses LLM distillation to train a lightweight topic-aware retriever, reporting 8.24% average information efficiency gain and over 5x lower latency than strong baselines across six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18381","ref_index":32,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SproutRAG: Attention-Guided Tree Search with Progressive Embeddings for Long-Document RAG","primary_cat":"cs.CL","submitted_at":"2026-06-16T18:28:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SproutRAG introduces an attention-guided hierarchical framework that constructs a binary chunking tree for multi-granularity retrieval in RAG systems and reports a 6.1% average gain in information efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18056","ref_index":36,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ConSA: Controllable Sparsity in Hybrid Attention via Learnable Allocation","primary_cat":"cs.CL","submitted_at":"2026-06-16T15:33:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ConSA learns FA/SWA allocation via L0 masks and augmented Lagrangian constraints, outperforming rule-based baselines on 0.6B and 1.7B models with consistent layer patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17468","ref_index":54,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"RSRank: Learning Relevance from Representational Shifts","primary_cat":"cs.IR","submitted_at":"2026-06-16T03:29:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RSRank learns calibrated relevance scores from alignment between representational shifts induced by candidate documents and those from oracle document sets, enabling zero-threshold filtering.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13814","ref_index":24,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"TASR: Training-Free Adaptive Stopping for Iterative Retrieval","primary_cat":"cs.IR","submitted_at":"2026-06-11T18:35:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TASR provides a training-free predicate that stops iterative retrieval on repeated normalized answers plus calibrated logit margin above 0.25, retaining 94.8% of fixed-k=5 F1 at 62.6% of the calls across 32 configurations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12837","ref_index":1,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"LoHoSearch: Benchmarking Long-Horizon Search Agents Beyond the Human Difficulty Ceiling","primary_cat":"cs.CL","submitted_at":"2026-06-11T03:04:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"LoHoSearch is a new benchmark of 544 KG-constructed questions across 11 domains where the strongest search agent scores 34.74% and context strategies add at most 6.8%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12767","ref_index":20,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Constructing Evaluation Datasets for Procedural Reasoning: Balancing Naturalness, Grounding, and Multi-Hop Coverage","primary_cat":"cs.AI","submitted_at":"2026-06-11T00:17:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Strict generation directly from Task-Method-Knowledge models yields 96.5% grounded and 92.6% usable QA pairs across 23 topics, outperforming transcript-first and TMK-aware alternatives on representational grounding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12203","ref_index":75,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Adaptive Multi-Resolution Procedural Knowledge Compression for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-10T15:21:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SKIM is an adaptive multi-resolution soft-token framework that compresses procedural skills while aiming to preserve logical dependencies and task performance better than prior compression methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12479","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ReCal: Reward Calibration for RL-based LLM Routing","primary_cat":"cs.LG","submitted_at":"2026-06-10T06:59:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ReCal introduces hierarchical reward decomposition and distribution-aware optimization to address ambiguous credit assignment and optimization bias in RL-based LLM routing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12469","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Influence Factors on RAG Poisoning","primary_cat":"cs.CR","submitted_at":"2026-06-09T19:07:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Factorial experiment identifies retriever type, dataset, and retrieval depth as strongest influences on RAG poisoning exposure across 432 configurations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07218","ref_index":7,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"HKVM-RAG: Key-Value-Separated Hypergraph Evidence Organization for Multi-Hop RAG","primary_cat":"cs.IR","submitted_at":"2026-06-05T12:31:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HKVM-RAG uses key-value-separated hypergraphs to organize LLM evidence tuples into answer-path hyperedges, yielding F1 gains over KG-PPR on two multi-hop QA benchmarks and further gains when combined with dense retrievers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06758","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Diagnosing Evidence Utilization in Long-Context and Retrieval-Augmented Language Models under Matched Evidence Conditions","primary_cat":"cs.CL","submitted_at":"2026-06-04T22:44:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces a matched four-condition protocol and ONCU metric to diagnose evidence utilization in long-context and RAG models across synthetic and multi-hop QA tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06087","ref_index":51,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"LatentSkill: From In-Context Textual Skills to In-Weight Latent Skills for LLM Agents","primary_cat":"cs.CL","submitted_at":"2026-06-04T12:26:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LatentSkill uses a hypernetwork to generate LoRA adapters from textual skills, enabling weight-space storage that cuts prefill tokens and boosts agent success rates on ALFWorld and Search-QA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05901","ref_index":61,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Reducing Hallucinations in Complex Question Answering using Simple Graph-based Retrieval-Augmented Generation (long version)","primary_cat":"cs.CL","submitted_at":"2026-06-04T09:07:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A graph-augmented RAG system with vector and graph query tools halves hallucinations and raises factual correctness scores on the MoNaCo complex QA benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05304","ref_index":80,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"What Should Agents Say? Action-state Communication for Efficient Multi-Agent Systems","primary_cat":"cs.AI","submitted_at":"2026-06-03T18:00:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces PACT protocol that projects agent outputs into action-state records, yielding comparable or better task performance with substantially fewer tokens in multi-agent LLM systems and production harnesses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05054","ref_index":162,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Boosting Self-Consistency with Ranking","primary_cat":"cs.CL","submitted_at":"2026-06-03T16:12:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RISC reformulates self-consistency answer selection as a ranking task solved by a lightweight LambdaRank model with five hand-designed features, yielding better accuracy-efficiency trade-offs than majority voting on QA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04612","ref_index":47,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Hybrid Adversarial Defence for Natural Language Understanding Tasks","primary_cat":"cs.CL","submitted_at":"2026-06-03T08:49:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Hybrid entropy-uncertainty-geometric defence improves clean accuracy by up to 43% and adversarial robustness by up to 65% on NLU and security benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04315","ref_index":48,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Exploring Cross-Scenario Generality of Agentic Memory Systems: Diagnostics and a Strong Baseline","primary_cat":"cs.AI","submitted_at":"2026-06-03T00:42:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"An agentic harness letting the LLM self-manage flat text-file storage via tool calls outperforms eight prior memory systems on cross-scenario generality across QA, chat, trajectory, stress-test, and long-horizon tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04302","ref_index":78,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"LazyAttention: Efficient Retrieval-Augmented Generation with Deferred Positional Encoding","primary_cat":"cs.CL","submitted_at":"2026-06-03T00:12:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LazyAttention kernelizes deferred positional encoding to enable zero-copy, position-agnostic KV cache reuse, delivering 1.37× lower TTFT and 1.40× higher throughput than Block-Attention under skewed document distributions while preserving output quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03329","ref_index":14,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"InfoMem: Training Long-Context Memory Agents with Answer-Conditioned Information Gain","primary_cat":"cs.AI","submitted_at":"2026-06-02T08:39:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InfoMem is an answer-conditioned information gain reward for RL training of long-context memory agents that improves performance when applied to successful trajectories and normalized.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03197","ref_index":14,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"MemTrain: Self-Supervised Context Memory Training","primary_cat":"cs.CL","submitted_at":"2026-06-02T05:56:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MemTrain introduces two coupled self-supervised proxy tasks on Wikipedia corpora to train general context-memory capabilities in LLMs, reporting gains of up to 17.67 points on long-text and search-based QA benchmarks over direct post-training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02488","ref_index":1,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"RASER: Recoverability-Aware Selective Escalation Router for Multi-Hop Question Answering","primary_cat":"cs.AI","submitted_at":"2026-06-01T16:59:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RASER routers built on one-shot RAG features selectively escalate retrieval, matching SOTA F1 scores on multi-hop QA while using 41-49% of the tokens required by always-prune across six LLMs and three benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02404","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"K-BrowseComp: A Web Browsing Agent Benchmark Grounded in Korean Contexts","primary_cat":"cs.CL","submitted_at":"2026-06-01T15:50:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"K-BrowseComp is a new Korean web-browsing agent benchmark where frontier LLMs score 30-46% and Korean LLMs score 0-10% on the verified subset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02245","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"When Knowledge Is Not Free: Cost-Aware Evidence Selection in Retrieval-Augmented Generation","primary_cat":"cs.CL","submitted_at":"2026-06-01T13:39:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Defines cost-aware RAG with evidence cost tiers and shows static selectors are brittle while agentic LLM-based selection is promising but model-dependent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01779","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"HarnessForge: Joint Harness and Policy Evolution for Adaptive Agent Systems","primary_cat":"cs.CL","submitted_at":"2026-06-01T07:00:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HarnessForge co-evolves harness-policy pairs in LLM agents via fault-guided tailoring and alignment, reporting up to 12% gains over single-component baselines on five benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01326","ref_index":21,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Reducing Token Usage of State-in-Context Agents using Minification","primary_cat":"cs.SE","submitted_at":"2026-05-31T16:24:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Code minification reduces average input token usage by 42% in state-in-context agents with a 12 percentage point drop in resolution rate on SWE-bench Verified.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01033","ref_index":19,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"TriLens: Per-Layer Logit-Lens Entropy for White-Box Hallucination Detection","primary_cat":"cs.AI","submitted_at":"2026-05-31T05:48:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TriLens detects hallucinations via per-layer entropy trajectories of logit-lens readouts from three internal modules across LLMs and QA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00683","ref_index":50,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"OCC-RAG: Optimal Cognitive Core for Faithful Question Answering","primary_cat":"cs.CL","submitted_at":"2026-05-30T11:42:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OCC-RAG develops task-specialized SLMs (0.6B and 1.7B) via a new synthetic data pipeline for multi-hop reasoning and context faithfulness, claiming to match or exceed 2-6x larger general models on HotpotQA, MuSiQue, TAT-QA, ConFiQA, and MuSiQue-Un.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00593","ref_index":25,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SPADER: Step-wise Peer Advantage with Diversity-Aware Exploration Rewards for Multi-Answer Question Answering","primary_cat":"cs.CL","submitted_at":"2026-05-30T07:47:42+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31105","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"GRKV: Global Regression for Training-Free KV Cache Compression in Long-Context LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-29T10:16:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GRKV applies global ridge regression to KV cache merging for span-based retention in long-context LLMs, claiming to be the only method that improves benchmark performance with minimal overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29307","ref_index":2,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"GrepSeek: Training Search Agents for Direct Corpus Interaction","primary_cat":"cs.CL","submitted_at":"2026-05-28T03:37:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GrepSeek introduces a two-stage trained agent that uses shell commands for direct corpus search, achieving the strongest token-level F1 and Exact Match on seven open-domain QA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29247","ref_index":21,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"DenseSteer: Steering Small Language Models towards Dense Math Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-28T02:07:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DenseSteer is an inference-time steering framework that improves small LLMs' accuracy on math reasoning by modulating representations toward dense reasoning patterns with fewer but higher-density steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28721","ref_index":12,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"LiveBrowseComp: Are Search Agents Searching, or Just Verifying What They Already Know?","primary_cat":"cs.AI","submitted_at":"2026-05-27T16:39:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LiveBrowseComp shows search agents rely on intrinsic knowledge on standard benchmarks, with scores dropping 25-40 points and closed-book accuracy below 2% on questions about facts from the prior 90 days.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26366","ref_index":45,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Automatic Layer Selection for Hallucination Detection","primary_cat":"cs.AI","submitted_at":"2026-05-25T22:28:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FEPoID automatically selects optimal or near-optimal intermediate layers for hallucination detection across LLM architectures and tasks, outperforming prior criteria and baselines, with an added truncation step that further improves performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25480","ref_index":21,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Retrieval as Reasoning: Self-Evolving Agent-Native Retrieval via LLM-Wiki","primary_cat":"cs.CL","submitted_at":"2026-05-25T06:36:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLM-Wiki structures external knowledge as compilable wiki pages with links and persistent self-correction, achieving SOTA results on HotpotQA, MuSiQue, and 2WikiMultiHopQA by 2.0-8.1 F1 points over prior RAG systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26165","ref_index":23,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Tool-Schema Compression Enables Agentic RAG Under Constrained Context Budgets","primary_cat":"cs.SE","submitted_at":"2026-05-24T20:52:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Tool schema compression by 44-50% enables agentic RAG at 8K context where uncompressed schemas fail, with +20.5 pp exact match lift across models and scaling to over 800 tools.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.24879","ref_index":47,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Efficient DP-SGD for LLMs with Randomized Clipping","primary_cat":"cs.LG","submitted_at":"2026-05-24T05:44:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DP-SGD-RC applies Hutchinson and Hutch++ estimators to approximate per-sample gradient norms for clipping in DP-SGD, claiming competitive privacy noise multipliers and utility on Llama 3.2-1B with reduced memory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.24022","ref_index":40,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Adaptive KV Cache Reuse for Fast Long-Context LLM Serving","primary_cat":"cs.AR","submitted_at":"2026-05-20T08:59:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CacheTune delivers 3.72x-4.86x TTFT speedup and 3.93x-6.21x throughput in long-context LLM serving via frequency-guided selective KV recomputation and hardware-aware I/O overlap while keeping output quality near full recompute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22863","ref_index":2,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Latent Cache Flow: Model-to-Model Communication Without Text","primary_cat":"cs.LG","submitted_at":"2026-05-19T19:21:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Latent Cache Flow uses a small joint-translation-and-compression adapter to let LLMs with different contexts exchange KV cache summaries, outperforming both larger C2C adapters and text in early experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}