{"total":19,"items":[{"citing_arxiv_id":"2606.31651","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FARS: A Fully Automated Research System Deployed at Scale","primary_cat":"cs.AI","submitted_at":"2026-06-30T13:30:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FARS deployed at scale produced 166 AI/ML papers across 67 topics that received 282 structured human reviews indicating some review-worthy outputs alongside recurring failure modes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29645","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Metadata, Structure, or Strategy? A Decomposition of RAG Context Enrichment","primary_cat":"cs.IR","submitted_at":"2026-06-28T23:26:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Controlled experiments across six benchmarks and four models show RAG context enrichment with metadata, structure, or strategies mostly lowers accuracy, with model-context alignment as the determining factor.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12191","ref_index":81,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Agentic Environment Engineering for Large Language Models: A Survey of Environment Modeling, Synthesis, Evaluation, and Application","primary_cat":"cs.CL","submitted_at":"2026-06-10T15:15:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"This survey categorizes agentic environments for LLMs by eight attributes and domains, introduces symbolic and neural synthesis paradigms with evaluation, and outlines four agent evolution pathways plus three environment evolution paradigms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30947","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Extending AI for Research to the Humanities: A Multi-Agent Framework for Evidence-Grounded Scholarship","primary_cat":"cs.CL","submitted_at":"2026-05-29T07:33:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPIRE is a multi-agent framework drawing on scholarly primitives to perform evidence-grounded humanities scholarship, outperforming Naive LLM, Text RAG, and GraphRAG on a benchmark of classical Chinese and Greco-Roman Latin papers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30824","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Planner-Centric Reinforcement Learning for Deep Research with Structure-Aware Reward","primary_cat":"cs.AI","submitted_at":"2026-05-29T04:18:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DecomposeR represents research plans as typed DAGs and uses two-stage planner-then-answerer RL to improve long-form research performance by 5.1-8.0 points over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27610","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Eliot: Interactively $\\underline{E}$xploring Fast-Changing Scientific $\\underline{Li}$terature Trends with $\\underline{O}$nline Da$\\underline{t}$a and Learning","primary_cat":"cs.IR","submitted_at":"2026-05-26T19:25:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Eliot is a query-time clustering and temporal visualization system for arXiv literature, evaluated via offline metrics on eight domains and a user survey showing 85% meaningful cluster labels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27361","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Natural Language Query to Configuration for Retrieval Agents","primary_cat":"cs.AI","submitted_at":"2026-05-26T17:58:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BRANE maps queries to optimal retrieval pipeline configurations using LLM-derived features and per-configuration correctness predictors, improving the cost-quality Pareto frontier on three benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23590","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Co-ReAct: Rubrics as Step-Level Collaborators for ReAct Agents","primary_cat":"cs.AI","submitted_at":"2026-05-22T12:59:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Co-ReAct adds step-level rubric guidance to ReAct agents via a GRPO-trained generator using list-wise ranking rewards, yielding consistent gains on DeepResearchBench and SQA-CS-V2.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22878","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SciAtlas: A Large-Scale Knowledge Graph for Automated Scientific Research","primary_cat":"cs.AI","submitted_at":"2026-05-20T16:03:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"SciAtlas builds a large-scale multi-disciplinary academic knowledge graph and a neuro-symbolic retrieval system to support automated scientific research tasks such as literature review and idea positioning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09012","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Re$^2$Math: Benchmarking Theorem Retrieval in Research-Level Mathematics","primary_cat":"cs.AI","submitted_at":"2026-05-09T15:52:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Re²Math is a new benchmark that evaluates AI models on retrieving and verifying the applicability of theorems from math literature to advance steps in partial proofs, accepting any sufficient theorem while controlling for leakage.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"after the anchor is supplied. These metrics localize failures before source selection and extraction; they are not part of the primary leaderboard. 23 Table 8:Uncertainty for ToolAcc.Wilson intervals use instance-level binomial uncertainty; paper- cluster intervals resample citing papers. ModelToolAccWilson 95% CI Paper-cluster 95% CI Claude Opus 4.5 7.0% (14/200) [4.2%, 11.4%] [3.6%, 10.7%] Grok 4 3.5% (7/200) [1.7%, 7.0%] [1.0%, 6.6%] Kimi K2 Thinking 3.5% (7/200) [1.7%, 7.0%] [1.0%, 6.4%] GPT-5.2 3.0% (6/200) [1.4%, 6.4%] [1.0%, 5.5%] DeepSeek V3.2 2.5% (5/200) [1.1%, 5.7%] [0.5%, 5.0%] Gemini 3.1 Pro 2.0% (4/200) [0.8%, 5.0%] [0.5%, 4.0%] Qwen3-235B Thinking 1.0% (2/200) [0.3%, 3.6%] [0.0%, 2.6%] Table 9:Domain-levelToolAccon Eval-200."},{"citing_arxiv_id":"2604.04074","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FactReview: Evidence-Grounded Reviews with Literature Positioning and Execution-Based Claim Verification","primary_cat":"cs.AI","submitted_at":"2026-04-05T11:45:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FactReview extracts claims from ML papers, positions them via literature retrieval, and verifies them through code execution, labeling each as Supported, Partially supported, or In conflict, as shown in a CompGCN case study.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02988","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Self-Optimizing Multi-Agent Systems for Deep Research","primary_cat":"cs.IR","submitted_at":"2026-04-03T11:48:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Multi-agent deep research systems self-optimize prompts through self-play to match or outperform expert-crafted versions.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"We then select one agent to optimize and generate agent-specific feedback for each query inB. This feedback is used to generate a new prompt for the selected agent, yielding a new candidate system C ′. Another key difference from TextGrad is thatC′ is discarded if it does not improve overConB. 4 Experiments Forourexperiments,weadapttheexperimentalsetupproposedbyScholarQA[2]. Specifically, we use the Computer Science subset of queries. This dataset com- prises 109 query-rubric pairs written by PhD-level experts in Computer Science. For each query, the rubrics contain a list of criteria that constitute a \"good\" an- swer, marked as either \"most important\" or \"nice to have\" items. The evaluation is performed by an LLM-as-judge on a per-rubric basis."},{"citing_arxiv_id":"2603.05308","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Med-V1: Small Language Models for Zero-shot and Scalable Biomedical Evidence Attribution","primary_cat":"cs.CL","submitted_at":"2026-03-05T15:48:43+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.07689","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Stress Testing Factual Consistency Metrics for Long-Document Summarization","primary_cat":"cs.CL","submitted_at":"2025-11-10T23:24:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Short-form factual consistency metrics produce inconsistent scores on semantically equivalent long-document summaries and lose reliability on information-dense claims.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.00361","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Attribution Gradients: Incrementally Unfolding Citations for Critical Examination of Attributed AI Answers","primary_cat":"cs.HC","submitted_at":"2025-10-01T00:07:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Attribution gradients consolidate citation evidence and enable incremental unfolding of secondary sources, leading to deeper engagement in a lab study of critical reading tasks for AI answers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.26574","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Probing the Critical Point (CritPt) of AI Reasoning: a Frontier Physics Research Benchmark","primary_cat":"cs.AI","submitted_at":"2025-09-30T17:34:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"CritPt benchmark shows state-of-the-art LLMs reach only 5.7% average accuracy on full-scale unpublished physics research tasks, rising to about 10% with coding tools.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Enabling large language models to generate text with citations. InThe 2023 Conference on Empirical Methods in Natural Language Processing, 2023. [10] Y . Wang, Q. Guo, W. Yao, H. Zhang, X. Zhang, Z. Wu, M. Zhang, X. Dai, Q. Wen, W. Ye, et al. Autosurvey: Large Language Models can automatically write surveys.Advances in neural information processing systems, 37:115119-115145, 2024. [11] A. Asai, J. He, R. Shao, W. Shi, A. Singh, J. C. Chang, K. Lo, L. Soldaini, S. Feldman, M. D'arcy, et al. Openscholar: Synthesizing scientific literature with retrieval-augmented LMs.arXiv preprint arXiv:2411.14199, 2024. [12] M. D. Skarlinski, S. Cox, J. M. Laurent, J. D. Braza, M. Hinks, M. J. Hammerling, M. Ponnapati, S. G. Rodriques, and A. D."},{"citing_arxiv_id":"2509.23986","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TusoAI: Agentic Optimization for Scientific Methods","primary_cat":"cs.AI","submitted_at":"2025-09-28T17:30:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TusoAI is an LLM-based agent that builds and iteratively optimizes domain-specific computational methods for scientific data analysis, outperforming expert baselines on RNA-seq denoising and earth monitoring while reporting new genetic associations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.14838","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"In-depth Research Impact Summarization through Fine-Grained Temporal Citation Analysis","primary_cat":"cs.DL","submitted_at":"2025-05-20T19:11:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A framework for nuanced, time-aware research impact summarization using fine-grained temporal citation intents shows moderate to strong correlation with human judgments on insightfulness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.11336","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"XtraGPT: Context-Aware and Controllable Academic Paper Revision via Human-AI Collaboration","primary_cat":"cs.CL","submitted_at":"2025-05-16T15:02:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"XtraGPT is a suite of 1.5B-14B parameter open-source LLMs fine-tuned on 140,000 revision pairs from 7,000 top-tier papers to support controllable, context-aware academic paper editing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}