{"total":13,"items":[{"citing_arxiv_id":"2605.12004","ref_index":84,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning Agentic Policy from Action Guidance","primary_cat":"cs.CL","submitted_at":"2026-05-12T11:54:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ActGuide-RL uses human action data as plan-style guidance in mixed-policy RL to overcome exploration barriers in LLM agents, matching SFT+RL performance on search benchmarks without cold-start training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05191","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LongSeeker: Elastic Context Orchestration for Long-Horizon Search Agents","primary_cat":"cs.AI","submitted_at":"2026-05-06T17:54:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Context-ReAct enables agents to dynamically manage context via five atomic operations, and LongSeeker fine-tuned on 10k trajectories achieves 61.5% and 62.5% on BrowseComp benchmarks, outperforming prior agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24929","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GAIA-v2-LILT: Multilingual Adaptation of Agent Benchmark beyond Translation","primary_cat":"cs.CL","submitted_at":"2026-04-27T19:11:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new workflow for multilingual agent benchmark adaptation using functional, cultural, and difficulty alignments improves non-English agent success rates by up to 32.7% over simple machine translation, indicating substantial benchmark-induced measurement error in prior multilingual evaluations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19859","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DR-Venus: Towards Frontier Edge-Scale Deep Research Agents with Only 10K Open Data","primary_cat":"cs.LG","submitted_at":"2026-04-21T17:59:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A 4B deep research agent trained on 10K open data outperforms prior agents under 9B parameters and narrows the gap to 30B-class systems on research benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17091","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GenericAgent: A Token-Efficient Self-Evolving LLM Agent via Contextual Information Density Maximization (V1.0)","primary_cat":"cs.CL","submitted_at":"2026-04-18T17:59:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GenericAgent outperforms other LLM agents on long-horizon tasks by maximizing context information density with fewer tokens via minimal tools, on-demand memory, trajectory-to-SOP evolution, and compression.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14518","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mind DeepResearch Technical Report","primary_cat":"cs.AI","submitted_at":"2026-04-16T01:20:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MindDR combines a Planning Agent, DeepSearch Agent, and Report Agent with SFT cold-start, Search-RL, Report-RL, and preference alignment to reach competitive scores on research benchmarks using 30B-scale models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14448","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MARCA: A Checklist-Based Benchmark for Multilingual Web Search","primary_cat":"cs.CL","submitted_at":"2026-04-15T21:54:27+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MARCA is a bilingual benchmark using 52 questions and validated checklists to evaluate LLM web-search completeness and correctness in English and Portuguese.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07720","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Towards Knowledgeable Deep Research: Framework and Benchmark","primary_cat":"cs.AI","submitted_at":"2026-04-09T02:06:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper introduces the KDR task, HKA multi-agent framework, and KDR-Bench to enable LLM agents to integrate structured knowledge into deep research reports, with experiments showing outperformance over prior agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05952","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Towards Trustworthy Report Generation: A Deep Research Agent with Progressive Confidence Estimation and Calibration","primary_cat":"cs.AI","submitted_at":"2026-04-07T14:46:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A deep research agent incorporates progressive confidence estimation and calibration to produce trustworthy reports with transparent confidence scores on claims.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03679","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LightThinker++: From Reasoning Compression to Memory Management","primary_cat":"cs.CL","submitted_at":"2026-04-04T10:46:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LightThinker++ adds explicit adaptive memory management and a trajectory synthesis pipeline to LLM reasoning, cutting peak token use by ~70% while gaining accuracy in standard and long-horizon agent tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.15763","ref_index":63,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GLM-5: from Vibe Coding to Agentic Engineering","primary_cat":"cs.LG","submitted_at":"2026-02-17T17:50:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GLM-5 is a foundation model that claims state-of-the-art results on coding benchmarks and superior performance on end-to-end software engineering tasks via new asynchronous RL methods and cost-saving DSA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.02556","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DeepSeek-V3.2: Pushing the Frontier of Open Large Language Models","primary_cat":"cs.CL","submitted_at":"2025-12-02T09:25:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DeepSeek-V3.2 adds sparse attention, scaled RL post-training, and large-scale agentic data synthesis to reach GPT-5-level performance and gold medals in 2025 IMO and IOI with its high-compute variant.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.02544","ref_index":89,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"UI-TARS-2 Technical Report: Advancing GUI Agent with Multi-Turn Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2025-09-02T17:44:45+00:00","verdict":"CONDITIONAL","verdict_confidence":"UNKNOWN","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UI-TARS-2 reaches 88.2 on Online-Mind2Web, 47.5 on OSWorld, 50.6 on WindowsAgentArena, and 73.3 on AndroidWorld while attaining 59.8 mean normalized score on a 15-game suite through multi-turn RL and scalable data generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}