{"total":11,"items":[{"citing_arxiv_id":"2606.21654","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ChainWorld: Composing Long-Horizon Desktop Workloads from Atomic OSWorld Tasks","primary_cat":"cs.AI","submitted_at":"2026-06-19T18:00:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ChainWorld builds 347 chains from atomic OSWorld tasks and benchmarks four agents under single-turn and multi-turn protocols, reporting a maximum 31% completion rate with distinct failure profiles.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17628","ref_index":116,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OPD-Evolver: Cultivating Holistic Agent Evolver via On-Policy Distillation","primary_cat":"cs.CL","submitted_at":"2026-06-16T07:33:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OPD-Evolver uses on-policy self-distillation in fast interaction and slow attribution loops to build agents with holistic memory competence, outperforming prior systems by up to 11.5% and allowing a 9B model to compete with much larger ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06462","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Benchmark Everything Everywhere All at Once","primary_cat":"cs.AI","submitted_at":"2026-06-04T17:52:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Benchmark Agent is an autonomous agentic system that constructs benchmarks for LLMs and MLLMs via query analysis, subtask design, annotation and quality control, yielding 15 benchmarks with minimal human input.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22564","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SynAE: A Framework for Measuring the Quality of Synthetic Data for Tool-Calling Agent Evaluations","primary_cat":"cs.CL","submitted_at":"2026-05-21T14:45:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SynAE is a multi-metric framework that evaluates how well synthetic benchmarks replicate real data characteristics for multi-turn tool-calling agent testing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18660","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Evaluating Multi-turn Human-AI Interaction","primary_cat":"cs.HC","submitted_at":"2026-05-18T17:08:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces the TCR framework to evaluate educational LLM assistants on transparency, consistency, and refinement in multi-turn interactions, complementing aggregate metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04399","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GUIDE: Interpretable GUI Agent Evaluation via Hierarchical Diagnosis","primary_cat":"cs.AI","submitted_at":"2026-04-06T03:58:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GUIDE decomposes GUI agent evaluation into trajectory segmentation, subtask diagnosis, and overall summary to deliver higher accuracy and structured error reports than holistic baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.15808","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Inference-Time Scaling of Verification: Self-Evolving Deep Research Agents via Test-Time Rubric-Guided Verification","primary_cat":"cs.AI","submitted_at":"2026-01-22T09:47:31+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DeepVerifier enables self-evolving deep research agents via rubric-guided verification at test time, delivering 8-11% accuracy gains on GAIA and XBench-DeepSearch subsets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.19396","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EchoTrail-GUI: Building Actionable Memory for GUI Agents via Critic-Guided Self-Exploration","primary_cat":"cs.AI","submitted_at":"2025-12-22T13:42:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EchoTrail-GUI builds an automated memory of successful GUI task trajectories via self-exploration and injects relevant past examples to raise success rates on Android benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.09572","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Plan-and-Act: Improving Planning of Agents for Long-Horizon Tasks","primary_cat":"cs.CL","submitted_at":"2025-03-12T17:40:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Plan-and-Act trains a dedicated Planner on synthetic plan-annotated trajectories to generate high-level plans that an Executor follows, reaching 57.58% success on WebArena-Lite and 81.36% on WebVoyager.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.16150","ref_index":119,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Comprehensive Survey of Agents for Computer Use: Foundations, Challenges, and Future Directions","primary_cat":"cs.AI","submitted_at":"2025-01-27T15:44:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A survey of 87 agents for computer use and 33 datasets that introduces a three-dimensional taxonomy across domain, interaction, and agent perspectives and identifies six research gaps.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[118] Chen Qian, Wei Liu, Hongzhang Liu, Nuo Chen, Yufan Dang, Jiahao Li, Cheng Yang, Weize Chen, Yusheng Su, Xin Cong, Juyuan Xu, Dahai Li, Zhiyuan Liu, and Maosong Sun. 2024. ChatDev: Communicative agents for software development. InProc. of the 62nd Annual Meeting of the ACL. ACL, Bangkok, Thailand, 15174-15186. https://doi.org/10.18653/v1/2024.acl-long.810 [119] Yujia Qin, Shihao Liang, Yining Ye, Kunlun Zhu, Lan Yan, Yaxi Lu, Yankai Lin, Xin Cong, Xiangru Tang, Bill Qian, Sihan Zhao, Lauren Hong, Runchu Tian, Ruobing Xie, Jie Zhou, Mark Gerstein, Dahai Li, Zhiyuan Liu, and Maosong Sun. 2024. ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs. InProc. of the 12th ICLR. OpenReview."},{"citing_arxiv_id":"2409.07429","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Agent Workflow Memory","primary_cat":"cs.CL","submitted_at":"2024-09-11T17:21:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AWM induces reusable workflows from agent experiences and provides them selectively to improve success rates by 24.6% on Mind2Web and 51.1% on WebArena while reducing steps taken.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}