{"total":24,"items":[{"citing_arxiv_id":"2606.31612","ref_index":91,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"What Memory Do GUI Agents Really Need? From Passive Records to Active Task-Driving States","primary_cat":"cs.CV","submitted_at":"2026-06-30T13:01:19+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31410","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Xiaomi-GUI-0 Technical Report","primary_cat":"cs.AI","submitted_at":"2026-06-30T09:36:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Xiaomi-GUI-0 reports 72.0% success on RealMobile and 78.9% on AndroidWorld via real-device closed-loop training with multi-source data and three-stage RL pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30084","ref_index":93,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"One Forward Beats Two: InnerZoom for Accurate and Efficient GUI Grounding","primary_cat":"cs.CV","submitted_at":"2026-06-29T10:20:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InnerZoom bridges cross-layer evidence in one forward pass to achieve SOTA GUI grounding accuracy on six benchmarks while cutting latency up to 31.8% versus two-pass baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29705","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GUICrafter: Weakly-Supervised GUI Agent Leveraging Massive Unannotated Screenshots","primary_cat":"cs.AI","submitted_at":"2026-06-29T02:16:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GUICrafter uses curriculum learning on unannotated GUI screenshots for visual grounding followed by RL calibration on limited labels to match or exceed prior GUI agents with far less annotation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07027","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"StainFlow: Entity-Stain Tracking and Evidence Linking for Process Rewards in GUI Agents","primary_cat":"cs.AI","submitted_at":"2026-06-05T08:17:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StainFlow proposes global entity stain tracking and local stain evidence linking modules to improve process rewards for GUI agents, reporting 3.2% relative gain in online RL success and 1.8% in judgment accuracy on AndroidWorld and OGRBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04627","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MIRAGE: Mobile Agents with Implicit Reasoning and Generative World Models","primary_cat":"cs.AI","submitted_at":"2026-06-03T09:01:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MIRAGE compresses explicit chain-of-thought into latent vectors and adds a generative world model to predict future interface states, matching explicit reasoning performance with 3-5x fewer tokens on Android benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01414","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Agent Skills Should Go Beyond Text: The Case for Visual Skills","primary_cat":"cs.CV","submitted_at":"2026-05-31T19:22:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper proposes that reusable agent skills should incorporate visual elements alongside text, introduces three forms of visual skills and an automatic conversion system, and reports better performance on GUI and visual-centric tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30884","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GUI-C$^2$: Coarse-to-Fine GUI Grounding via Difficulty-Aware Reinforcement Learning","primary_cat":"cs.CV","submitted_at":"2026-05-29T06:17:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GUI-C² pairs a difficulty-scoring data pipeline with an area-gated coarse-to-fine RL mechanism to improve GUI grounding accuracy and training stability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16883","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SE-GA: Memory-Augmented Self-Evolution for GUI Agents","primary_cat":"cs.LG","submitted_at":"2026-05-16T08:51:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SE-GA combines Test-Time Memory Extension for dynamic context retrieval with Memory-Augmented Self-Evolution training to reach 89.0% on ScreenSpot and 75.8% on AndroidControl-High.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14311","ref_index":81,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Binary: Reframing GUI Critique as Continuous Semantic Alignment","primary_cat":"cs.LG","submitted_at":"2026-05-14T03:23:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BBCritic reframes GUI critique as continuous semantic alignment via contrastive learning in an affordance space, outperforming larger binary SOTA models on a new four-level hierarchical benchmark without extra annotations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12501","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Covering Human Action Space for Computer Use: Data Synthesis and Benchmark","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:59:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Presents CUActSpot benchmark and renderer-LLM data synthesis that lets a 4B model outperform larger open-source models on complex computer interactions.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"Phi-Ground-4B-16C†[12] 2025-07 38.0 24.5 13.5 5.3 6.2 6.2 4.7 2.4 5.0 Uground-V1-2B∗[10] 2024-10 27.1 12.8 14.3 10.5 0.0 9.4 6.2 0.0 5.2 Uground-V1-7B∗[10] 2024-10 31.1 12.9 18.2 18.4 0.0 3.1 9.4 2.4 6.7 OS-Atlas-Base-7B∗[11] 2024-10 18.9 9.0 9.9 15.8 0.0 12.5 10.9 0.0 7.8 InfiGUI-R1-3B [18] 2025-04 45.2 22.0 23.2 23.7 3.1 9.4 7.8 0.0 8.8 UI-Venus-Ground-7B [19] 2025-08 50.8 26.5 24.3 23.7 3.1 18.8 9.4 0.0 11.0 GUI-G2-7B [20] 2025-07 47.5 26.4 21.1 23.7 6.2 15.6 7.8 4.8 11.6 MAI-UI-2B†[22] 2025-12 57.4 30.3 27.1 18.4 3.1 18.8 12.5 9.5 12.5 GUI-Owl-1.5-8B-Think [23] 2026-02 57.6 33.2 24.4 23.7 9.4 18.8 10.9 7.1 14.0 MAI-UI-8B†[22] 2025-12 65.8 40.7 25.1 26.3 18.8 18.8 7.8 4.8 15.3 GUI-Owl-1.5-8B-Instruct [23] 2026-02 71."},{"citing_arxiv_id":"2605.12549","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"What Happens Before Decoding? Prefill Determines GUI Grounding in VLMs","primary_cat":"cs.CV","submitted_at":"2026-05-10T07:04:07+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GUI grounding in VLMs is bottlenecked by prefill-stage candidate selection that decoding cannot fix, so Re-Prefill uses attention to extract and re-inject target tokens for up to 4.3% gains on ScreenSpot-Pro.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02630","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AutoFocus: Uncertainty-Aware Active Visual Search for GUI Grounding","primary_cat":"cs.CV","submitted_at":"2026-05-04T14:18:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AutoFocus converts token perplexity into an anisotropic Gaussian uncertainty field to drive region proposals and shape-aware zooming for improved GUI grounding in VLMs.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"2, where we analyze the relationship between token-level perplexity and grounding correctness across multiple backbone models. As shown, incor- rect predictions consistently exhibit significantly higher axial perplexity than correct ones, with clear distributional separation in both histogram and box- plot statistics. This trend holds across general-purpose (Qwen2.5-VL [1]) and GUI-specialized models (UI-Venus [8], GTA1 [37]), indicating that perplexity serves as a reliable intrinsic proxy for spatial uncertainty. These findings moti- vate a principled refinement strategy: convert token-level perplexity into explicit spatial variance and guide adaptive visual search accordingly. Specifically,AutoFocusfirstdetermineswhetherrefinementisnecessarybased on the model's initial prediction and its associated uncertainty."},{"citing_arxiv_id":"2604.24348","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OS-SPEAR: A Toolkit for the Safety, Performance,Efficiency, and Robustness Analysis of OS Agents","primary_cat":"cs.CL","submitted_at":"2026-04-27T11:44:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OS-SPEAR is a new evaluation toolkit that tests 22 OS agents and identifies trade-offs between efficiency and safety or robustness.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":", smartphones, computers, tablets, and smart cockpits). OS agents primarily interact with the underlying system through GUIs to achieve end-to-end task automation. From the perspective of OS agent construction, OS agents can be divided into two categories based on the number of agents constructed: single-agent and multi-agent systems. For single-agent [19], [20], [45], during the pre-training phase, the OS agent acquires the ability to understand and 3 TABLE I COMPARISON OFOS-SPEARWITH DIFFERENT BENCHMARKS. OS-SPEARPROVIDES MORE COMPREHENSIVE EVALUATION DIMENSIONS AND OFFERS A TOOL FOR GENERATING ASSESSMENT REPORTS. Representative Works Safety Performance Efficiency Robustness Assessment Report [19], [44], [49], [57]-[76]✗ ✓ ✗ ✗ ✗"},{"citing_arxiv_id":"2604.22558","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SOLAR-RL: Semi-Online Long-horizon Assignment Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-24T13:53:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SOLAR-RL assigns dense step-level rewards from static trajectory data by detecting first failure points and applying target-aligned shaping to improve long-horizon GUI task completion without full online interactions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20796","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLaDA2.0-Uni: Unifying Multimodal Understanding and Generation with Diffusion Large Language Model","primary_cat":"cs.CV","submitted_at":"2026-04-22T17:20:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLaDA2.0-Uni unifies multimodal understanding and generation inside one discrete diffusion large language model with a semantic tokenizer, MoE backbone, and diffusion decoder.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14113","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"UI-Zoomer: Uncertainty-Driven Adaptive Zoom-In for GUI Grounding","primary_cat":"cs.CV","submitted_at":"2026-04-15T17:32:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UI-Zoomer uses uncertainty quantification to trigger and size adaptive zoom-ins only on uncertain GUI grounding predictions, yielding up to 13.4% gains on benchmarks with no training.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"75N⌋candidates whose centers lie closest to the median center ˜z: di =∥zi−˜z∥2,K= arg topK i {−di},(6) wherez i denotes the center ofbi. We compute subsequent statistics overK. 3.4.2 Variance decomposition. We model the unknown target locationZas a latent random variable and apply the law of total variance coordinate-wise: Var(Z) = Var(E[Z|I])   vinter +E[Var(Z|I)]   vintra .(7) The inter-sample term captures positional disagreement across draws: vinter = 1 K ∑ i∈K (zi−µ)⊙2,µ= 1 K ∑ i∈K zi.(8) The intra-sample term encodes the predicted scale of each element. Treating each box as a Gaussian spanning ±2σof its width and height: vintra = 1 K ∑ i∈K (si 4 )⊙2 ,(9) wheres i = [six,s iy]⊤is the width and height ofbi. The two terms are complementary:vinter expands the"},{"citing_arxiv_id":"2604.13035","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SceneCritic: A Symbolic Evaluator for 3D Indoor Scene Synthesis","primary_cat":"cs.CV","submitted_at":"2026-04-14T17:59:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SceneCritic is a symbolic, ontology-grounded evaluator for floor-plan layouts that identifies specific semantic, orientation, and geometric violations and aligns better with human judgments than VLM-based scorers.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Cdesc is a natural language scene description,Crange defines the spatial boundaries, andCobjects lists 9 Table 3|Models categorized by post-training strategy and model parameters. Model Category Post-Training Details Params Qwen3-14B [39] General RL GRPO-style reinforcement learning 14B Qwen3-235B [39] General RL GRPO-style reinforcement learning 235B UI-Venus-Navi-72B [17] General RL GRPO-based reasoning optimization 72B Gemini-2.5-flash [9] RLAIF + RLHF SFT + Reward Model + RL N/A Qwen2.5-VL-7B-MM-RLHF [13] RLHF PPO-style human feedback alignment 7B Qwen2.5-72B-VL [2] RLHF SFT + DPO (preference optimization) 72B LLaMA4 Maverick [1] RLHF SFT + Online RL + DPO∼17B active (MoE) Qwen3-14B-Intuitor-MATH-1EPOCH [45] RLIF Iterative feedback RL (Intuitor) 14B"},{"citing_arxiv_id":"2604.07831","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Are GUI Agents Focused Enough? Automated Distraction via Semantic-level UI Element Injection","primary_cat":"cs.CR","submitted_at":"2026-04-09T05:32:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Semantic-level UI Element Injection distracts GUI agents by overlaying safety-aligned UI elements, achieving up to 4.4x higher attack success rates that transfer across models and create persistent attractors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.26041","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rethinking Token Pruning for Historical Screenshots in GUI Visual Agents: Semantic, Spatial, and Temporal Perspectives","primary_cat":"cs.CV","submitted_at":"2026-03-27T03:21:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Empirical study finds background semantics, random pruning, and recency-based allocation improve token efficiency for GUI visual agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.11724","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"WebTestPilot: Agentic End-to-End Web Testing against Natural Language Specification by Inferring Oracles with Symbolized GUI Elements","primary_cat":"cs.SE","submitted_at":"2026-02-12T08:51:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"WebTestPilot symbolizes GUI elements to infer contextual oracles for end-to-end web testing from natural language specs, reporting 99% task completion and 96% precision/recall on a new bug-injected benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.10371","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AgentProg: Empowering Long-Horizon GUI Agents with Program-Guided Context Management","primary_cat":"cs.AI","submitted_at":"2025-12-11T07:37:38+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AgentProg reframes interaction history as a program with variables and control flow, plus a belief state for partial observability, achieving SOTA success rates on long-horizon GUI benchmarks while baselines degrade.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.21982","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RISK: A Framework for GUI Agents in E-commerce Risk Management","primary_cat":"cs.AI","submitted_at":"2025-09-26T07:05:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RISK introduces a dataset, benchmark, and R1-style RL fine-tuning for GUI agents that achieve 6.8-8.8% offline gains and 70.5% online task success in e-commerce risk management using 7.2% of baseline parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.07553","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VeriOS: Query-Driven Proactive Human-Agent-GUI Interaction for Trustworthy OS Agents","primary_cat":"cs.CL","submitted_at":"2025-09-09T09:46:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VeriOS-Agent is an OS agent that proactively queries humans in untrustworthy scenarios via a query-driven framework and three-stage training, achieving 19.72% higher step-wise success rate over baselines while preserving normal performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}