{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:P5757S2FAQZH6IQQ5IHWWOOFL3","short_pith_number":"pith:P5757S2F","schema_version":"1.0","canonical_sha256":"7f7fdfcb4504327f2210ea0f6b39c55ef4f94485d11d21be6ce5220ca3ce7076","source":{"kind":"arxiv","id":"2606.25760","version":1},"attestation_state":"computed","paper":{"title":"Uncertainty Quantification for Computer-Use Agents: A Benchmark across Vision-Language Models and GUI Grounding Datasets","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.CV"],"primary_cat":"cs.LG","authors_text":"Amanda Sofie Rios, Amit Ranjan Trivedi, Devashri Naik, Divake Kumar, Nilesh Ahuja, Omesh Tickoo, Ranganath Krishnan, Sina Tayebati","submitted_at":"2026-06-24T12:34:28Z","abstract_excerpt":"Computer-use agents turn vision-language model (VLM) predictions into executable GUI clicks, so reliable uncertainty estimates are essential for rejection, calibration, miss-severity ranking, and spatial safety regions. Yet evidence on post-hoc uncertainty quantification (UQ) for these agents is fragmented across isolated model and dataset pairs, leaving it unclear whether UQ rankings stay stable when the agent, benchmark, or observable interface changes. We present Argus, a cross-regime benchmark for post-hoc UQ in single-step executable GUI grounding: a 27-method open-weight matrix over 4 VL"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.25760","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-24T12:34:28Z","cross_cats_sorted":["cs.AI","cs.CL","cs.CV"],"title_canon_sha256":"e319c029fb6a5bf5c8b13c61ac27c3bc79c772050064516e4eab08d295c53d0d","abstract_canon_sha256":"8a1d01a43cde7d12a187907b2cd4222e2d0e3a235ed5790d976949da75c71fa9"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-25T01:18:14.529026Z","signature_b64":"A3IwKPYx5ieTOcyIEIZy9OIGXv7KD8Wyp88mGZNlTORxxBMJx8/ACuZMcYk8+XYUywb1Wx+ZEIw8C24HvikcDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7f7fdfcb4504327f2210ea0f6b39c55ef4f94485d11d21be6ce5220ca3ce7076","last_reissued_at":"2026-06-25T01:18:14.528682Z","signature_status":"signed_v1","first_computed_at":"2026-06-25T01:18:14.528682Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Uncertainty Quantification for Computer-Use Agents: A Benchmark across Vision-Language Models and GUI Grounding Datasets","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.CV"],"primary_cat":"cs.LG","authors_text":"Amanda Sofie Rios, Amit Ranjan Trivedi, Devashri Naik, Divake Kumar, Nilesh Ahuja, Omesh Tickoo, Ranganath Krishnan, Sina Tayebati","submitted_at":"2026-06-24T12:34:28Z","abstract_excerpt":"Computer-use agents turn vision-language model (VLM) predictions into executable GUI clicks, so reliable uncertainty estimates are essential for rejection, calibration, miss-severity ranking, and spatial safety regions. Yet evidence on post-hoc uncertainty quantification (UQ) for these agents is fragmented across isolated model and dataset pairs, leaving it unclear whether UQ rankings stay stable when the agent, benchmark, or observable interface changes. We present Argus, a cross-regime benchmark for post-hoc UQ in single-step executable GUI grounding: a 27-method open-weight matrix over 4 VL"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.25760","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.25760/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.25760","created_at":"2026-06-25T01:18:14.528743+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.25760v1","created_at":"2026-06-25T01:18:14.528743+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.25760","created_at":"2026-06-25T01:18:14.528743+00:00"},{"alias_kind":"pith_short_12","alias_value":"P5757S2FAQZH","created_at":"2026-06-25T01:18:14.528743+00:00"},{"alias_kind":"pith_short_16","alias_value":"P5757S2FAQZH6IQQ","created_at":"2026-06-25T01:18:14.528743+00:00"},{"alias_kind":"pith_short_8","alias_value":"P5757S2F","created_at":"2026-06-25T01:18:14.528743+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/P5757S2FAQZH6IQQ5IHWWOOFL3","json":"https://pith.science/pith/P5757S2FAQZH6IQQ5IHWWOOFL3.json","graph_json":"https://pith.science/api/pith-number/P5757S2FAQZH6IQQ5IHWWOOFL3/graph.json","events_json":"https://pith.science/api/pith-number/P5757S2FAQZH6IQQ5IHWWOOFL3/events.json","paper":"https://pith.science/paper/P5757S2F"},"agent_actions":{"view_html":"https://pith.science/pith/P5757S2FAQZH6IQQ5IHWWOOFL3","download_json":"https://pith.science/pith/P5757S2FAQZH6IQQ5IHWWOOFL3.json","view_paper":"https://pith.science/paper/P5757S2F","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.25760&json=true","fetch_graph":"https://pith.science/api/pith-number/P5757S2FAQZH6IQQ5IHWWOOFL3/graph.json","fetch_events":"https://pith.science/api/pith-number/P5757S2FAQZH6IQQ5IHWWOOFL3/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/P5757S2FAQZH6IQQ5IHWWOOFL3/action/timestamp_anchor","attest_storage":"https://pith.science/pith/P5757S2FAQZH6IQQ5IHWWOOFL3/action/storage_attestation","attest_author":"https://pith.science/pith/P5757S2FAQZH6IQQ5IHWWOOFL3/action/author_attestation","sign_citation":"https://pith.science/pith/P5757S2FAQZH6IQQ5IHWWOOFL3/action/citation_signature","submit_replication":"https://pith.science/pith/P5757S2FAQZH6IQQ5IHWWOOFL3/action/replication_record"}},"created_at":"2026-06-25T01:18:14.528743+00:00","updated_at":"2026-06-25T01:18:14.528743+00:00"}