{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:JD2ZQ3EIWO2MYOVWEQJMBIJN7O","short_pith_number":"pith:JD2ZQ3EI","canonical_record":{"source":{"id":"2401.13649","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-01-24T18:35:21Z","cross_cats_sorted":["cs.CL","cs.CV"],"title_canon_sha256":"e72169dc7b8a326afcf8786f234787d837b6ecd811d6f82b47c1099b40105905","abstract_canon_sha256":"09a974b7f3b516863a9fc0ccfb802d41251178a15031c78910b671d935ac6d7f"},"schema_version":"1.0"},"canonical_sha256":"48f5986c88b3b4cc3ab62412c0a12dfb879cad22d6d6ea688bd1aba900c7a54c","source":{"kind":"arxiv","id":"2401.13649","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2401.13649","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2401.13649v2","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.13649","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"JD2ZQ3EIWO2M","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"JD2ZQ3EIWO2MYOVW","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"JD2ZQ3EI","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:JD2ZQ3EIWO2MYOVWEQJMBIJN7O","target":"record","payload":{"canonical_record":{"source":{"id":"2401.13649","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-01-24T18:35:21Z","cross_cats_sorted":["cs.CL","cs.CV"],"title_canon_sha256":"e72169dc7b8a326afcf8786f234787d837b6ecd811d6f82b47c1099b40105905","abstract_canon_sha256":"09a974b7f3b516863a9fc0ccfb802d41251178a15031c78910b671d935ac6d7f"},"schema_version":"1.0"},"canonical_sha256":"48f5986c88b3b4cc3ab62412c0a12dfb879cad22d6d6ea688bd1aba900c7a54c","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.707465Z","signature_b64":"dsIQk03hyfHBS6Aocmq1iBX1SCnCkgUFYGPva3va5FWg1PTc3276Be3URFE7HSW3LQqp0zMuFxF45VOZmM4YBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"48f5986c88b3b4cc3ab62412c0a12dfb879cad22d6d6ea688bd1aba900c7a54c","last_reissued_at":"2026-05-17T23:38:13.706760Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.706760Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2401.13649","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"pvRXjuxtU47H/oYhgwrqfXGSasroWPfODDfo9fJhPncieyEJZgxp2R3+c73pRPbJGf0mKiCJUJKiMKyS5I5vBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-19T01:12:39.129036Z"},"content_sha256":"56ef9ba31f581f868c942f339174259ff4bc6112e6ead812e3a0d9f1c5ddd9c8","schema_version":"1.0","event_id":"sha256:56ef9ba31f581f868c942f339174259ff4bc6112e6ead812e3a0d9f1c5ddd9c8"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:JD2ZQ3EIWO2MYOVWEQJMBIJN7O","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"VisualWebArena: Evaluating Multimodal Agents on Realistic Visual Web Tasks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"VisualWebArena shows that multimodal agents still struggle with visually grounded web tasks.","cross_cats":["cs.CL","cs.CV"],"primary_cat":"cs.LG","authors_text":"Daniel Fried, Graham Neubig, Jing Yu Koh, Lawrence Jang, Ming Chong Lim, Po-Yu Huang, Robert Lo, Ruslan Salakhutdinov, Shuyan Zhou, Vikram Duvvur","submitted_at":"2024-01-24T18:35:21Z","abstract_excerpt":"Autonomous agents capable of planning, reasoning, and executing actions on the web offer a promising avenue for automating computer tasks. However, the majority of existing benchmarks primarily focus on text-based agents, neglecting many natural tasks that require visual information to effectively solve. Given that most computer interfaces cater to human perception, visual information often augments textual data in ways that text-only models struggle to harness effectively. To bridge this gap, we introduce VisualWebArena, a benchmark designed to assess the performance of multimodal web agents "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Through extensive quantitative and qualitative analysis, we identify several limitations of text-only LLM agents, and reveal gaps in the capabilities of state-of-the-art multimodal language agents.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the chosen websites and task templates are sufficiently representative of the visual and interaction challenges encountered in real-world web use.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"VisualWebArena benchmark demonstrates that state-of-the-art multimodal agents still exhibit significant limitations on visually grounded web tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"VisualWebArena shows that multimodal agents still struggle with visually grounded web tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b89ccf53e765443181b358931ae80485701319525bc8efbde018b682bd39cbc1"},"source":{"id":"2401.13649","kind":"arxiv","version":2},"verdict":{"id":"2c45d65f-5efe-472d-bece-749dde7a857e","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T15:15:32.796807Z","strongest_claim":"Through extensive quantitative and qualitative analysis, we identify several limitations of text-only LLM agents, and reveal gaps in the capabilities of state-of-the-art multimodal language agents.","one_line_summary":"VisualWebArena benchmark demonstrates that state-of-the-art multimodal agents still exhibit significant limitations on visually grounded web tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the chosen websites and task templates are sufficiently representative of the visual and interaction challenges encountered in real-world web use.","pith_extraction_headline":"VisualWebArena shows that multimodal agents still struggle with visually grounded web tasks."},"references":{"count":26,"sample":[{"doi":"","year":null,"title":"Scaling Instruction-Finetuned Language Models","work_id":"8405abb1-7558-4fdf-af24-f4c52fa77a06","ref_index":1,"cited_arxiv_id":"2210.11416","is_internal_anchor":true},{"doi":"","year":1996,"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","ref_index":2,"cited_arxiv_id":"2312.11805","is_internal_anchor":true},{"doi":"","year":null,"title":"Language models can solve computer tasks. NeurIPS. Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi","work_id":"de63ec70-7c06-4692-b530-717ead70ef4c","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2014,"title":"Improved Baselines with Visual Instruction Tuning","work_id":"5baeaa33-5986-44a3-85a4-fcabd6fc1e8d","ref_index":4,"cited_arxiv_id":"2310.03744","is_internal_anchor":true},{"doi":"","year":2023,"title":"GAIA: a benchmark for General AI Assistants","work_id":"cf222b33-f7a3-4044-a570-ecfe25edb3f8","ref_index":5,"cited_arxiv_id":"2311.12983","is_internal_anchor":true}],"resolved_work":26,"snapshot_sha256":"f126cd73e6c90be0867058db5995fd496c4fcc8b669627362d7451f1717707df","internal_anchors":5},"formal_canon":{"evidence_count":2,"snapshot_sha256":"d12cf10fc30b389f09f9e2bec561364ef1826a3adee0e8bc033a3971919238af"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"2c45d65f-5efe-472d-bece-749dde7a857e"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"oJYSgIzDw44+6mw3FBsLjVjYu7a9Z/cLRfmCd+LvIpeHuTojux0c7GO5tXGj0HDJ5cMzx5e3PInsnC7kyoILBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-19T01:12:39.129884Z"},"content_sha256":"ac2896eee2f419304979ddf4b8040cddf333165e4ed0359ac81627ca8848e82d","schema_version":"1.0","event_id":"sha256:ac2896eee2f419304979ddf4b8040cddf333165e4ed0359ac81627ca8848e82d"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/JD2ZQ3EIWO2MYOVWEQJMBIJN7O/bundle.json","state_url":"https://pith.science/pith/JD2ZQ3EIWO2MYOVWEQJMBIJN7O/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/JD2ZQ3EIWO2MYOVWEQJMBIJN7O/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-19T01:12:39Z","links":{"resolver":"https://pith.science/pith/JD2ZQ3EIWO2MYOVWEQJMBIJN7O","bundle":"https://pith.science/pith/JD2ZQ3EIWO2MYOVWEQJMBIJN7O/bundle.json","state":"https://pith.science/pith/JD2ZQ3EIWO2MYOVWEQJMBIJN7O/state.json","well_known_bundle":"https://pith.science/.well-known/pith/JD2ZQ3EIWO2MYOVWEQJMBIJN7O/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:JD2ZQ3EIWO2MYOVWEQJMBIJN7O","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"09a974b7f3b516863a9fc0ccfb802d41251178a15031c78910b671d935ac6d7f","cross_cats_sorted":["cs.CL","cs.CV"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-01-24T18:35:21Z","title_canon_sha256":"e72169dc7b8a326afcf8786f234787d837b6ecd811d6f82b47c1099b40105905"},"schema_version":"1.0","source":{"id":"2401.13649","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2401.13649","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2401.13649v2","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.13649","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"JD2ZQ3EIWO2M","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"JD2ZQ3EIWO2MYOVW","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"JD2ZQ3EI","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:ac2896eee2f419304979ddf4b8040cddf333165e4ed0359ac81627ca8848e82d","target":"graph","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Through extensive quantitative and qualitative analysis, we identify several limitations of text-only LLM agents, and reveal gaps in the capabilities of state-of-the-art multimodal language agents."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the chosen websites and task templates are sufficiently representative of the visual and interaction challenges encountered in real-world web use."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"VisualWebArena benchmark demonstrates that state-of-the-art multimodal agents still exhibit significant limitations on visually grounded web tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"VisualWebArena shows that multimodal agents still struggle with visually grounded web tasks."}],"snapshot_sha256":"b89ccf53e765443181b358931ae80485701319525bc8efbde018b682bd39cbc1"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"d12cf10fc30b389f09f9e2bec561364ef1826a3adee0e8bc033a3971919238af"},"paper":{"abstract_excerpt":"Autonomous agents capable of planning, reasoning, and executing actions on the web offer a promising avenue for automating computer tasks. However, the majority of existing benchmarks primarily focus on text-based agents, neglecting many natural tasks that require visual information to effectively solve. Given that most computer interfaces cater to human perception, visual information often augments textual data in ways that text-only models struggle to harness effectively. To bridge this gap, we introduce VisualWebArena, a benchmark designed to assess the performance of multimodal web agents ","authors_text":"Daniel Fried, Graham Neubig, Jing Yu Koh, Lawrence Jang, Ming Chong Lim, Po-Yu Huang, Robert Lo, Ruslan Salakhutdinov, Shuyan Zhou, Vikram Duvvur","cross_cats":["cs.CL","cs.CV"],"headline":"VisualWebArena shows that multimodal agents still struggle with visually grounded web tasks.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-01-24T18:35:21Z","title":"VisualWebArena: Evaluating Multimodal Agents on Realistic Visual Web Tasks"},"references":{"count":26,"internal_anchors":5,"resolved_work":26,"sample":[{"cited_arxiv_id":"2210.11416","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Scaling Instruction-Finetuned Language Models","work_id":"8405abb1-7558-4fdf-af24-f4c52fa77a06","year":null},{"cited_arxiv_id":"2312.11805","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","year":1996},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Language models can solve computer tasks. NeurIPS. Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi","work_id":"de63ec70-7c06-4692-b530-717ead70ef4c","year":null},{"cited_arxiv_id":"2310.03744","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Improved Baselines with Visual Instruction Tuning","work_id":"5baeaa33-5986-44a3-85a4-fcabd6fc1e8d","year":2014},{"cited_arxiv_id":"2311.12983","doi":"","is_internal_anchor":true,"ref_index":5,"title":"GAIA: a benchmark for General AI Assistants","work_id":"cf222b33-f7a3-4044-a570-ecfe25edb3f8","year":2023}],"snapshot_sha256":"f126cd73e6c90be0867058db5995fd496c4fcc8b669627362d7451f1717707df"},"source":{"id":"2401.13649","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-17T15:15:32.796807Z","id":"2c45d65f-5efe-472d-bece-749dde7a857e","model_set":{"reader":"grok-4.3"},"one_line_summary":"VisualWebArena benchmark demonstrates that state-of-the-art multimodal agents still exhibit significant limitations on visually grounded web tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"VisualWebArena shows that multimodal agents still struggle with visually grounded web tasks.","strongest_claim":"Through extensive quantitative and qualitative analysis, we identify several limitations of text-only LLM agents, and reveal gaps in the capabilities of state-of-the-art multimodal language agents.","weakest_assumption":"That the chosen websites and task templates are sufficiently representative of the visual and interaction challenges encountered in real-world web use."}},"verdict_id":"2c45d65f-5efe-472d-bece-749dde7a857e"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:56ef9ba31f581f868c942f339174259ff4bc6112e6ead812e3a0d9f1c5ddd9c8","target":"record","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"09a974b7f3b516863a9fc0ccfb802d41251178a15031c78910b671d935ac6d7f","cross_cats_sorted":["cs.CL","cs.CV"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-01-24T18:35:21Z","title_canon_sha256":"e72169dc7b8a326afcf8786f234787d837b6ecd811d6f82b47c1099b40105905"},"schema_version":"1.0","source":{"id":"2401.13649","kind":"arxiv","version":2}},"canonical_sha256":"48f5986c88b3b4cc3ab62412c0a12dfb879cad22d6d6ea688bd1aba900c7a54c","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"48f5986c88b3b4cc3ab62412c0a12dfb879cad22d6d6ea688bd1aba900c7a54c","first_computed_at":"2026-05-17T23:38:13.706760Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:13.706760Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"dsIQk03hyfHBS6Aocmq1iBX1SCnCkgUFYGPva3va5FWg1PTc3276Be3URFE7HSW3LQqp0zMuFxF45VOZmM4YBw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:13.707465Z","signed_message":"canonical_sha256_bytes"},"source_id":"2401.13649","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:56ef9ba31f581f868c942f339174259ff4bc6112e6ead812e3a0d9f1c5ddd9c8","sha256:ac2896eee2f419304979ddf4b8040cddf333165e4ed0359ac81627ca8848e82d"],"state_sha256":"598411dfdc4e95c65b35716008cc54a2f3b68840ddb38d8fb9c4caf4b7ff5d20"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Prj2cbYuOun7FoQEBtMWTVNeC56rcx1Nw8sJiivg68pDRKL9fvEIageNQmaPklfrI4kn6i3a/CYoDt5c5iVaDg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-19T01:12:39.132355Z","bundle_sha256":"82f0b4376209868b9154146a59b9f28faf02ec8bc545d95e2b6e153a231ac189"}}