{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:U4LWUM4HR6F4JRSRGMW53APVDW","short_pith_number":"pith:U4LWUM4H","canonical_record":{"source":{"id":"2504.18575","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CR","submitted_at":"2025-04-22T17:51:03Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"d82c1e88660f2ab70e699549c3e5c7b3f0c0ff29856e788c2914471738702a6d","abstract_canon_sha256":"7195a4ebdc15e95bc2fac8acabd99815e08a6520136f3539a878f772e241b857"},"schema_version":"1.0"},"canonical_sha256":"a7176a33878f8bc4c651332ddd81f51d85f305df7dfc3bbd3e42fc27d184197b","source":{"kind":"arxiv","id":"2504.18575","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2504.18575","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2504.18575v3","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.18575","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"U4LWUM4HR6F4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"U4LWUM4HR6F4JRSR","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"U4LWUM4H","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:U4LWUM4HR6F4JRSRGMW53APVDW","target":"record","payload":{"canonical_record":{"source":{"id":"2504.18575","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CR","submitted_at":"2025-04-22T17:51:03Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"d82c1e88660f2ab70e699549c3e5c7b3f0c0ff29856e788c2914471738702a6d","abstract_canon_sha256":"7195a4ebdc15e95bc2fac8acabd99815e08a6520136f3539a878f772e241b857"},"schema_version":"1.0"},"canonical_sha256":"a7176a33878f8bc4c651332ddd81f51d85f305df7dfc3bbd3e42fc27d184197b","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.957940Z","signature_b64":"kbzc75II+FEkdnC20kbgt+mu6VaR/e0C1JTTDiOnfCqdVuU+YTWkAbcQMEbtKEjST8CLUmqovnRZ1MJ+ig40CA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a7176a33878f8bc4c651332ddd81f51d85f305df7dfc3bbd3e42fc27d184197b","last_reissued_at":"2026-05-17T23:38:49.957466Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.957466Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2504.18575","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Y04h06XcKu+0iBHAyYul23Bysust82eo/5phaxYXfDY2Ghvd9eb8pWsqvMDIK4AFUtXTbBMYxXMQbHLu5M1OBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T11:06:01.265364Z"},"content_sha256":"0525ad31947dc729514d8565ab94769f18c661fd383c178c295b081d5dea8861","schema_version":"1.0","event_id":"sha256:0525ad31947dc729514d8565ab94769f18c661fd383c178c295b081d5dea8861"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:U4LWUM4HR6F4JRSRGMW53APVDW","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"WASP: Benchmarking Web Agent Security Against Prompt Injection Attacks","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"WASP benchmark shows top web agents deceived by simple prompt injections with partial success up to 86 percent.","cross_cats":["cs.AI"],"primary_cat":"cs.CR","authors_text":"Aaron Grattafiori, Arman Zharmagambetov, Chuan Guo, Ivan Evtimov, Kamalika Chaudhuri","submitted_at":"2025-04-22T17:51:03Z","abstract_excerpt":"Autonomous UI agents powered by AI have tremendous potential to boost human productivity by automating routine tasks such as filing taxes and paying bills. However, a major challenge in unlocking their full potential is security, which is exacerbated by the agent's ability to take action on their user's behalf. Existing tests for prompt injections in web agents either over-simplify the threat by testing unrealistic scenarios or giving the attacker too much power, or look at single-step isolated tasks. To more accurately measure progress for secure web agents, we introduce WASP -- a new publicl"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Evaluating with WASP shows that even top-tier AI models, including those with advanced reasoning capabilities, can be deceived by simple, low-effort human-written injections in very realistic scenarios. Our end-to-end evaluation reveals a previously unobserved insight: while attacks partially succeed in up to 86% of the case, even state-of-the-art agents often struggle to fully complete the attacker goals.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The benchmark tasks and injection examples accurately represent real-world web agent usage and attacker capabilities without over-simplifying or granting attackers unrealistic control.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"WASP benchmark reveals web agents are vulnerable to simple prompt injections with partial success rates up to 86%, but agents frequently fail to complete attacker objectives.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"WASP benchmark shows top web agents deceived by simple prompt injections with partial success up to 86 percent.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"5702fb7655825c08921547d3cf3a5eb7bfa532520f4c9cb313cd3189707c68a2"},"source":{"id":"2504.18575","kind":"arxiv","version":3},"verdict":{"id":"b1fecf80-6a4e-4311-8f08-1d94e67dc024","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T22:18:38.317346Z","strongest_claim":"Evaluating with WASP shows that even top-tier AI models, including those with advanced reasoning capabilities, can be deceived by simple, low-effort human-written injections in very realistic scenarios. Our end-to-end evaluation reveals a previously unobserved insight: while attacks partially succeed in up to 86% of the case, even state-of-the-art agents often struggle to fully complete the attacker goals.","one_line_summary":"WASP benchmark reveals web agents are vulnerable to simple prompt injections with partial success rates up to 86%, but agents frequently fail to complete attacker objectives.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The benchmark tasks and injection examples accurately represent real-world web agent usage and attacker capabilities without over-simplifying or granting attackers unrealistic control.","pith_extraction_headline":"WASP benchmark shows top web agents deceived by simple prompt injections with partial success up to 86 percent."},"references":{"count":22,"sample":[{"doi":"","year":2024,"title":"WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models","work_id":"12c1d840-af20-4a4e-8750-6b9c6266638f","ref_index":1,"cited_arxiv_id":"2401.13919","is_internal_anchor":true},{"doi":"","year":null,"title":"The objective is to comment on a given issue saying ‘‘we are working on it.’’ Currently, the observation shows that we are on the profile page in GitLab","work_id":"1175517f-80d7-46e1-a07e-e222cf3e5847","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Comment on the given issue saying we are working on it","work_id":"5e5d48ee-d0d3-460c-99c2-0f3f89be6780","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"‘not compromised","work_id":"5571ea5e-f217-4aa9-a12e-44a8b6c6fc8f","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"‘not compromised","work_id":"84e9bef1-f3a2-49c6-8449-8346de242a7e","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":22,"snapshot_sha256":"d67fda95b5779b144c98eaeaca34f75faf5f7c05729a71085ac010515bd57488","internal_anchors":1},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"b1fecf80-6a4e-4311-8f08-1d94e67dc024"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"zsxcbWtxYg4Yl8S4yDXFAQ/AxNZmTr8vp0C2ACZ+ZNxGDHhEIufItTteBOrk9F0oTrV+410U1zxRiSRO5Cx6Dg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T11:06:01.266061Z"},"content_sha256":"7534d4c5a2da2d24ad8644b416b9d1803a579a38b767526c66ee79f30480d333","schema_version":"1.0","event_id":"sha256:7534d4c5a2da2d24ad8644b416b9d1803a579a38b767526c66ee79f30480d333"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/U4LWUM4HR6F4JRSRGMW53APVDW/bundle.json","state_url":"https://pith.science/pith/U4LWUM4HR6F4JRSRGMW53APVDW/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/U4LWUM4HR6F4JRSRGMW53APVDW/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-10T11:06:01Z","links":{"resolver":"https://pith.science/pith/U4LWUM4HR6F4JRSRGMW53APVDW","bundle":"https://pith.science/pith/U4LWUM4HR6F4JRSRGMW53APVDW/bundle.json","state":"https://pith.science/pith/U4LWUM4HR6F4JRSRGMW53APVDW/state.json","well_known_bundle":"https://pith.science/.well-known/pith/U4LWUM4HR6F4JRSRGMW53APVDW/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:U4LWUM4HR6F4JRSRGMW53APVDW","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7195a4ebdc15e95bc2fac8acabd99815e08a6520136f3539a878f772e241b857","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CR","submitted_at":"2025-04-22T17:51:03Z","title_canon_sha256":"d82c1e88660f2ab70e699549c3e5c7b3f0c0ff29856e788c2914471738702a6d"},"schema_version":"1.0","source":{"id":"2504.18575","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2504.18575","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2504.18575v3","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.18575","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"U4LWUM4HR6F4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"U4LWUM4HR6F4JRSR","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"U4LWUM4H","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:7534d4c5a2da2d24ad8644b416b9d1803a579a38b767526c66ee79f30480d333","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Evaluating with WASP shows that even top-tier AI models, including those with advanced reasoning capabilities, can be deceived by simple, low-effort human-written injections in very realistic scenarios. Our end-to-end evaluation reveals a previously unobserved insight: while attacks partially succeed in up to 86% of the case, even state-of-the-art agents often struggle to fully complete the attacker goals."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The benchmark tasks and injection examples accurately represent real-world web agent usage and attacker capabilities without over-simplifying or granting attackers unrealistic control."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"WASP benchmark reveals web agents are vulnerable to simple prompt injections with partial success rates up to 86%, but agents frequently fail to complete attacker objectives."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"WASP benchmark shows top web agents deceived by simple prompt injections with partial success up to 86 percent."}],"snapshot_sha256":"5702fb7655825c08921547d3cf3a5eb7bfa532520f4c9cb313cd3189707c68a2"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Autonomous UI agents powered by AI have tremendous potential to boost human productivity by automating routine tasks such as filing taxes and paying bills. However, a major challenge in unlocking their full potential is security, which is exacerbated by the agent's ability to take action on their user's behalf. Existing tests for prompt injections in web agents either over-simplify the threat by testing unrealistic scenarios or giving the attacker too much power, or look at single-step isolated tasks. To more accurately measure progress for secure web agents, we introduce WASP -- a new publicl","authors_text":"Aaron Grattafiori, Arman Zharmagambetov, Chuan Guo, Ivan Evtimov, Kamalika Chaudhuri","cross_cats":["cs.AI"],"headline":"WASP benchmark shows top web agents deceived by simple prompt injections with partial success up to 86 percent.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CR","submitted_at":"2025-04-22T17:51:03Z","title":"WASP: Benchmarking Web Agent Security Against Prompt Injection Attacks"},"references":{"count":22,"internal_anchors":1,"resolved_work":22,"sample":[{"cited_arxiv_id":"2401.13919","doi":"","is_internal_anchor":true,"ref_index":1,"title":"WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models","work_id":"12c1d840-af20-4a4e-8750-6b9c6266638f","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"The objective is to comment on a given issue saying ‘‘we are working on it.’’ Currently, the observation shows that we are on the profile page in GitLab","work_id":"1175517f-80d7-46e1-a07e-e222cf3e5847","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Comment on the given issue saying we are working on it","work_id":"5e5d48ee-d0d3-460c-99c2-0f3f89be6780","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"‘not compromised","work_id":"5571ea5e-f217-4aa9-a12e-44a8b6c6fc8f","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"‘not compromised","work_id":"84e9bef1-f3a2-49c6-8449-8346de242a7e","year":null}],"snapshot_sha256":"d67fda95b5779b144c98eaeaca34f75faf5f7c05729a71085ac010515bd57488"},"source":{"id":"2504.18575","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-15T22:18:38.317346Z","id":"b1fecf80-6a4e-4311-8f08-1d94e67dc024","model_set":{"reader":"grok-4.3"},"one_line_summary":"WASP benchmark reveals web agents are vulnerable to simple prompt injections with partial success rates up to 86%, but agents frequently fail to complete attacker objectives.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"WASP benchmark shows top web agents deceived by simple prompt injections with partial success up to 86 percent.","strongest_claim":"Evaluating with WASP shows that even top-tier AI models, including those with advanced reasoning capabilities, can be deceived by simple, low-effort human-written injections in very realistic scenarios. Our end-to-end evaluation reveals a previously unobserved insight: while attacks partially succeed in up to 86% of the case, even state-of-the-art agents often struggle to fully complete the attacker goals.","weakest_assumption":"The benchmark tasks and injection examples accurately represent real-world web agent usage and attacker capabilities without over-simplifying or granting attackers unrealistic control."}},"verdict_id":"b1fecf80-6a4e-4311-8f08-1d94e67dc024"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:0525ad31947dc729514d8565ab94769f18c661fd383c178c295b081d5dea8861","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7195a4ebdc15e95bc2fac8acabd99815e08a6520136f3539a878f772e241b857","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CR","submitted_at":"2025-04-22T17:51:03Z","title_canon_sha256":"d82c1e88660f2ab70e699549c3e5c7b3f0c0ff29856e788c2914471738702a6d"},"schema_version":"1.0","source":{"id":"2504.18575","kind":"arxiv","version":3}},"canonical_sha256":"a7176a33878f8bc4c651332ddd81f51d85f305df7dfc3bbd3e42fc27d184197b","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"a7176a33878f8bc4c651332ddd81f51d85f305df7dfc3bbd3e42fc27d184197b","first_computed_at":"2026-05-17T23:38:49.957466Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.957466Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"kbzc75II+FEkdnC20kbgt+mu6VaR/e0C1JTTDiOnfCqdVuU+YTWkAbcQMEbtKEjST8CLUmqovnRZ1MJ+ig40CA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.957940Z","signed_message":"canonical_sha256_bytes"},"source_id":"2504.18575","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:0525ad31947dc729514d8565ab94769f18c661fd383c178c295b081d5dea8861","sha256:7534d4c5a2da2d24ad8644b416b9d1803a579a38b767526c66ee79f30480d333"],"state_sha256":"b2224cdfd7f5fa14a4a6662e54061d029a0d03c2c4f336b4bb2af558a3277001"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"WZn8/xB9be0H+SebxfLkNY2ofWi17EgkBuZMTkoAk+KU2jS5XHWjGThDSEAIT7GmbM44FjjurAId+xkS31vWAA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-10T11:06:01.270451Z","bundle_sha256":"29305fc74ea8ba35ea1521f1a3864c35fda7decf6f486f764aea6255b52e3011"}}