{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:Y4CLHGPBMYXO2I43ZRXH3JHZIW","short_pith_number":"pith:Y4CLHGPB","canonical_record":{"source":{"id":"2401.16158","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-01-29T13:46:37Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"5407557842b8ba6a148ad7928ff7cadb44b03a92568e5c205a0cb85d50c4bb59","abstract_canon_sha256":"5504b8dbd1e360bf1b287603693ad18ad6df3cca39f17125188ae2229f9375cc"},"schema_version":"1.0"},"canonical_sha256":"c704b399e1662eed239bcc6e7da4f945b172336b19d0aba74b44dc1737aaad43","source":{"kind":"arxiv","id":"2401.16158","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2401.16158","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"arxiv_version","alias_value":"2401.16158v2","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.16158","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"pith_short_12","alias_value":"Y4CLHGPBMYXO","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"Y4CLHGPBMYXO2I43","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"Y4CLHGPB","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:Y4CLHGPBMYXO2I43ZRXH3JHZIW","target":"record","payload":{"canonical_record":{"source":{"id":"2401.16158","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-01-29T13:46:37Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"5407557842b8ba6a148ad7928ff7cadb44b03a92568e5c205a0cb85d50c4bb59","abstract_canon_sha256":"5504b8dbd1e360bf1b287603693ad18ad6df3cca39f17125188ae2229f9375cc"},"schema_version":"1.0"},"canonical_sha256":"c704b399e1662eed239bcc6e7da4f945b172336b19d0aba74b44dc1737aaad43","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:46.128501Z","signature_b64":"wpo+yrtGd/rjs2Dpdrkd+mpSj9oSPhwVCoIOc17HRSov0I0H+qnHDXsxGIYj0C9OelZO5y6WV/puxZx0JJwyBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c704b399e1662eed239bcc6e7da4f945b172336b19d0aba74b44dc1737aaad43","last_reissued_at":"2026-05-17T23:38:46.127931Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:46.127931Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2401.16158","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"bk6VH8q96ArFJSz+PuT68n2ermr9zl9iL32WBa1xqxb50WTV5ih4bzn6v6A+ZHbbqF/M1BdWiL1gE9UjxxFJAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T20:25:52.222746Z"},"content_sha256":"f2bb783c98314b0237e30689637eb0d0f65e43f3fb66ddfe65a04911b7e2e371","schema_version":"1.0","event_id":"sha256:f2bb783c98314b0237e30689637eb0d0f65e43f3fb66ddfe65a04911b7e2e371"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:Y4CLHGPBMYXO2I43ZRXH3JHZIW","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Mobile-Agent: Autonomous Multi-Modal Mobile Device Agent with Visual Perception","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Mobile-Agent operates mobile apps by visually identifying screen elements instead of using system metadata.","cross_cats":["cs.CV"],"primary_cat":"cs.CL","authors_text":"Fei Huang, Haiyang Xu, Jiabo Ye, Jitao Sang, Ji Zhang, Junyang Wang, Ming Yan, Weizhou Shen","submitted_at":"2024-01-29T13:46:37Z","abstract_excerpt":"Mobile device agent based on Multimodal Large Language Models (MLLM) is becoming a popular application. In this paper, we introduce Mobile-Agent, an autonomous multi-modal mobile device agent. Mobile-Agent first leverages visual perception tools to accurately identify and locate both the visual and textual elements within the app's front-end interface. Based on the perceived vision context, it then autonomously plans and decomposes the complex operation task, and navigates the mobile Apps through operations step by step. Different from previous solutions that rely on XML files of Apps or mobil"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Mobile-Agent achieved remarkable accuracy and completion rates. Even with challenging instructions, such as multi-app operations, Mobile-Agent can still complete the requirements.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That visual perception tools can accurately and reliably identify and locate both visual and textual elements within diverse app front-end interfaces across different mobile operating environments without significant errors or the need for system-specific adjustments.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Mobile-Agent is a vision-centric autonomous agent that uses MLLMs to perceive UI elements, plan complex multi-step tasks, and operate mobile apps without relying on XML or system metadata, showing strong results on the introduced Mobile-Eval benchmark.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Mobile-Agent operates mobile apps by visually identifying screen elements instead of using system metadata.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"413527ad113fac4d1f27850618b74ece8b22afa696c8b61534091b8a82977398"},"source":{"id":"2401.16158","kind":"arxiv","version":2},"verdict":{"id":"db266f45-9fc6-4eec-b69b-9cfffcd7a501","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T00:14:45.002300Z","strongest_claim":"Mobile-Agent achieved remarkable accuracy and completion rates. Even with challenging instructions, such as multi-app operations, Mobile-Agent can still complete the requirements.","one_line_summary":"Mobile-Agent is a vision-centric autonomous agent that uses MLLMs to perceive UI elements, plan complex multi-step tasks, and operate mobile apps without relying on XML or system metadata, showing strong results on the introduced Mobile-Eval benchmark.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That visual perception tools can accurately and reliably identify and locate both visual and textual elements within diverse app front-end interfaces across different mobile operating environments without significant errors or the need for system-specific adjustments.","pith_extraction_headline":"Mobile-Agent operates mobile apps by visually identifying screen elements instead of using system metadata."},"references":{"count":13,"sample":[{"doi":"","year":null,"title":"Modelscope-agent: Building your customizable agent system with open-source large language models","work_id":"2327fb2d-530b-44f6-972c-1ea9cb6b8c3d","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Controlllm: Augment language models with tools by searching on graphs","work_id":"5a9fd1ba-c4e9-4185-ad1e-5e587998b78a","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models","work_id":"b06ebfb8-5543-4f2c-af49-c05c4e63fc45","ref_index":3,"cited_arxiv_id":"2303.04671","is_internal_anchor":true},{"doi":"","year":null,"title":"Gpt4tools: Teaching large lan- guage model to use tools via self-instruction","work_id":"260a71e5-f66b-4679-a4f4-2d3778841d09","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"MM-REACT: Prompting ChatGPT for Multimodal Reasoning and Action","work_id":"6dc43db8-227d-438e-8658-0c8acecba08a","ref_index":5,"cited_arxiv_id":"2303.11381","is_internal_anchor":true}],"resolved_work":13,"snapshot_sha256":"dcf915dec29c09442bb834231a4fec65806641cbb60eaec2f24a6467794ec2d3","internal_anchors":7},"formal_canon":{"evidence_count":2,"snapshot_sha256":"b0eccfe4921bae5c5aa763af5155b57745223683a218ecfd6b59535b936a29ea"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"db266f45-9fc6-4eec-b69b-9cfffcd7a501"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"g+22rKuStXnLAqI7D22bbbzSWqzfz8PUcKXB0jdfKauzkAyIvuv1X6UeRohiytthvtzg4Q7BIgxPfzlUXNuUDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T20:25:52.223597Z"},"content_sha256":"8938de6cc09b3865fdc442fee0b89d18a89ae1eb15edffd8e9b8e87c7a688d63","schema_version":"1.0","event_id":"sha256:8938de6cc09b3865fdc442fee0b89d18a89ae1eb15edffd8e9b8e87c7a688d63"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/Y4CLHGPBMYXO2I43ZRXH3JHZIW/bundle.json","state_url":"https://pith.science/pith/Y4CLHGPBMYXO2I43ZRXH3JHZIW/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/Y4CLHGPBMYXO2I43ZRXH3JHZIW/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-18T20:25:52Z","links":{"resolver":"https://pith.science/pith/Y4CLHGPBMYXO2I43ZRXH3JHZIW","bundle":"https://pith.science/pith/Y4CLHGPBMYXO2I43ZRXH3JHZIW/bundle.json","state":"https://pith.science/pith/Y4CLHGPBMYXO2I43ZRXH3JHZIW/state.json","well_known_bundle":"https://pith.science/.well-known/pith/Y4CLHGPBMYXO2I43ZRXH3JHZIW/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:Y4CLHGPBMYXO2I43ZRXH3JHZIW","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5504b8dbd1e360bf1b287603693ad18ad6df3cca39f17125188ae2229f9375cc","cross_cats_sorted":["cs.CV"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-01-29T13:46:37Z","title_canon_sha256":"5407557842b8ba6a148ad7928ff7cadb44b03a92568e5c205a0cb85d50c4bb59"},"schema_version":"1.0","source":{"id":"2401.16158","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2401.16158","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"arxiv_version","alias_value":"2401.16158v2","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.16158","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"pith_short_12","alias_value":"Y4CLHGPBMYXO","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"Y4CLHGPBMYXO2I43","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"Y4CLHGPB","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8938de6cc09b3865fdc442fee0b89d18a89ae1eb15edffd8e9b8e87c7a688d63","target":"graph","created_at":"2026-05-17T23:38:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Mobile-Agent achieved remarkable accuracy and completion rates. Even with challenging instructions, such as multi-app operations, Mobile-Agent can still complete the requirements."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That visual perception tools can accurately and reliably identify and locate both visual and textual elements within diverse app front-end interfaces across different mobile operating environments without significant errors or the need for system-specific adjustments."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Mobile-Agent is a vision-centric autonomous agent that uses MLLMs to perceive UI elements, plan complex multi-step tasks, and operate mobile apps without relying on XML or system metadata, showing strong results on the introduced Mobile-Eval benchmark."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Mobile-Agent operates mobile apps by visually identifying screen elements instead of using system metadata."}],"snapshot_sha256":"413527ad113fac4d1f27850618b74ece8b22afa696c8b61534091b8a82977398"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"b0eccfe4921bae5c5aa763af5155b57745223683a218ecfd6b59535b936a29ea"},"paper":{"abstract_excerpt":"Mobile device agent based on Multimodal Large Language Models (MLLM) is becoming a popular application. In this paper, we introduce Mobile-Agent, an autonomous multi-modal mobile device agent. Mobile-Agent first leverages visual perception tools to accurately identify and locate both the visual and textual elements within the app's front-end interface. Based on the perceived vision context, it then autonomously plans and decomposes the complex operation task, and navigates the mobile Apps through operations step by step. Different from previous solutions that rely on XML files of Apps or mobil","authors_text":"Fei Huang, Haiyang Xu, Jiabo Ye, Jitao Sang, Ji Zhang, Junyang Wang, Ming Yan, Weizhou Shen","cross_cats":["cs.CV"],"headline":"Mobile-Agent operates mobile apps by visually identifying screen elements instead of using system metadata.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-01-29T13:46:37Z","title":"Mobile-Agent: Autonomous Multi-Modal Mobile Device Agent with Visual Perception"},"references":{"count":13,"internal_anchors":7,"resolved_work":13,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Modelscope-agent: Building your customizable agent system with open-source large language models","work_id":"2327fb2d-530b-44f6-972c-1ea9cb6b8c3d","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Controlllm: Augment language models with tools by searching on graphs","work_id":"5a9fd1ba-c4e9-4185-ad1e-5e587998b78a","year":null},{"cited_arxiv_id":"2303.04671","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models","work_id":"b06ebfb8-5543-4f2c-af49-c05c4e63fc45","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Gpt4tools: Teaching large lan- guage model to use tools via self-instruction","work_id":"260a71e5-f66b-4679-a4f4-2d3778841d09","year":null},{"cited_arxiv_id":"2303.11381","doi":"","is_internal_anchor":true,"ref_index":5,"title":"MM-REACT: Prompting ChatGPT for Multimodal Reasoning and Action","work_id":"6dc43db8-227d-438e-8658-0c8acecba08a","year":null}],"snapshot_sha256":"dcf915dec29c09442bb834231a4fec65806641cbb60eaec2f24a6467794ec2d3"},"source":{"id":"2401.16158","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-17T00:14:45.002300Z","id":"db266f45-9fc6-4eec-b69b-9cfffcd7a501","model_set":{"reader":"grok-4.3"},"one_line_summary":"Mobile-Agent is a vision-centric autonomous agent that uses MLLMs to perceive UI elements, plan complex multi-step tasks, and operate mobile apps without relying on XML or system metadata, showing strong results on the introduced Mobile-Eval benchmark.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Mobile-Agent operates mobile apps by visually identifying screen elements instead of using system metadata.","strongest_claim":"Mobile-Agent achieved remarkable accuracy and completion rates. Even with challenging instructions, such as multi-app operations, Mobile-Agent can still complete the requirements.","weakest_assumption":"That visual perception tools can accurately and reliably identify and locate both visual and textual elements within diverse app front-end interfaces across different mobile operating environments without significant errors or the need for system-specific adjustments."}},"verdict_id":"db266f45-9fc6-4eec-b69b-9cfffcd7a501"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f2bb783c98314b0237e30689637eb0d0f65e43f3fb66ddfe65a04911b7e2e371","target":"record","created_at":"2026-05-17T23:38:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5504b8dbd1e360bf1b287603693ad18ad6df3cca39f17125188ae2229f9375cc","cross_cats_sorted":["cs.CV"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-01-29T13:46:37Z","title_canon_sha256":"5407557842b8ba6a148ad7928ff7cadb44b03a92568e5c205a0cb85d50c4bb59"},"schema_version":"1.0","source":{"id":"2401.16158","kind":"arxiv","version":2}},"canonical_sha256":"c704b399e1662eed239bcc6e7da4f945b172336b19d0aba74b44dc1737aaad43","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c704b399e1662eed239bcc6e7da4f945b172336b19d0aba74b44dc1737aaad43","first_computed_at":"2026-05-17T23:38:46.127931Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:46.127931Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"wpo+yrtGd/rjs2Dpdrkd+mpSj9oSPhwVCoIOc17HRSov0I0H+qnHDXsxGIYj0C9OelZO5y6WV/puxZx0JJwyBw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:46.128501Z","signed_message":"canonical_sha256_bytes"},"source_id":"2401.16158","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f2bb783c98314b0237e30689637eb0d0f65e43f3fb66ddfe65a04911b7e2e371","sha256:8938de6cc09b3865fdc442fee0b89d18a89ae1eb15edffd8e9b8e87c7a688d63"],"state_sha256":"8f7e80bca669cf3fa0429613719aaa2276661e23504efabee7990bbabc50b7ec"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"HeBKE2SWdcTyCpsboqQ1kZXKg6tjWwy1w0woXwW3ebs4g5Zj9HcQTSqSIWpa5atE1SOopXoPYwfjKfDEJpkKAA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-18T20:25:52.226073Z","bundle_sha256":"e99288d91ca4bebc43350e1558a0e62d9f0b6b326a79b92b1219577ae69b943a"}}