{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:RNG35ORSVDDFXKASBFKKD72PU4","short_pith_number":"pith:RNG35ORS","schema_version":"1.0","canonical_sha256":"8b4dbeba32a8c65ba8120954a1ff4fa72f16c9718128dd30bed48b3d1e6fa914","source":{"kind":"arxiv","id":"2606.12817","version":1},"attestation_state":"computed","paper":{"title":"Teach-and-Repeat: Accurately Extracting Operational Knowledge from Mobile Screen Demonstrations to Empower GUI Agents","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"(2) The Chinese University of Hong Kong, China), Daoyang Liu (2), Hong Kong, Jiawei Liu (1), Lei Hu (1), Ltd, Xingyu Liu (1), Yangfan Luo (1), Yudong Zhang (1), Zhilin Gao (1) ((1) Honor Device Co., Zuojian Wang (1)","submitted_at":"2026-06-11T02:24:39Z","abstract_excerpt":"Understanding the digital world on mobile devices is shifting from static UI perception to dynamic action comprehension. This capability enables models to convert visual state transitions into operational knowledge, defined as short natural-language sentences that describe action types, target UI elements, textual arguments, and execution orders. However, due to the highly diverse and heterogeneous UI designs across applications, existing vision-language models (VLMs) struggle to accurately infer these underlying operations. To bridge this gap, we introduce Teach VLM, a core model designed to "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.12817","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-06-11T02:24:39Z","cross_cats_sorted":[],"title_canon_sha256":"b49828cd12dfb49b9d6f3904f4270c745505deca945778905ff1e7bcd84f8782","abstract_canon_sha256":"e831f72fa2acf2994e246a699ce81d0f6018469da7d1c008fddd27d5b25cb2d0"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-12T01:08:52.569640Z","signature_b64":"Io0yUCliQEl/wFN+mBUEPEyjAxxX2+F/paV26OODNEfmGSZa0cw+OoQgYNCvvnSEoxYX2Do6PNp9jhDOzTSRBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8b4dbeba32a8c65ba8120954a1ff4fa72f16c9718128dd30bed48b3d1e6fa914","last_reissued_at":"2026-06-12T01:08:52.568760Z","signature_status":"signed_v1","first_computed_at":"2026-06-12T01:08:52.568760Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Teach-and-Repeat: Accurately Extracting Operational Knowledge from Mobile Screen Demonstrations to Empower GUI Agents","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"(2) The Chinese University of Hong Kong, China), Daoyang Liu (2), Hong Kong, Jiawei Liu (1), Lei Hu (1), Ltd, Xingyu Liu (1), Yangfan Luo (1), Yudong Zhang (1), Zhilin Gao (1) ((1) Honor Device Co., Zuojian Wang (1)","submitted_at":"2026-06-11T02:24:39Z","abstract_excerpt":"Understanding the digital world on mobile devices is shifting from static UI perception to dynamic action comprehension. This capability enables models to convert visual state transitions into operational knowledge, defined as short natural-language sentences that describe action types, target UI elements, textual arguments, and execution orders. However, due to the highly diverse and heterogeneous UI designs across applications, existing vision-language models (VLMs) struggle to accurately infer these underlying operations. To bridge this gap, we introduce Teach VLM, a core model designed to "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.12817","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.12817/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.12817","created_at":"2026-06-12T01:08:52.568907+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.12817v1","created_at":"2026-06-12T01:08:52.568907+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.12817","created_at":"2026-06-12T01:08:52.568907+00:00"},{"alias_kind":"pith_short_12","alias_value":"RNG35ORSVDDF","created_at":"2026-06-12T01:08:52.568907+00:00"},{"alias_kind":"pith_short_16","alias_value":"RNG35ORSVDDFXKAS","created_at":"2026-06-12T01:08:52.568907+00:00"},{"alias_kind":"pith_short_8","alias_value":"RNG35ORS","created_at":"2026-06-12T01:08:52.568907+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/RNG35ORSVDDFXKASBFKKD72PU4","json":"https://pith.science/pith/RNG35ORSVDDFXKASBFKKD72PU4.json","graph_json":"https://pith.science/api/pith-number/RNG35ORSVDDFXKASBFKKD72PU4/graph.json","events_json":"https://pith.science/api/pith-number/RNG35ORSVDDFXKASBFKKD72PU4/events.json","paper":"https://pith.science/paper/RNG35ORS"},"agent_actions":{"view_html":"https://pith.science/pith/RNG35ORSVDDFXKASBFKKD72PU4","download_json":"https://pith.science/pith/RNG35ORSVDDFXKASBFKKD72PU4.json","view_paper":"https://pith.science/paper/RNG35ORS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.12817&json=true","fetch_graph":"https://pith.science/api/pith-number/RNG35ORSVDDFXKASBFKKD72PU4/graph.json","fetch_events":"https://pith.science/api/pith-number/RNG35ORSVDDFXKASBFKKD72PU4/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/RNG35ORSVDDFXKASBFKKD72PU4/action/timestamp_anchor","attest_storage":"https://pith.science/pith/RNG35ORSVDDFXKASBFKKD72PU4/action/storage_attestation","attest_author":"https://pith.science/pith/RNG35ORSVDDFXKASBFKKD72PU4/action/author_attestation","sign_citation":"https://pith.science/pith/RNG35ORSVDDFXKASBFKKD72PU4/action/citation_signature","submit_replication":"https://pith.science/pith/RNG35ORSVDDFXKASBFKKD72PU4/action/replication_record"}},"created_at":"2026-06-12T01:08:52.568907+00:00","updated_at":"2026-06-12T01:08:52.568907+00:00"}