{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:72RVQLYD5V7J3NSCVLZN5J7KM5","short_pith_number":"pith:72RVQLYD","schema_version":"1.0","canonical_sha256":"fea3582f03ed7e9db642aaf2dea7ea676b9d3ed0575a18ef4c03510b64944f00","source":{"kind":"arxiv","id":"2607.00020","version":1},"attestation_state":"computed","paper":{"title":"EmbodimentSemantic: A Spatial Scene-Graph Dataset and Benchmark for Vision-Language Models on Embodied Manipulation Trajectories","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.RO","authors_text":"Christopher E. Mower, Haitham Bou-Ammar, Hassan Jaber, Luca Cagliero, Refinath S N","submitted_at":"2026-06-06T18:58:54Z","abstract_excerpt":"Spatial grounding remains a key limitation of vision-language-action (VLA) systems for robotic manipulation. While current models can recognize objects and follow language instructions, they often lack an explicit representation of how objects are arranged in space, including support, containment, ordering, occlusion, and depth-sensitive relations. We introduce EmbodimentSemantic, a spatial scene-graph dataset and benchmark for evaluating relational grounding in embodied manipulation. EmbodimentSemantic represents scenes as directed object-relation-object triplets, where each triplet specifies"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2607.00020","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2026-06-06T18:58:54Z","cross_cats_sorted":[],"title_canon_sha256":"b07203564904b476a974919938095166ff8adb10766df6ab9b9fa17ae3cf35b7","abstract_canon_sha256":"4c68745571e6732c370b5a9f5421be4cb6b3ed29d2b8da892b4b0fe393a9a968"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-02T00:18:31.417702Z","signature_b64":"GHx2y1LCYk4f2/4GKoIsixqyhN3+AK1UrDpD6kPEwhscAoYRdcbRsEw2Bh5TLwXe7Ioz8/J0p1Nkh1zNWSUPCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fea3582f03ed7e9db642aaf2dea7ea676b9d3ed0575a18ef4c03510b64944f00","last_reissued_at":"2026-07-02T00:18:31.416888Z","signature_status":"signed_v1","first_computed_at":"2026-07-02T00:18:31.416888Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"EmbodimentSemantic: A Spatial Scene-Graph Dataset and Benchmark for Vision-Language Models on Embodied Manipulation Trajectories","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.RO","authors_text":"Christopher E. Mower, Haitham Bou-Ammar, Hassan Jaber, Luca Cagliero, Refinath S N","submitted_at":"2026-06-06T18:58:54Z","abstract_excerpt":"Spatial grounding remains a key limitation of vision-language-action (VLA) systems for robotic manipulation. While current models can recognize objects and follow language instructions, they often lack an explicit representation of how objects are arranged in space, including support, containment, ordering, occlusion, and depth-sensitive relations. We introduce EmbodimentSemantic, a spatial scene-graph dataset and benchmark for evaluating relational grounding in embodied manipulation. EmbodimentSemantic represents scenes as directed object-relation-object triplets, where each triplet specifies"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2607.00020","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2607.00020/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2607.00020","created_at":"2026-07-02T00:18:31.417037+00:00"},{"alias_kind":"arxiv_version","alias_value":"2607.00020v1","created_at":"2026-07-02T00:18:31.417037+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2607.00020","created_at":"2026-07-02T00:18:31.417037+00:00"},{"alias_kind":"pith_short_12","alias_value":"72RVQLYD5V7J","created_at":"2026-07-02T00:18:31.417037+00:00"},{"alias_kind":"pith_short_16","alias_value":"72RVQLYD5V7J3NSC","created_at":"2026-07-02T00:18:31.417037+00:00"},{"alias_kind":"pith_short_8","alias_value":"72RVQLYD","created_at":"2026-07-02T00:18:31.417037+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/72RVQLYD5V7J3NSCVLZN5J7KM5","json":"https://pith.science/pith/72RVQLYD5V7J3NSCVLZN5J7KM5.json","graph_json":"https://pith.science/api/pith-number/72RVQLYD5V7J3NSCVLZN5J7KM5/graph.json","events_json":"https://pith.science/api/pith-number/72RVQLYD5V7J3NSCVLZN5J7KM5/events.json","paper":"https://pith.science/paper/72RVQLYD"},"agent_actions":{"view_html":"https://pith.science/pith/72RVQLYD5V7J3NSCVLZN5J7KM5","download_json":"https://pith.science/pith/72RVQLYD5V7J3NSCVLZN5J7KM5.json","view_paper":"https://pith.science/paper/72RVQLYD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2607.00020&json=true","fetch_graph":"https://pith.science/api/pith-number/72RVQLYD5V7J3NSCVLZN5J7KM5/graph.json","fetch_events":"https://pith.science/api/pith-number/72RVQLYD5V7J3NSCVLZN5J7KM5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/72RVQLYD5V7J3NSCVLZN5J7KM5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/72RVQLYD5V7J3NSCVLZN5J7KM5/action/storage_attestation","attest_author":"https://pith.science/pith/72RVQLYD5V7J3NSCVLZN5J7KM5/action/author_attestation","sign_citation":"https://pith.science/pith/72RVQLYD5V7J3NSCVLZN5J7KM5/action/citation_signature","submit_replication":"https://pith.science/pith/72RVQLYD5V7J3NSCVLZN5J7KM5/action/replication_record"}},"created_at":"2026-07-02T00:18:31.417037+00:00","updated_at":"2026-07-02T00:18:31.417037+00:00"}