{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:LBNIFEJUZMTBQLSBNHPI6MOZKD","short_pith_number":"pith:LBNIFEJU","canonical_record":{"source":{"id":"2605.12957","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:43:02Z","cross_cats_sorted":[],"title_canon_sha256":"921cfe4edaa25919cb9d3d57457337900aa02a066643659230508a414daa01fb","abstract_canon_sha256":"9199662d2c2859de0f801a7e8b85791b55289e115a41ebbd7eea1f17dfb21783"},"schema_version":"1.0"},"canonical_sha256":"585a829134cb26182e4169de8f31d950d0e84fd767f478809a839a4d2e1efe7b","source":{"kind":"arxiv","id":"2605.12957","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12957","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12957v1","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12957","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"pith_short_12","alias_value":"LBNIFEJUZMTB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"LBNIFEJUZMTBQLSB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"LBNIFEJU","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:LBNIFEJUZMTBQLSBNHPI6MOZKD","target":"record","payload":{"canonical_record":{"source":{"id":"2605.12957","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:43:02Z","cross_cats_sorted":[],"title_canon_sha256":"921cfe4edaa25919cb9d3d57457337900aa02a066643659230508a414daa01fb","abstract_canon_sha256":"9199662d2c2859de0f801a7e8b85791b55289e115a41ebbd7eea1f17dfb21783"},"schema_version":"1.0"},"canonical_sha256":"585a829134cb26182e4169de8f31d950d0e84fd767f478809a839a4d2e1efe7b","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:09:09.274414Z","signature_b64":"LagLI+vfHujoeljIKWlNK8kNTAE9G9M2BI+X7KDJxvoyA/DsrOzDR1bSnLesxHoULAhoSZzarXCtFq0vnBlMAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"585a829134cb26182e4169de8f31d950d0e84fd767f478809a839a4d2e1efe7b","last_reissued_at":"2026-05-18T03:09:09.273688Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:09:09.273688Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.12957","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:09Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"hxkdk5hV6dudd16fxpjDd4E4eegIx9ppux1swzf0PGk0t+9x5W7SL7xL3RUZseOBT8QAQEX4z5ZzgPleUxn5Dw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-04T05:37:54.252289Z"},"content_sha256":"fa2b11c6d378b7e2c09e1c8319cb39668588b26cc8ea2b3f76fe3f38dc66a44c","schema_version":"1.0","event_id":"sha256:fa2b11c6d378b7e2c09e1c8319cb39668588b26cc8ea2b3f76fe3f38dc66a44c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:LBNIFEJUZMTBQLSBNHPI6MOZKD","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"GTA: Advancing Image-to-3D World Generation via Geometry Then Appearance Video Diffusion","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"GTA generates 3D worlds from single images by first creating coarse geometry then synthesizing appearance with separate video diffusion models.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Cong Wang, Hanxin Zhu, Jiayi Luo, Peiyan Tu, Tianyu He, Xin Jin, Zhibo Chen","submitted_at":"2026-05-13T03:43:02Z","abstract_excerpt":"Recent developments in generative models and large-scale datasets have substantially advanced 3D world generation, facilitating a broad range of domains including spatial intelligence, embodied intelligence, and autonomous driving. While achieving remarkable progress, existing approaches to 3D world generation typically prioritize appearance prediction with limited modeling of the underlying geometry, leading to issues such as unreliable scene structure estimation and degraded cross-view consistency. To address these limitations, motivated by the coarse-to-fine nature of human visual perceptio"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"GTA adopts a two-stage framework with two dedicated video diffusion models, which first generate coarse geometric structure from novel viewpoints and then synthesize fine-grained appearance conditioned on the predicted geometry.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That separating geometry generation from appearance synthesis in a coarse-to-fine video diffusion pipeline will reliably improve structural fidelity and cross-view consistency without introducing new inconsistencies.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"GTA generates 3D worlds from single images via a two-stage video diffusion process that prioritizes geometry before appearance to improve structural consistency.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"GTA generates 3D worlds from single images by first creating coarse geometry then synthesizing appearance with separate video diffusion models.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"a62790fb147a673d5fdaa5b00293f8fe491b77277e4879f24fd46042e83eb9b0"},"source":{"id":"2605.12957","kind":"arxiv","version":1},"verdict":{"id":"3291c294-2268-497d-b2bf-ed4bdca36779","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T19:35:05.350881Z","strongest_claim":"GTA adopts a two-stage framework with two dedicated video diffusion models, which first generate coarse geometric structure from novel viewpoints and then synthesize fine-grained appearance conditioned on the predicted geometry.","one_line_summary":"GTA generates 3D worlds from single images via a two-stage video diffusion process that prioritizes geometry before appearance to improve structural consistency.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That separating geometry generation from appearance synthesis in a coarse-to-fine video diffusion pipeline will reliably improve structural fidelity and cross-view consistency without introducing new inconsistencies.","pith_extraction_headline":"GTA generates 3D worlds from single images by first creating coarse geometry then synthesizing appearance with separate video diffusion models."},"references":{"count":87,"sample":[{"doi":"","year":2006,"title":"In: Proceedings of the First International Conference on Computer Vision Theory and Applications, pp","work_id":"1049d4e3-1462-47b2-bdb6-b02c4116ae81","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Advances in 3d generation: A survey","work_id":"5e518ad7-f98d-40e8-9b03-847e2ecb266a","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp","work_id":"e2d3ffde-f71b-4b91-95ac-9eae123393f9","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Artificial Intelligence Review56(9), 9175–9219 (2023)","work_id":"b88f7f1b-dc14-4655-b0cb-ed48eae5c305","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"3d scene genera- tion: A survey","work_id":"a1a77d70-f8f9-4076-add3-65d0886016fe","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":87,"snapshot_sha256":"5b7541b047f25304dc398ee2be93ab559600018e44e4955c476fe470dfeab840","internal_anchors":10},"formal_canon":{"evidence_count":2,"snapshot_sha256":"688edabb54f996601d4488add3980ac976670143151e3160ab02aa1b581c65f2"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"3291c294-2268-497d-b2bf-ed4bdca36779"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:09Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"zn2jhr7VC8MR4i8xGaS/JcM4qHrdqm7S/dIU4Qn3fGf5HifpND3O8RmiEha0jGo07WJQwJdLRNw8+M/X6cpJBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-04T05:37:54.252970Z"},"content_sha256":"41774442466d4bd3e9e86441d7df80eb0babf49d6f5445525e5703f64f1e8764","schema_version":"1.0","event_id":"sha256:41774442466d4bd3e9e86441d7df80eb0babf49d6f5445525e5703f64f1e8764"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/LBNIFEJUZMTBQLSBNHPI6MOZKD/bundle.json","state_url":"https://pith.science/pith/LBNIFEJUZMTBQLSBNHPI6MOZKD/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/LBNIFEJUZMTBQLSBNHPI6MOZKD/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-04T05:37:54Z","links":{"resolver":"https://pith.science/pith/LBNIFEJUZMTBQLSBNHPI6MOZKD","bundle":"https://pith.science/pith/LBNIFEJUZMTBQLSBNHPI6MOZKD/bundle.json","state":"https://pith.science/pith/LBNIFEJUZMTBQLSBNHPI6MOZKD/state.json","well_known_bundle":"https://pith.science/.well-known/pith/LBNIFEJUZMTBQLSBNHPI6MOZKD/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:LBNIFEJUZMTBQLSBNHPI6MOZKD","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"9199662d2c2859de0f801a7e8b85791b55289e115a41ebbd7eea1f17dfb21783","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:43:02Z","title_canon_sha256":"921cfe4edaa25919cb9d3d57457337900aa02a066643659230508a414daa01fb"},"schema_version":"1.0","source":{"id":"2605.12957","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12957","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12957v1","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12957","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"pith_short_12","alias_value":"LBNIFEJUZMTB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"LBNIFEJUZMTBQLSB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"LBNIFEJU","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:41774442466d4bd3e9e86441d7df80eb0babf49d6f5445525e5703f64f1e8764","target":"graph","created_at":"2026-05-18T03:09:09Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"GTA adopts a two-stage framework with two dedicated video diffusion models, which first generate coarse geometric structure from novel viewpoints and then synthesize fine-grained appearance conditioned on the predicted geometry."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That separating geometry generation from appearance synthesis in a coarse-to-fine video diffusion pipeline will reliably improve structural fidelity and cross-view consistency without introducing new inconsistencies."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"GTA generates 3D worlds from single images via a two-stage video diffusion process that prioritizes geometry before appearance to improve structural consistency."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"GTA generates 3D worlds from single images by first creating coarse geometry then synthesizing appearance with separate video diffusion models."}],"snapshot_sha256":"a62790fb147a673d5fdaa5b00293f8fe491b77277e4879f24fd46042e83eb9b0"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"688edabb54f996601d4488add3980ac976670143151e3160ab02aa1b581c65f2"},"paper":{"abstract_excerpt":"Recent developments in generative models and large-scale datasets have substantially advanced 3D world generation, facilitating a broad range of domains including spatial intelligence, embodied intelligence, and autonomous driving. While achieving remarkable progress, existing approaches to 3D world generation typically prioritize appearance prediction with limited modeling of the underlying geometry, leading to issues such as unreliable scene structure estimation and degraded cross-view consistency. To address these limitations, motivated by the coarse-to-fine nature of human visual perceptio","authors_text":"Cong Wang, Hanxin Zhu, Jiayi Luo, Peiyan Tu, Tianyu He, Xin Jin, Zhibo Chen","cross_cats":[],"headline":"GTA generates 3D worlds from single images by first creating coarse geometry then synthesizing appearance with separate video diffusion models.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:43:02Z","title":"GTA: Advancing Image-to-3D World Generation via Geometry Then Appearance Video Diffusion"},"references":{"count":87,"internal_anchors":10,"resolved_work":87,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"In: Proceedings of the First International Conference on Computer Vision Theory and Applications, pp","work_id":"1049d4e3-1462-47b2-bdb6-b02c4116ae81","year":2006},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Advances in 3d generation: A survey","work_id":"5e518ad7-f98d-40e8-9b03-847e2ecb266a","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp","work_id":"e2d3ffde-f71b-4b91-95ac-9eae123393f9","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Artificial Intelligence Review56(9), 9175–9219 (2023)","work_id":"b88f7f1b-dc14-4655-b0cb-ed48eae5c305","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"3d scene genera- tion: A survey","work_id":"a1a77d70-f8f9-4076-add3-65d0886016fe","year":2025}],"snapshot_sha256":"5b7541b047f25304dc398ee2be93ab559600018e44e4955c476fe470dfeab840"},"source":{"id":"2605.12957","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T19:35:05.350881Z","id":"3291c294-2268-497d-b2bf-ed4bdca36779","model_set":{"reader":"grok-4.3"},"one_line_summary":"GTA generates 3D worlds from single images via a two-stage video diffusion process that prioritizes geometry before appearance to improve structural consistency.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"GTA generates 3D worlds from single images by first creating coarse geometry then synthesizing appearance with separate video diffusion models.","strongest_claim":"GTA adopts a two-stage framework with two dedicated video diffusion models, which first generate coarse geometric structure from novel viewpoints and then synthesize fine-grained appearance conditioned on the predicted geometry.","weakest_assumption":"That separating geometry generation from appearance synthesis in a coarse-to-fine video diffusion pipeline will reliably improve structural fidelity and cross-view consistency without introducing new inconsistencies."}},"verdict_id":"3291c294-2268-497d-b2bf-ed4bdca36779"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:fa2b11c6d378b7e2c09e1c8319cb39668588b26cc8ea2b3f76fe3f38dc66a44c","target":"record","created_at":"2026-05-18T03:09:09Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"9199662d2c2859de0f801a7e8b85791b55289e115a41ebbd7eea1f17dfb21783","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:43:02Z","title_canon_sha256":"921cfe4edaa25919cb9d3d57457337900aa02a066643659230508a414daa01fb"},"schema_version":"1.0","source":{"id":"2605.12957","kind":"arxiv","version":1}},"canonical_sha256":"585a829134cb26182e4169de8f31d950d0e84fd767f478809a839a4d2e1efe7b","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"585a829134cb26182e4169de8f31d950d0e84fd767f478809a839a4d2e1efe7b","first_computed_at":"2026-05-18T03:09:09.273688Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:09:09.273688Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"LagLI+vfHujoeljIKWlNK8kNTAE9G9M2BI+X7KDJxvoyA/DsrOzDR1bSnLesxHoULAhoSZzarXCtFq0vnBlMAA==","signature_status":"signed_v1","signed_at":"2026-05-18T03:09:09.274414Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.12957","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:fa2b11c6d378b7e2c09e1c8319cb39668588b26cc8ea2b3f76fe3f38dc66a44c","sha256:41774442466d4bd3e9e86441d7df80eb0babf49d6f5445525e5703f64f1e8764"],"state_sha256":"916d91f07e2ab09566a1ed72db4480bf1b71336864a0e9d78afc25462515a633"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"MmB8QlH6kt1ZdQ238FVEoftGNhMY5vrvPxZx9QB+YszRgEllXu/yyUvkhlHS97yKfsM+wmKDC6zg6qYyK0N7Dw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-04T05:37:54.256997Z","bundle_sha256":"84e8b74ec2ee8a94e15c45a004964362c8dff3c665931d180ea2fcab481d9da9"}}