{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:4OXKXQ4PBCCPHOCVZK2PKPZTA2","short_pith_number":"pith:4OXKXQ4P","canonical_record":{"source":{"id":"2412.14164","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-18T18:58:50Z","cross_cats_sorted":[],"title_canon_sha256":"20c025291f77e7cde39c6acc6ed7ebea9dd3abbcaa26f3b99b95e26f8161a30c","abstract_canon_sha256":"76dfbffcf154571b952855eb877a4fa7e54ec04dfe3914dcbd04f4a201adbe57"},"schema_version":"1.0"},"canonical_sha256":"e3aeabc38f0884f3b855cab4f53f330696ad3beecbe15677a1497d065c6c83d6","source":{"kind":"arxiv","id":"2412.14164","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2412.14164","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2412.14164v1","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.14164","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"4OXKXQ4PBCCP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"4OXKXQ4PBCCPHOCV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"4OXKXQ4P","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:4OXKXQ4PBCCPHOCVZK2PKPZTA2","target":"record","payload":{"canonical_record":{"source":{"id":"2412.14164","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-18T18:58:50Z","cross_cats_sorted":[],"title_canon_sha256":"20c025291f77e7cde39c6acc6ed7ebea9dd3abbcaa26f3b99b95e26f8161a30c","abstract_canon_sha256":"76dfbffcf154571b952855eb877a4fa7e54ec04dfe3914dcbd04f4a201adbe57"},"schema_version":"1.0"},"canonical_sha256":"e3aeabc38f0884f3b855cab4f53f330696ad3beecbe15677a1497d065c6c83d6","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:14.674979Z","signature_b64":"L95J5m+1uDsuxFWQxqYMncs8QOqZFMcD0B0gT0zFw2gyY4jaQBew2aFkx+kbPi+B9HS2SvIl5eWexzi8tGEfBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e3aeabc38f0884f3b855cab4f53f330696ad3beecbe15677a1497d065c6c83d6","last_reissued_at":"2026-05-17T23:38:14.674381Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:14.674381Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2412.14164","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"/iW9qd8Nj1zsUVR+dmAb7+7+nQkbh10v6GYFF2GceQxUqHE8XEUSrf3qiPNvgpErnXo2V0YTbvX/bvWAhkCkDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T21:26:51.168910Z"},"content_sha256":"ae5c1fbfbadf5fb73c0637a0f35ba9ec25e722339040732c11234674fef19c10","schema_version":"1.0","event_id":"sha256:ae5c1fbfbadf5fb73c0637a0f35ba9ec25e722339040732c11234674fef19c10"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:4OXKXQ4PBCCPHOCVZK2PKPZTA2","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"MetaMorph: Multimodal Understanding and Generation via Instruction Tuning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Visual generation ability emerges as a natural byproduct of improved visual understanding in instruction-tuned LLMs.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"David Fan, Jiachen Zhu, Koustuv Sinha, Michael Rabbat, Saining Xie, Shengbang Tong, Xinlei Chen, Yann LeCun, Yunyang Xiong, Zhuang Liu","submitted_at":"2024-12-18T18:58:50Z","abstract_excerpt":"In this work, we propose Visual-Predictive Instruction Tuning (VPiT) - a simple and effective extension to visual instruction tuning that enables a pretrained LLM to quickly morph into an unified autoregressive model capable of generating both text and visual tokens. VPiT teaches an LLM to predict discrete text tokens and continuous visual tokens from any input sequence of image and text data curated in an instruction-following format. Our empirical investigation reveals several intriguing properties of VPiT: (1) visual generation ability emerges as a natural byproduct of improved visual under"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"visual generation ability emerges as a natural byproduct of improved visual understanding, and can be unlocked efficiently with a small amount of generation data","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the curated instruction-following multimodal datasets are sufficient to reveal general emergence of generation from understanding and that results will transfer beyond the specific models and data mixtures tested.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"VPiT enables pretrained LLMs to perform both visual understanding and generation by predicting discrete text tokens and continuous visual tokens, with understanding data proving more effective than generation-specific data.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Visual generation ability emerges as a natural byproduct of improved visual understanding in instruction-tuned LLMs.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"05e3e8b72041543185682f573e78aa9d0dadf9912ae8421d658cbfec816c7586"},"source":{"id":"2412.14164","kind":"arxiv","version":1},"verdict":{"id":"63a59b41-11c4-4055-bd70-76d4fd244fa8","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T07:46:11.714221Z","strongest_claim":"visual generation ability emerges as a natural byproduct of improved visual understanding, and can be unlocked efficiently with a small amount of generation data","one_line_summary":"VPiT enables pretrained LLMs to perform both visual understanding and generation by predicting discrete text tokens and continuous visual tokens, with understanding data proving more effective than generation-specific data.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the curated instruction-following multimodal datasets are sufficient to reveal general emergence of generation from understanding and that results will transfer beyond the specific models and data mixtures tested.","pith_extraction_headline":"Visual generation ability emerges as a natural byproduct of improved visual understanding in instruction-tuned LLMs."},"references":{"count":282,"sample":[{"doi":"","year":2024,"title":"Llama 3 model card","work_id":"ac426759-fc95-4576-90e9-cad354462c5a","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Flamingo: a visual language model for few-shot learning","work_id":"90ed68c9-b335-4721-acb0-6953c1542432","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"ICML 2024 Tutorial: Physics of Language Models , 2024","work_id":"0cc1e582-245f-4283-bff2-fda235cc7dda","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Anthropic. Claude, 2024","work_id":"99efe9ce-d918-4dfa-8967-d5987496475d","ref_index":6,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2016,"title":"Jimmy Lei Ba, Jamie Kiros, and Geoffrey E. Hinton. Layer normalization. In NeurIPS, 2016","work_id":"47de663f-c464-4369-8f56-87c0dc0e9e56","ref_index":7,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":282,"snapshot_sha256":"3e2432c0bd6d74a623fc5e755abdc806532744a2b623f563cb874029a29bc76c","internal_anchors":32},"formal_canon":{"evidence_count":2,"snapshot_sha256":"0132b43be650f9339225ad3466de67150b3c9e69708ec744be1ef730980b8430"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"63a59b41-11c4-4055-bd70-76d4fd244fa8"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"J9w/Im6X2LWME76PRLiNE2CIJhEwu+pyXiv+1FZH1R9KqbkdmSXI91ziB12k0oz/NDw7Bj9uD/o6xWSZ2WLkBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T21:26:51.169452Z"},"content_sha256":"3e747481f7c9afa65b3b1477f60c00007cf478875357973c1a23de9987e9a147","schema_version":"1.0","event_id":"sha256:3e747481f7c9afa65b3b1477f60c00007cf478875357973c1a23de9987e9a147"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/4OXKXQ4PBCCPHOCVZK2PKPZTA2/bundle.json","state_url":"https://pith.science/pith/4OXKXQ4PBCCPHOCVZK2PKPZTA2/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/4OXKXQ4PBCCPHOCVZK2PKPZTA2/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-01T21:26:51Z","links":{"resolver":"https://pith.science/pith/4OXKXQ4PBCCPHOCVZK2PKPZTA2","bundle":"https://pith.science/pith/4OXKXQ4PBCCPHOCVZK2PKPZTA2/bundle.json","state":"https://pith.science/pith/4OXKXQ4PBCCPHOCVZK2PKPZTA2/state.json","well_known_bundle":"https://pith.science/.well-known/pith/4OXKXQ4PBCCPHOCVZK2PKPZTA2/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:4OXKXQ4PBCCPHOCVZK2PKPZTA2","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"76dfbffcf154571b952855eb877a4fa7e54ec04dfe3914dcbd04f4a201adbe57","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-18T18:58:50Z","title_canon_sha256":"20c025291f77e7cde39c6acc6ed7ebea9dd3abbcaa26f3b99b95e26f8161a30c"},"schema_version":"1.0","source":{"id":"2412.14164","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2412.14164","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2412.14164v1","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.14164","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"4OXKXQ4PBCCP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"4OXKXQ4PBCCPHOCV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"4OXKXQ4P","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:3e747481f7c9afa65b3b1477f60c00007cf478875357973c1a23de9987e9a147","target":"graph","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"visual generation ability emerges as a natural byproduct of improved visual understanding, and can be unlocked efficiently with a small amount of generation data"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the curated instruction-following multimodal datasets are sufficient to reveal general emergence of generation from understanding and that results will transfer beyond the specific models and data mixtures tested."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"VPiT enables pretrained LLMs to perform both visual understanding and generation by predicting discrete text tokens and continuous visual tokens, with understanding data proving more effective than generation-specific data."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Visual generation ability emerges as a natural byproduct of improved visual understanding in instruction-tuned LLMs."}],"snapshot_sha256":"05e3e8b72041543185682f573e78aa9d0dadf9912ae8421d658cbfec816c7586"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"0132b43be650f9339225ad3466de67150b3c9e69708ec744be1ef730980b8430"},"paper":{"abstract_excerpt":"In this work, we propose Visual-Predictive Instruction Tuning (VPiT) - a simple and effective extension to visual instruction tuning that enables a pretrained LLM to quickly morph into an unified autoregressive model capable of generating both text and visual tokens. VPiT teaches an LLM to predict discrete text tokens and continuous visual tokens from any input sequence of image and text data curated in an instruction-following format. Our empirical investigation reveals several intriguing properties of VPiT: (1) visual generation ability emerges as a natural byproduct of improved visual under","authors_text":"David Fan, Jiachen Zhu, Koustuv Sinha, Michael Rabbat, Saining Xie, Shengbang Tong, Xinlei Chen, Yann LeCun, Yunyang Xiong, Zhuang Liu","cross_cats":[],"headline":"Visual generation ability emerges as a natural byproduct of improved visual understanding in instruction-tuned LLMs.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-18T18:58:50Z","title":"MetaMorph: Multimodal Understanding and Generation via Instruction Tuning"},"references":{"count":282,"internal_anchors":32,"resolved_work":282,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Llama 3 model card","work_id":"ac426759-fc95-4576-90e9-cad354462c5a","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Flamingo: a visual language model for few-shot learning","work_id":"90ed68c9-b335-4721-acb0-6953c1542432","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"ICML 2024 Tutorial: Physics of Language Models , 2024","work_id":"0cc1e582-245f-4283-bff2-fda235cc7dda","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":6,"title":"Anthropic. Claude, 2024","work_id":"99efe9ce-d918-4dfa-8967-d5987496475d","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":7,"title":"Jimmy Lei Ba, Jamie Kiros, and Geoffrey E. Hinton. Layer normalization. In NeurIPS, 2016","work_id":"47de663f-c464-4369-8f56-87c0dc0e9e56","year":2016}],"snapshot_sha256":"3e2432c0bd6d74a623fc5e755abdc806532744a2b623f563cb874029a29bc76c"},"source":{"id":"2412.14164","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-17T07:46:11.714221Z","id":"63a59b41-11c4-4055-bd70-76d4fd244fa8","model_set":{"reader":"grok-4.3"},"one_line_summary":"VPiT enables pretrained LLMs to perform both visual understanding and generation by predicting discrete text tokens and continuous visual tokens, with understanding data proving more effective than generation-specific data.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Visual generation ability emerges as a natural byproduct of improved visual understanding in instruction-tuned LLMs.","strongest_claim":"visual generation ability emerges as a natural byproduct of improved visual understanding, and can be unlocked efficiently with a small amount of generation data","weakest_assumption":"That the curated instruction-following multimodal datasets are sufficient to reveal general emergence of generation from understanding and that results will transfer beyond the specific models and data mixtures tested."}},"verdict_id":"63a59b41-11c4-4055-bd70-76d4fd244fa8"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:ae5c1fbfbadf5fb73c0637a0f35ba9ec25e722339040732c11234674fef19c10","target":"record","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"76dfbffcf154571b952855eb877a4fa7e54ec04dfe3914dcbd04f4a201adbe57","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-12-18T18:58:50Z","title_canon_sha256":"20c025291f77e7cde39c6acc6ed7ebea9dd3abbcaa26f3b99b95e26f8161a30c"},"schema_version":"1.0","source":{"id":"2412.14164","kind":"arxiv","version":1}},"canonical_sha256":"e3aeabc38f0884f3b855cab4f53f330696ad3beecbe15677a1497d065c6c83d6","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"e3aeabc38f0884f3b855cab4f53f330696ad3beecbe15677a1497d065c6c83d6","first_computed_at":"2026-05-17T23:38:14.674381Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:14.674381Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"L95J5m+1uDsuxFWQxqYMncs8QOqZFMcD0B0gT0zFw2gyY4jaQBew2aFkx+kbPi+B9HS2SvIl5eWexzi8tGEfBA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:14.674979Z","signed_message":"canonical_sha256_bytes"},"source_id":"2412.14164","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:ae5c1fbfbadf5fb73c0637a0f35ba9ec25e722339040732c11234674fef19c10","sha256:3e747481f7c9afa65b3b1477f60c00007cf478875357973c1a23de9987e9a147"],"state_sha256":"8023e7d2c3fa8588dbbb0d898be1be238643d15e88b34ed486dab13a52d97a01"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"3OR9T25aZmHMen5QspZ5SW07H3YPScE5gRa3kW7EDjxks/+frfF3PQTI0u8vsfIBJEzqSOcq43SagifxBgU9BA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-01T21:26:51.171984Z","bundle_sha256":"f6f1d07ac34c3a71bf238d72e469b2bf6d87d94e24831aaaae7c8424c8e90ffa"}}