{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:B2NA3JF324PQW6LMGLFJ54RVDV","short_pith_number":"pith:B2NA3JF3","canonical_record":{"source":{"id":"2501.07542","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-01-13T18:23:57Z","cross_cats_sorted":["cs.CV","cs.LG"],"title_canon_sha256":"37cfa5d5cb1102bce80a85da6657c8f27044c9a7c4a40196b9aed375a5068f6a","abstract_canon_sha256":"fb9dd12f2813e9529e879c6373319691ab8b3b5b40155a077f85f959d28090e8"},"schema_version":"1.0"},"canonical_sha256":"0e9a0da4bbd71f0b796c32ca9ef2351d549a7882de4070b545bb9a883e501ede","source":{"kind":"arxiv","id":"2501.07542","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2501.07542","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"arxiv_version","alias_value":"2501.07542v1","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2501.07542","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"pith_short_12","alias_value":"B2NA3JF324PQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"B2NA3JF324PQW6LM","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"B2NA3JF3","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:B2NA3JF324PQW6LMGLFJ54RVDV","target":"record","payload":{"canonical_record":{"source":{"id":"2501.07542","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-01-13T18:23:57Z","cross_cats_sorted":["cs.CV","cs.LG"],"title_canon_sha256":"37cfa5d5cb1102bce80a85da6657c8f27044c9a7c4a40196b9aed375a5068f6a","abstract_canon_sha256":"fb9dd12f2813e9529e879c6373319691ab8b3b5b40155a077f85f959d28090e8"},"schema_version":"1.0"},"canonical_sha256":"0e9a0da4bbd71f0b796c32ca9ef2351d549a7882de4070b545bb9a883e501ede","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:46.287855Z","signature_b64":"z4XgmNgyYXpRrVSW2C6VPuHr3LBE1MLgx4gXmmcYDDoGaFlqOoEyTMK4NlRlQCHsjzUMLqJofT7IExxMkicTDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0e9a0da4bbd71f0b796c32ca9ef2351d549a7882de4070b545bb9a883e501ede","last_reissued_at":"2026-05-17T23:38:46.287290Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:46.287290Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2501.07542","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"mECCCdcslWE0LZuIbebwg5pr9YSjq3dNyG4+BWDSFQsY4hGAxK9dVh8UxBlDw+gxkXUeMzfU8e2ouMToBgdtAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-29T22:52:03.776590Z"},"content_sha256":"9caf00a7a2d95f4b4e885224341a24c5a3e38f5becb3f141a24c806b0af32077","schema_version":"1.0","event_id":"sha256:9caf00a7a2d95f4b4e885224341a24c5a3e38f5becb3f141a24c806b0af32077"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:B2NA3JF324PQW6LMGLFJ54RVDV","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Imagine while Reasoning in Space: Multimodal Visualization-of-Thought","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Multimodal models can improve spatial reasoning by generating images that visualize their step-by-step thinking process.","cross_cats":["cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Chengzu Li, Furu Wei, Huanyu Zhang, Ivan Vuli\\'c, Li Dong, Shaoguang Mao, Wenshan Wu, Yan Xia","submitted_at":"2025-01-13T18:23:57Z","abstract_excerpt":"Chain-of-Thought (CoT) prompting has proven highly effective for enhancing complex reasoning in Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs). Yet, it struggles in complex spatial reasoning tasks. Nonetheless, human cognition extends beyond language alone, enabling the remarkable capability to think in both words and images. Inspired by this mechanism, we propose a new reasoning paradigm, Multimodal Visualization-of-Thought (MVoT). It enables visual thinking in MLLMs by generating image visualizations of their reasoning traces. To ensure high-quality visualization, "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experimental results reveal that MVoT demonstrates competitive performance across tasks. Moreover, it exhibits robust and reliable improvements in the most challenging scenarios where CoT fails.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the generated visualizations faithfully capture the model's internal reasoning state and that the token discrepancy loss produces images that actually aid downstream reasoning rather than introducing new errors or hallucinations.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"MVoT lets multimodal models create coherent images during chain-of-thought reasoning via a token discrepancy loss, yielding competitive or better results than text-only CoT on dynamic spatial tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Multimodal models can improve spatial reasoning by generating images that visualize their step-by-step thinking process.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"d05676f43fc17da50ae731b87e4daccbc12a5c3d9bafcd1b11841f0a0db52567"},"source":{"id":"2501.07542","kind":"arxiv","version":1},"verdict":{"id":"e191d9df-90c4-4483-add8-756f05c30152","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T23:05:46.601203Z","strongest_claim":"Experimental results reveal that MVoT demonstrates competitive performance across tasks. Moreover, it exhibits robust and reliable improvements in the most challenging scenarios where CoT fails.","one_line_summary":"MVoT lets multimodal models create coherent images during chain-of-thought reasoning via a token discrepancy loss, yielding competitive or better results than text-only CoT on dynamic spatial tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the generated visualizations faithfully capture the model's internal reasoning state and that the token discrepancy loss produces images that actually aid downstream reasoning rather than introducing new errors or hallucinations.","pith_extraction_headline":"Multimodal models can improve spatial reasoning by generating images that visualize their step-by-step thinking process."},"references":{"count":29,"sample":[{"doi":"","year":null,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","ref_index":1,"cited_arxiv_id":"2303.08774","is_internal_anchor":true},{"doi":"","year":null,"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","ref_index":2,"cited_arxiv_id":"2311.15127","is_internal_anchor":true},{"doi":"","year":null,"title":"[Bro16] G Brockman. Openai gym. arXiv preprint arXiv:1606.01540,","work_id":"6af98f3f-f074-41ae-a689-7dd7b4b8efde","ref_index":3,"cited_arxiv_id":"1606.01540","is_internal_anchor":true},{"doi":"","year":null,"title":"Chameleon: Mixed-Modal Early-Fusion Foundation Models","work_id":"2661b9a6-25cc-41a1-8100-612d2b801289","ref_index":4,"cited_arxiv_id":"2405.09818","is_internal_anchor":true},{"doi":"","year":null,"title":"Anole: An open, autoregressive, native large multimodal models for interleaved image-text generation","work_id":"31ce9d99-2071-41a0-9f51-51b8c5e3ba7e","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":29,"snapshot_sha256":"07d607fc60270b4d125ea7e97eb5b9f59958560e3273587c095f2e7a038a1bf5","internal_anchors":12},"formal_canon":{"evidence_count":2,"snapshot_sha256":"ecc54a3a90f53f8a5a1aad54ca67c7b2d460ccc9f69d49078382d0a037f53e9a"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"e191d9df-90c4-4483-add8-756f05c30152"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Ui/o3MOQOLp464NGSyItUqElyJ5Lvs+7bxA5jwXDErDBVbrNlXh+cQsQ3Utancki5zHmF2JA5J1wwpwosq8NAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-29T22:52:03.777434Z"},"content_sha256":"b181878faff4ec6d1191aa4b40e8d91fd7d1614935a23b52913dd0e54e44cd9d","schema_version":"1.0","event_id":"sha256:b181878faff4ec6d1191aa4b40e8d91fd7d1614935a23b52913dd0e54e44cd9d"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/B2NA3JF324PQW6LMGLFJ54RVDV/bundle.json","state_url":"https://pith.science/pith/B2NA3JF324PQW6LMGLFJ54RVDV/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/B2NA3JF324PQW6LMGLFJ54RVDV/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-29T22:52:03Z","links":{"resolver":"https://pith.science/pith/B2NA3JF324PQW6LMGLFJ54RVDV","bundle":"https://pith.science/pith/B2NA3JF324PQW6LMGLFJ54RVDV/bundle.json","state":"https://pith.science/pith/B2NA3JF324PQW6LMGLFJ54RVDV/state.json","well_known_bundle":"https://pith.science/.well-known/pith/B2NA3JF324PQW6LMGLFJ54RVDV/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:B2NA3JF324PQW6LMGLFJ54RVDV","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"fb9dd12f2813e9529e879c6373319691ab8b3b5b40155a077f85f959d28090e8","cross_cats_sorted":["cs.CV","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-01-13T18:23:57Z","title_canon_sha256":"37cfa5d5cb1102bce80a85da6657c8f27044c9a7c4a40196b9aed375a5068f6a"},"schema_version":"1.0","source":{"id":"2501.07542","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2501.07542","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"arxiv_version","alias_value":"2501.07542v1","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2501.07542","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"pith_short_12","alias_value":"B2NA3JF324PQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"B2NA3JF324PQW6LM","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"B2NA3JF3","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:b181878faff4ec6d1191aa4b40e8d91fd7d1614935a23b52913dd0e54e44cd9d","target":"graph","created_at":"2026-05-17T23:38:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experimental results reveal that MVoT demonstrates competitive performance across tasks. Moreover, it exhibits robust and reliable improvements in the most challenging scenarios where CoT fails."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the generated visualizations faithfully capture the model's internal reasoning state and that the token discrepancy loss produces images that actually aid downstream reasoning rather than introducing new errors or hallucinations."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MVoT lets multimodal models create coherent images during chain-of-thought reasoning via a token discrepancy loss, yielding competitive or better results than text-only CoT on dynamic spatial tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Multimodal models can improve spatial reasoning by generating images that visualize their step-by-step thinking process."}],"snapshot_sha256":"d05676f43fc17da50ae731b87e4daccbc12a5c3d9bafcd1b11841f0a0db52567"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"ecc54a3a90f53f8a5a1aad54ca67c7b2d460ccc9f69d49078382d0a037f53e9a"},"paper":{"abstract_excerpt":"Chain-of-Thought (CoT) prompting has proven highly effective for enhancing complex reasoning in Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs). Yet, it struggles in complex spatial reasoning tasks. Nonetheless, human cognition extends beyond language alone, enabling the remarkable capability to think in both words and images. Inspired by this mechanism, we propose a new reasoning paradigm, Multimodal Visualization-of-Thought (MVoT). It enables visual thinking in MLLMs by generating image visualizations of their reasoning traces. To ensure high-quality visualization, ","authors_text":"Chengzu Li, Furu Wei, Huanyu Zhang, Ivan Vuli\\'c, Li Dong, Shaoguang Mao, Wenshan Wu, Yan Xia","cross_cats":["cs.CV","cs.LG"],"headline":"Multimodal models can improve spatial reasoning by generating images that visualize their step-by-step thinking process.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-01-13T18:23:57Z","title":"Imagine while Reasoning in Space: Multimodal Visualization-of-Thought"},"references":{"count":29,"internal_anchors":12,"resolved_work":29,"sample":[{"cited_arxiv_id":"2303.08774","doi":"","is_internal_anchor":true,"ref_index":1,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","year":null},{"cited_arxiv_id":"2311.15127","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","year":null},{"cited_arxiv_id":"1606.01540","doi":"","is_internal_anchor":true,"ref_index":3,"title":"[Bro16] G Brockman. Openai gym. arXiv preprint arXiv:1606.01540,","work_id":"6af98f3f-f074-41ae-a689-7dd7b4b8efde","year":null},{"cited_arxiv_id":"2405.09818","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Chameleon: Mixed-Modal Early-Fusion Foundation Models","work_id":"2661b9a6-25cc-41a1-8100-612d2b801289","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Anole: An open, autoregressive, native large multimodal models for interleaved image-text generation","work_id":"31ce9d99-2071-41a0-9f51-51b8c5e3ba7e","year":null}],"snapshot_sha256":"07d607fc60270b4d125ea7e97eb5b9f59958560e3273587c095f2e7a038a1bf5"},"source":{"id":"2501.07542","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-16T23:05:46.601203Z","id":"e191d9df-90c4-4483-add8-756f05c30152","model_set":{"reader":"grok-4.3"},"one_line_summary":"MVoT lets multimodal models create coherent images during chain-of-thought reasoning via a token discrepancy loss, yielding competitive or better results than text-only CoT on dynamic spatial tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Multimodal models can improve spatial reasoning by generating images that visualize their step-by-step thinking process.","strongest_claim":"Experimental results reveal that MVoT demonstrates competitive performance across tasks. Moreover, it exhibits robust and reliable improvements in the most challenging scenarios where CoT fails.","weakest_assumption":"That the generated visualizations faithfully capture the model's internal reasoning state and that the token discrepancy loss produces images that actually aid downstream reasoning rather than introducing new errors or hallucinations."}},"verdict_id":"e191d9df-90c4-4483-add8-756f05c30152"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:9caf00a7a2d95f4b4e885224341a24c5a3e38f5becb3f141a24c806b0af32077","target":"record","created_at":"2026-05-17T23:38:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"fb9dd12f2813e9529e879c6373319691ab8b3b5b40155a077f85f959d28090e8","cross_cats_sorted":["cs.CV","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-01-13T18:23:57Z","title_canon_sha256":"37cfa5d5cb1102bce80a85da6657c8f27044c9a7c4a40196b9aed375a5068f6a"},"schema_version":"1.0","source":{"id":"2501.07542","kind":"arxiv","version":1}},"canonical_sha256":"0e9a0da4bbd71f0b796c32ca9ef2351d549a7882de4070b545bb9a883e501ede","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0e9a0da4bbd71f0b796c32ca9ef2351d549a7882de4070b545bb9a883e501ede","first_computed_at":"2026-05-17T23:38:46.287290Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:46.287290Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"z4XgmNgyYXpRrVSW2C6VPuHr3LBE1MLgx4gXmmcYDDoGaFlqOoEyTMK4NlRlQCHsjzUMLqJofT7IExxMkicTDQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:46.287855Z","signed_message":"canonical_sha256_bytes"},"source_id":"2501.07542","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:9caf00a7a2d95f4b4e885224341a24c5a3e38f5becb3f141a24c806b0af32077","sha256:b181878faff4ec6d1191aa4b40e8d91fd7d1614935a23b52913dd0e54e44cd9d"],"state_sha256":"8987cf526658132e7f87c31fdf4a62faa68d14e5c96dbd8e28a406c9bbd4ae6d"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"RgroG23X9qP1tbdFSRAiXO0U5TUuaampIhqlJblgJQ58j1clMVxGMNl8PeDEY/uBk5gFaZcW4LonLVCCtKmiCQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-29T22:52:03.781212Z","bundle_sha256":"d135371bc32291c42a82494c72e12be4f01001bfc69e51832dbda29d7b7279bb"}}