{"work":{"id":"e0cfd82c-f5d4-44fd-b531-ec73ab0a805b","openalex_id":null,"doi":null,"arxiv_id":"2505.14683","raw_key":null,"title":"Emerging Properties in Unified Multimodal Pretraining","authors":null,"authors_text":"Chaorui Deng, Deyao Zhu, Kunchang Li, Chenhui Gou, Feng Li, Zeyu Wang","year":2025,"venue":"cs.CV","abstract":"Unifying multimodal understanding and generation has shown impressive capabilities in cutting-edge proprietary systems. In this work, we introduce BAGEL, an open-source foundational model that natively supports multimodal understanding and generation. BAGEL is a unified, decoder-only model pretrained on trillions of tokens curated from large-scale interleaved text, image, video, and web data. When scaled with such diverse multimodal interleaved data, BAGEL exhibits emerging capabilities in complex multimodal reasoning. As a result, it significantly outperforms open-source unified models in both multimodal generation and understanding across standard benchmarks, while exhibiting advanced multimodal reasoning abilities such as free-form image manipulation, future frame prediction, 3D manipulation, and world navigation. In the hope of facilitating further opportunities for multimodal research, we share the key findings, pretraining details, data creation protocal, and release our code and checkpoints to the community. The project page is at https://bagel-ai.org/","external_url":"https://arxiv.org/abs/2505.14683","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-14T20:22:54.688488+00:00","pith_arxiv_id":"2505.14683","created_at":"2026-05-09T06:25:47.756624+00:00","updated_at":"2026-05-14T20:22:54.688488+00:00","title_quality_ok":true,"display_title":"Emerging Properties in Unified Multimodal Pretraining","render_title":"Emerging Properties in Unified Multimodal Pretraining"},"hub":{"state":{"work_id":"e0cfd82c-f5d4-44fd-b531-ec73ab0a805b","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":93,"external_cited_by_count":null,"distinct_field_count":8,"first_pith_cited_at":"2025-06-03T17:59:33+00:00","last_pith_cited_at":"2026-05-13T06:33:54+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-14T20:46:11.240149+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":1}],"polarity_counts":[{"context_polarity":"background","n":1}],"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T05:56:46.539908+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Janus-Pro: Unified Multimodal Understanding and Generation with Data and Model Scaling","work_id":"67d9e391-26d1-459e-ab56-07e60511c886","shared_citers":36},{"title":"Qwen-Image Technical Report","work_id":"d06d7ecc-7579-4f89-a60b-4278a0f3c562","shared_citers":33},{"title":"OmniGen2: Towards Instruction-Aligned Multimodal Generation","work_id":"d3153e5f-b6e2-4ab3-9f41-e24e24d64496","shared_citers":32},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":30},{"title":"BLIP3-o: A Family of Fully Open Unified Multimodal Models-Architecture, Training and Dataset","work_id":"86d896d2-592f-4d9b-938e-dfeb11f9388f","shared_citers":29},{"title":"Emu3: Next-Token Prediction is All You Need","work_id":"720d288e-fac0-464c-9929-19efd9a52afc","shared_citers":29},{"title":"Step1X-Edit: A Practical Framework for General Image Editing","work_id":"3392f2c8-a1cb-4d6c-8c82-2cdccffa33f9","shared_citers":27},{"title":"UniWorld-V1: High-Resolution Semantic Encoders for Unified Visual Understanding and Generation","work_id":"488a273e-95d8-46f1-87c7-2244068d00d0","shared_citers":27},{"title":"Chameleon: Mixed-Modal Early-Fusion Foundation Models","work_id":"2661b9a6-25cc-41a1-8100-612d2b801289","shared_citers":25},{"title":"FLUX.1 Kontext: Flow Matching for In-Context Image Generation and Editing in Latent Space","work_id":"5dfe19d5-3541-4803-8fe9-3c8b9e29b281","shared_citers":21},{"title":"Show-o2: Improved Native Unified Multimodal Models","work_id":"77f00563-1ce6-4fba-9d4e-c8ce83f716ac","shared_citers":21},{"title":"Show-o: One Single Transformer to Unify Multimodal Understanding and Generation","work_id":"1393dc24-a6b2-44e1-b5d7-7009d1fa4811","shared_citers":21},{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":20},{"title":"ImgEdit: A Unified Image Editing Dataset and Benchmark","work_id":"059b5c3a-404c-4d30-a631-68c1d88a08a7","shared_citers":19},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":18},{"title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis","work_id":"8034c587-fba6-4941-87ba-c98f2ac962cb","shared_citers":18},{"title":"ELLA: Equip Diffusion Models with LLM for Enhanced Semantic Alignment","work_id":"94248955-4bc5-4517-98a0-66224a36d865","shared_citers":17},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":17},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":16},{"title":"Seedream 4.0: Toward Next-generation Multimodal Image Generation","work_id":"15c839a0-48a3-4218-82b6-cac5b7f66e13","shared_citers":16},{"title":"Seedream 3.0 Technical Report","work_id":"013e56d0-7f47-4d0e-bbca-e9540fc0e0cc","shared_citers":13},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":12},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":12},{"title":"Mogao: An omni foundation model for interleaved multi-modal generation","work_id":"f2badd0e-c06a-45f9-9d9e-7ceda62176b8","shared_citers":12}],"time_series":[{"n":7,"year":2025},{"n":85,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T05:56:44.498494+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T05:56:44.376066+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Emerging Properties in Unified Multimodal Pretraining","claims":[{"claim_text":"Unifying multimodal understanding and generation has shown impressive capabilities in cutting-edge proprietary systems. In this work, we introduce BAGEL, an open-source foundational model that natively supports multimodal understanding and generation. BAGEL is a unified, decoder-only model pretrained on trillions of tokens curated from large-scale interleaved text, image, video, and web data. When scaled with such diverse multimodal interleaved data, BAGEL exhibits emerging capabilities in complex multimodal reasoning. As a result, it significantly outperforms open-source unified models in bot","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Emerging Properties in Unified Multimodal Pretraining because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T05:56:36.621698+00:00"}},"summary":{"title":"Emerging Properties in Unified Multimodal Pretraining","claims":[{"claim_text":"Unifying multimodal understanding and generation has shown impressive capabilities in cutting-edge proprietary systems. In this work, we introduce BAGEL, an open-source foundational model that natively supports multimodal understanding and generation. BAGEL is a unified, decoder-only model pretrained on trillions of tokens curated from large-scale interleaved text, image, video, and web data. When scaled with such diverse multimodal interleaved data, BAGEL exhibits emerging capabilities in complex multimodal reasoning. As a result, it significantly outperforms open-source unified models in bot","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Emerging Properties in Unified Multimodal Pretraining because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Janus-Pro: Unified Multimodal Understanding and Generation with Data and Model Scaling","work_id":"67d9e391-26d1-459e-ab56-07e60511c886","shared_citers":36},{"title":"Qwen-Image Technical Report","work_id":"d06d7ecc-7579-4f89-a60b-4278a0f3c562","shared_citers":33},{"title":"OmniGen2: Towards Instruction-Aligned Multimodal Generation","work_id":"d3153e5f-b6e2-4ab3-9f41-e24e24d64496","shared_citers":32},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":30},{"title":"BLIP3-o: A Family of Fully Open Unified Multimodal Models-Architecture, Training and Dataset","work_id":"86d896d2-592f-4d9b-938e-dfeb11f9388f","shared_citers":29},{"title":"Emu3: Next-Token Prediction is All You Need","work_id":"720d288e-fac0-464c-9929-19efd9a52afc","shared_citers":29},{"title":"Step1X-Edit: A Practical Framework for General Image Editing","work_id":"3392f2c8-a1cb-4d6c-8c82-2cdccffa33f9","shared_citers":27},{"title":"UniWorld-V1: High-Resolution Semantic Encoders for Unified Visual Understanding and Generation","work_id":"488a273e-95d8-46f1-87c7-2244068d00d0","shared_citers":27},{"title":"Chameleon: Mixed-Modal Early-Fusion Foundation Models","work_id":"2661b9a6-25cc-41a1-8100-612d2b801289","shared_citers":25},{"title":"FLUX.1 Kontext: Flow Matching for In-Context Image Generation and Editing in Latent Space","work_id":"5dfe19d5-3541-4803-8fe9-3c8b9e29b281","shared_citers":21},{"title":"Show-o2: Improved Native Unified Multimodal Models","work_id":"77f00563-1ce6-4fba-9d4e-c8ce83f716ac","shared_citers":21},{"title":"Show-o: One Single Transformer to Unify Multimodal Understanding and Generation","work_id":"1393dc24-a6b2-44e1-b5d7-7009d1fa4811","shared_citers":21},{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":20},{"title":"ImgEdit: A Unified Image Editing Dataset and Benchmark","work_id":"059b5c3a-404c-4d30-a631-68c1d88a08a7","shared_citers":19},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":18},{"title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis","work_id":"8034c587-fba6-4941-87ba-c98f2ac962cb","shared_citers":18},{"title":"ELLA: Equip Diffusion Models with LLM for Enhanced Semantic Alignment","work_id":"94248955-4bc5-4517-98a0-66224a36d865","shared_citers":17},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":17},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":16},{"title":"Seedream 4.0: Toward Next-generation Multimodal Image Generation","work_id":"15c839a0-48a3-4218-82b6-cac5b7f66e13","shared_citers":16},{"title":"Seedream 3.0 Technical Report","work_id":"013e56d0-7f47-4d0e-bbca-e9540fc0e0cc","shared_citers":13},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":12},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":12},{"title":"Mogao: An omni foundation model for interleaved multi-modal generation","work_id":"f2badd0e-c06a-45f9-9d9e-7ceda62176b8","shared_citers":12}],"time_series":[{"n":7,"year":2025},{"n":85,"year":2026}],"dependency_candidates":[]},"authors":[]}}