{"work":{"id":"f38fc088-12aa-4bf4-9ecd-08d3e797ccb7","openalex_id":null,"doi":null,"arxiv_id":"2408.06072","raw_key":null,"title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer","authors":null,"authors_text":"Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu","year":2024,"venue":"cs.CV","abstract":"We present CogVideoX, a large-scale text-to-video generation model based on diffusion transformer, which can generate 10-second continuous videos aligned with text prompt, with a frame rate of 16 fps and resolution of 768 * 1360 pixels. Previous video generation models often had limited movement and short durations, and is difficult to generate videos with coherent narratives based on text. We propose several designs to address these issues. First, we propose a 3D Variational Autoencoder (VAE) to compress videos along both spatial and temporal dimensions, to improve both compression rate and video fidelity. Second, to improve the text-video alignment, we propose an expert transformer with the expert adaptive LayerNorm to facilitate the deep fusion between the two modalities. Third, by employing a progressive training and multi-resolution frame pack technique, CogVideoX is adept at producing coherent, long-duration, different shape videos characterized by significant motions. In addition, we develop an effective text-video data processing pipeline that includes various data preprocessing strategies and a video captioning method, greatly contributing to the generation quality and semantic alignment. Results show that CogVideoX demonstrates state-of-the-art performance across both multiple machine metrics and human evaluations. The model weight of both 3D Causal VAE, Video caption model and CogVideoX are publicly available at https://github.com/THUDM/CogVideo.","external_url":"https://arxiv.org/abs/2408.06072","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T06:35:24.998079+00:00","pith_arxiv_id":"2408.06072","created_at":"2026-05-09T05:55:31.698281+00:00","updated_at":"2026-06-05T21:23:00.469572+00:00","title_quality_ok":true,"display_title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer","render_title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer"},"hub":{"state":{"work_id":"f38fc088-12aa-4bf4-9ecd-08d3e797ccb7","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":223,"external_cited_by_count":null,"distinct_field_count":9,"first_pith_cited_at":"2024-04-02T16:52:41+00:00","last_pith_cited_at":"2026-05-22T17:59:43+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-11T02:07:14.571869+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":60},{"context_role":"method","n":9},{"context_role":"baseline","n":7},{"context_role":"dataset","n":1}],"polarity_counts":[{"context_polarity":"background","n":58},{"context_polarity":"use_method","n":9},{"context_polarity":"baseline","n":7},{"context_polarity":"unclear","n":2},{"context_polarity":"use_dataset","n":1}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer","claims":[{"claim_text":"We present CogVideoX, a large-scale text-to-video generation model based on diffusion transformer, which can generate 10-second continuous videos aligned with text prompt, with a frame rate of 16 fps and resolution of 768 * 1360 pixels. Previous video generation models often had limited movement and short durations, and is difficult to generate videos with coherent narratives based on text. We propose several designs to address these issues. First, we propose a 3D Variational Autoencoder (VAE) to compress videos along both spatial and temporal dimensions, to improve both compression rate and v","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T02:24:08.930949+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"633203cd-6619-41f4-b0ce-0af13a309da0","orcid":null,"display_name":"Zhuoyi Yang"},{"id":"9678cb7f-19e8-4fdc-b9ab-ad5b320b3877","orcid":null,"display_name":"Jiayan Teng"},{"id":"bb6b9074-fec5-4625-8a3d-e5d9bb5fa1fb","orcid":null,"display_name":"Wendi Zheng"},{"id":"7d458b38-c708-43b9-95e2-b5fa30e5ffef","orcid":null,"display_name":"Ming Ding"},{"id":"95ab0776-6cb0-41a1-938f-9aeb8e23c742","orcid":null,"display_name":"Shiyu Huang"},{"id":"725cfe56-37fc-4242-a2f6-390c93cafb01","orcid":null,"display_name":"Jiazheng Xu"}]},"error":null,"updated_at":"2026-05-14T02:24:14.212605+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T02:24:13.092520+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":73},{"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","shared_citers":52},{"title":"HunyuanVideo: A Systematic Framework For Large Video Generative Models","work_id":"881efa7e-7e73-4c66-9cc3-2803e551061c","shared_citers":48},{"title":"Flow Matching for Generative Modeling","work_id":"6edb71c4-5d64-40af-a394-9757ea051a36","shared_citers":29},{"title":"AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning","work_id":"1f9d1d3b-a6d6-45a9-9f13-51393c03be8a","shared_citers":19},{"title":"Flow Straight and Fast: Learning to Generate and Transfer Data with Rectified Flow","work_id":"a1989e1b-d66d-4533-be3a-fb9c5fd62290","shared_citers":17},{"title":"Open-Sora: Democratizing Efficient Video Production for All","work_id":"8b29ba7b-3d84-4281-85b7-9eaf905afd7f","shared_citers":17},{"title":"CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers","work_id":"2dbd6bcd-fc98-4fbf-b586-f6d94fe1abd2","shared_citers":16},{"title":"Movie Gen: A Cast of Media Foundation Models","work_id":"a6a118b0-002f-4b19-881f-7f1183e0d7d8","shared_citers":16},{"title":"Cosmos World Foundation Model Platform for Physical AI","work_id":"a2dba24c-318d-476a-8b21-4289c265810c","shared_citers":14},{"title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis","work_id":"8034c587-fba6-4941-87ba-c98f2ac962cb","shared_citers":14},{"title":"Seedance 1.0: Exploring the Boundaries of Video Generation Models","work_id":"b2e36b5d-99e4-45b4-9358-64f6d3501983","shared_citers":14},{"title":"Self Forcing: Bridging the Train-Test Gap in Autoregressive Video Diffusion","work_id":"53e58ef9-7932-4b83-b757-34ac14db3e0f","shared_citers":14},{"title":"CameraCtrl: Enabling Camera Control for Text-to-Video Generation","work_id":"1c05c278-c023-4ef0-a359-25a41f1065eb","shared_citers":13},{"title":"LTX-Video: Realtime Video Latent Diffusion","work_id":"cee5c521-3ce9-466e-a035-1e42f89254f4","shared_citers":13},{"title":"Qwen-Image Technical Report","work_id":"d06d7ecc-7579-4f89-a60b-4278a0f3c562","shared_citers":13},{"title":"Towards Accurate Generative Models of Video: A New Metric & Challenges","work_id":"72f42543-17d5-49aa-ba5a-25d67ffbb88a","shared_citers":13},{"title":"Classifier-Free Diffusion Guidance","work_id":"acf2c588-c088-4a6c-938e-150ad7c666d7","shared_citers":12},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":12},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":12},{"title":"Make-A-Video: Text-to-Video Generation without Text-Video Data","work_id":"52a801fc-a707-45a1-a8cd-0d6702f124ab","shared_citers":12},{"title":"ModelScope Text-to-Video Technical Report","work_id":"1b1baf78-58ec-44d0-b700-84dff57b2f1f","shared_citers":12},{"title":"Score-Based Generative Modeling through Stochastic Differential Equations","work_id":"d9110e53-a5d4-4794-a4c5-a575e91c31ad","shared_citers":12},{"title":"Denoising Diffusion Implicit Models","work_id":"8fa2128b-d18c-405c-ac92-0e669cf89ac0","shared_citers":11}],"time_series":[{"n":4,"year":2024},{"n":6,"year":2025},{"n":97,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T02:24:09.045740+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T02:24:06.040166+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer","claims":[{"claim_text":"We present CogVideoX, a large-scale text-to-video generation model based on diffusion transformer, which can generate 10-second continuous videos aligned with text prompt, with a frame rate of 16 fps and resolution of 768 * 1360 pixels. Previous video generation models often had limited movement and short durations, and is difficult to generate videos with coherent narratives based on text. We propose several designs to address these issues. First, we propose a 3D Variational Autoencoder (VAE) to compress videos along both spatial and temporal dimensions, to improve both compression rate and v","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T02:24:08.923061+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer","claims":[{"claim_text":"We present CogVideoX, a large-scale text-to-video generation model based on diffusion transformer, which can generate 10-second continuous videos aligned with text prompt, with a frame rate of 16 fps and resolution of 768 * 1360 pixels. Previous video generation models often had limited movement and short durations, and is difficult to generate videos with coherent narratives based on text. We propose several designs to address these issues. First, we propose a 3D Variational Autoencoder (VAE) to compress videos along both spatial and temporal dimensions, to improve both compression rate and v","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T02:24:08.925754+00:00"}},"summary":{"title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer","claims":[{"claim_text":"We present CogVideoX, a large-scale text-to-video generation model based on diffusion transformer, which can generate 10-second continuous videos aligned with text prompt, with a frame rate of 16 fps and resolution of 768 * 1360 pixels. Previous video generation models often had limited movement and short durations, and is difficult to generate videos with coherent narratives based on text. We propose several designs to address these issues. First, we propose a 3D Variational Autoencoder (VAE) to compress videos along both spatial and temporal dimensions, to improve both compression rate and v","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":73},{"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","shared_citers":52},{"title":"HunyuanVideo: A Systematic Framework For Large Video Generative Models","work_id":"881efa7e-7e73-4c66-9cc3-2803e551061c","shared_citers":48},{"title":"Flow Matching for Generative Modeling","work_id":"6edb71c4-5d64-40af-a394-9757ea051a36","shared_citers":29},{"title":"AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning","work_id":"1f9d1d3b-a6d6-45a9-9f13-51393c03be8a","shared_citers":19},{"title":"Flow Straight and Fast: Learning to Generate and Transfer Data with Rectified Flow","work_id":"a1989e1b-d66d-4533-be3a-fb9c5fd62290","shared_citers":17},{"title":"Open-Sora: Democratizing Efficient Video Production for All","work_id":"8b29ba7b-3d84-4281-85b7-9eaf905afd7f","shared_citers":17},{"title":"CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers","work_id":"2dbd6bcd-fc98-4fbf-b586-f6d94fe1abd2","shared_citers":16},{"title":"Movie Gen: A Cast of Media Foundation Models","work_id":"a6a118b0-002f-4b19-881f-7f1183e0d7d8","shared_citers":16},{"title":"Cosmos World Foundation Model Platform for Physical AI","work_id":"a2dba24c-318d-476a-8b21-4289c265810c","shared_citers":14},{"title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis","work_id":"8034c587-fba6-4941-87ba-c98f2ac962cb","shared_citers":14},{"title":"Seedance 1.0: Exploring the Boundaries of Video Generation Models","work_id":"b2e36b5d-99e4-45b4-9358-64f6d3501983","shared_citers":14},{"title":"Self Forcing: Bridging the Train-Test Gap in Autoregressive Video Diffusion","work_id":"53e58ef9-7932-4b83-b757-34ac14db3e0f","shared_citers":14},{"title":"CameraCtrl: Enabling Camera Control for Text-to-Video Generation","work_id":"1c05c278-c023-4ef0-a359-25a41f1065eb","shared_citers":13},{"title":"LTX-Video: Realtime Video Latent Diffusion","work_id":"cee5c521-3ce9-466e-a035-1e42f89254f4","shared_citers":13},{"title":"Qwen-Image Technical Report","work_id":"d06d7ecc-7579-4f89-a60b-4278a0f3c562","shared_citers":13},{"title":"Towards Accurate Generative Models of Video: A New Metric & Challenges","work_id":"72f42543-17d5-49aa-ba5a-25d67ffbb88a","shared_citers":13},{"title":"Classifier-Free Diffusion Guidance","work_id":"acf2c588-c088-4a6c-938e-150ad7c666d7","shared_citers":12},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":12},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":12},{"title":"Make-A-Video: Text-to-Video Generation without Text-Video Data","work_id":"52a801fc-a707-45a1-a8cd-0d6702f124ab","shared_citers":12},{"title":"ModelScope Text-to-Video Technical Report","work_id":"1b1baf78-58ec-44d0-b700-84dff57b2f1f","shared_citers":12},{"title":"Score-Based Generative Modeling through Stochastic Differential Equations","work_id":"d9110e53-a5d4-4794-a4c5-a575e91c31ad","shared_citers":12},{"title":"Denoising Diffusion Implicit Models","work_id":"8fa2128b-d18c-405c-ac92-0e669cf89ac0","shared_citers":11}],"time_series":[{"n":4,"year":2024},{"n":6,"year":2025},{"n":97,"year":2026}],"dependency_candidates":[]},"authors":[{"id":"9678cb7f-19e8-4fdc-b9ab-ad5b320b3877","orcid":null,"display_name":"Jiayan Teng","source":"manual","import_confidence":0.72},{"id":"725cfe56-37fc-4242-a2f6-390c93cafb01","orcid":null,"display_name":"Jiazheng Xu","source":"manual","import_confidence":0.72},{"id":"7d458b38-c708-43b9-95e2-b5fa30e5ffef","orcid":null,"display_name":"Ming Ding","source":"manual","import_confidence":0.72},{"id":"95ab0776-6cb0-41a1-938f-9aeb8e23c742","orcid":null,"display_name":"Shiyu Huang","source":"manual","import_confidence":0.72},{"id":"bb6b9074-fec5-4625-8a3d-e5d9bb5fa1fb","orcid":null,"display_name":"Wendi Zheng","source":"manual","import_confidence":0.72},{"id":"633203cd-6619-41f4-b0ce-0af13a309da0","orcid":null,"display_name":"Zhuoyi Yang","source":"manual","import_confidence":0.72}]}}