{"work":{"id":"2dbd6bcd-fc98-4fbf-b586-f6d94fe1abd2","openalex_id":null,"doi":null,"arxiv_id":"2205.15868","raw_key":null,"title":"CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers","authors":null,"authors_text":"Wenyi Hong, Ming Ding, Wendi Zheng, Xinghan Liu, Jie Tang","year":2022,"venue":"cs.CV","abstract":"Large-scale pretrained transformers have created milestones in text (GPT-3) and text-to-image (DALL-E and CogView) generation. Its application to video generation is still facing many challenges: The potential huge computation cost makes the training from scratch unaffordable; The scarcity and weak relevance of text-video datasets hinder the model understanding complex movement semantics. In this work, we present 9B-parameter transformer CogVideo, trained by inheriting a pretrained text-to-image model, CogView2. We also propose multi-frame-rate hierarchical training strategy to better align text and video clips. As (probably) the first open-source large-scale pretrained text-to-video model, CogVideo outperforms all publicly available models at a large margin in machine and human evaluations.","external_url":"https://arxiv.org/abs/2205.15868","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-14T19:39:23.459958+00:00","pith_arxiv_id":"2205.15868","created_at":"2026-05-09T05:55:31.731751+00:00","updated_at":"2026-05-14T19:39:23.459958+00:00","title_quality_ok":true,"display_title":"CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers","render_title":"CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers"},"hub":{"state":{"work_id":"2dbd6bcd-fc98-4fbf-b586-f6d94fe1abd2","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":39,"external_cited_by_count":null,"distinct_field_count":5,"first_pith_cited_at":"2022-09-29T13:59:46+00:00","last_pith_cited_at":"2026-05-13T17:58:13+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-14T22:06:15.015057+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":2}],"polarity_counts":[{"context_polarity":"background","n":1},{"context_polarity":"unclear","n":1}],"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T17:49:55.226603+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":20},{"title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer","work_id":"f38fc088-12aa-4bf4-9ecd-08d3e797ccb7","shared_citers":16},{"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","shared_citers":16},{"title":"HunyuanVideo: A Systematic Framework For Large Video Generative Models","work_id":"881efa7e-7e73-4c66-9cc3-2803e551061c","shared_citers":13},{"title":"Flow Matching for Generative Modeling","work_id":"6edb71c4-5d64-40af-a394-9757ea051a36","shared_citers":10},{"title":"Make-A-Video: Text-to-Video Generation without Text-Video Data","work_id":"52a801fc-a707-45a1-a8cd-0d6702f124ab","shared_citers":10},{"title":"AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning","work_id":"1f9d1d3b-a6d6-45a9-9f13-51393c03be8a","shared_citers":9},{"title":"Imagen Video: High Definition Video Generation with Diffusion Models","work_id":"bb20d241-dc6f-4b0a-b071-fd43a2cbd57f","shared_citers":8},{"title":"Latte: Latent Diffusion Transformer for Video Generation","work_id":"5328e907-7278-4781-a2bb-c5ef40dc87fb","shared_citers":7},{"title":"Score-Based Generative Modeling through Stochastic Differential Equations","work_id":"d9110e53-a5d4-4794-a4c5-a575e91c31ad","shared_citers":7},{"title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis","work_id":"8034c587-fba6-4941-87ba-c98f2ac962cb","shared_citers":7},{"title":"Towards Accurate Generative Models of Video: A New Metric & Challenges","work_id":"72f42543-17d5-49aa-ba5a-25d67ffbb88a","shared_citers":7},{"title":"Classifier-Free Diffusion Guidance","work_id":"acf2c588-c088-4a6c-938e-150ad7c666d7","shared_citers":6},{"title":"Denoising Diffusion Implicit Models","work_id":"8fa2128b-d18c-405c-ac92-0e669cf89ac0","shared_citers":6},{"title":"Hierarchical Text-Conditional Image Generation with CLIP Latents","work_id":"0c6a768b-70b8-4242-bb0e-459f1008c9fc","shared_citers":6},{"title":"Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation","work_id":"c81c5be2-0655-4234-a3e9-6c32753f136b","shared_citers":6},{"title":"Video Diffusion Models","work_id":"02e03469-549e-4b5a-9bf0-ac6617a89882","shared_citers":6},{"title":"arXiv:2210.02399 , year=","work_id":"a325cd53-6549-4726-b3e9-94509f0df168","shared_citers":5},{"title":"ModelScope Text-to-Video Technical Report","work_id":"1b1baf78-58ec-44d0-b700-84dff57b2f1f","shared_citers":5},{"title":"AgiBot World Colosseo: A Large-scale Manipulation Platform for Scalable and Intelligent Embodied Systems","work_id":"f797e9ec-510f-43a7-8a0c-18009ce332e5","shared_citers":4},{"title":"Cosmos World Foundation Model Platform for Physical AI","work_id":"a2dba24c-318d-476a-8b21-4289c265810c","shared_citers":4},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":4},{"title":"Gen2act: Human video generation in novel scenarios enables generalizable robot manipulation","work_id":"a3bde288-aace-40db-8067-3ae6656f9509","shared_citers":4},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":4}],"time_series":[{"n":1,"year":2022},{"n":2,"year":2023},{"n":6,"year":2024},{"n":27,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T17:49:23.242034+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T17:49:45.304106+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers","claims":[{"claim_text":"Large-scale pretrained transformers have created milestones in text (GPT-3) and text-to-image (DALL-E and CogView) generation. Its application to video generation is still facing many challenges: The potential huge computation cost makes the training from scratch unaffordable; The scarcity and weak relevance of text-video datasets hinder the model understanding complex movement semantics. In this work, we present 9B-parameter transformer CogVideo, trained by inheriting a pretrained text-to-image model, CogView2. We also propose multi-frame-rate hierarchical training strategy to better align te","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T17:49:23.245730+00:00"}},"summary":{"title":"CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers","claims":[{"claim_text":"Large-scale pretrained transformers have created milestones in text (GPT-3) and text-to-image (DALL-E and CogView) generation. Its application to video generation is still facing many challenges: The potential huge computation cost makes the training from scratch unaffordable; The scarcity and weak relevance of text-video datasets hinder the model understanding complex movement semantics. In this work, we present 9B-parameter transformer CogVideo, trained by inheriting a pretrained text-to-image model, CogView2. We also propose multi-frame-rate hierarchical training strategy to better align te","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":20},{"title":"CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer","work_id":"f38fc088-12aa-4bf4-9ecd-08d3e797ccb7","shared_citers":16},{"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","shared_citers":16},{"title":"HunyuanVideo: A Systematic Framework For Large Video Generative Models","work_id":"881efa7e-7e73-4c66-9cc3-2803e551061c","shared_citers":13},{"title":"Flow Matching for Generative Modeling","work_id":"6edb71c4-5d64-40af-a394-9757ea051a36","shared_citers":10},{"title":"Make-A-Video: Text-to-Video Generation without Text-Video Data","work_id":"52a801fc-a707-45a1-a8cd-0d6702f124ab","shared_citers":10},{"title":"AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning","work_id":"1f9d1d3b-a6d6-45a9-9f13-51393c03be8a","shared_citers":9},{"title":"Imagen Video: High Definition Video Generation with Diffusion Models","work_id":"bb20d241-dc6f-4b0a-b071-fd43a2cbd57f","shared_citers":8},{"title":"Latte: Latent Diffusion Transformer for Video Generation","work_id":"5328e907-7278-4781-a2bb-c5ef40dc87fb","shared_citers":7},{"title":"Score-Based Generative Modeling through Stochastic Differential Equations","work_id":"d9110e53-a5d4-4794-a4c5-a575e91c31ad","shared_citers":7},{"title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis","work_id":"8034c587-fba6-4941-87ba-c98f2ac962cb","shared_citers":7},{"title":"Towards Accurate Generative Models of Video: A New Metric & Challenges","work_id":"72f42543-17d5-49aa-ba5a-25d67ffbb88a","shared_citers":7},{"title":"Classifier-Free Diffusion Guidance","work_id":"acf2c588-c088-4a6c-938e-150ad7c666d7","shared_citers":6},{"title":"Denoising Diffusion Implicit Models","work_id":"8fa2128b-d18c-405c-ac92-0e669cf89ac0","shared_citers":6},{"title":"Hierarchical Text-Conditional Image Generation with CLIP Latents","work_id":"0c6a768b-70b8-4242-bb0e-459f1008c9fc","shared_citers":6},{"title":"Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation","work_id":"c81c5be2-0655-4234-a3e9-6c32753f136b","shared_citers":6},{"title":"Video Diffusion Models","work_id":"02e03469-549e-4b5a-9bf0-ac6617a89882","shared_citers":6},{"title":"arXiv:2210.02399 , year=","work_id":"a325cd53-6549-4726-b3e9-94509f0df168","shared_citers":5},{"title":"ModelScope Text-to-Video Technical Report","work_id":"1b1baf78-58ec-44d0-b700-84dff57b2f1f","shared_citers":5},{"title":"AgiBot World Colosseo: A Large-scale Manipulation Platform for Scalable and Intelligent Embodied Systems","work_id":"f797e9ec-510f-43a7-8a0c-18009ce332e5","shared_citers":4},{"title":"Cosmos World Foundation Model Platform for Physical AI","work_id":"a2dba24c-318d-476a-8b21-4289c265810c","shared_citers":4},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":4},{"title":"Gen2act: Human video generation in novel scenarios enables generalizable robot manipulation","work_id":"a3bde288-aace-40db-8067-3ae6656f9509","shared_citers":4},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":4}],"time_series":[{"n":1,"year":2022},{"n":2,"year":2023},{"n":6,"year":2024},{"n":27,"year":2026}],"dependency_candidates":[]},"authors":[]}}