{"total":18,"items":[{"citing_arxiv_id":"2606.09156","ref_index":91,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OmniGen-AR: AutoRegressive Any-to-Image Generation","primary_cat":"cs.CV","submitted_at":"2026-06-08T07:47:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OmniGen-AR is a unified autoregressive framework for any-to-image generation that tokenizes text and visual conditions together and uses disentangled causal attention to support tasks like text-to-image, depth-to-image, image editing, and text-to-video while reporting 0.63 on GenEval and 80.02 on VB","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07508","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Streaming Video Generation with Streaming Force Control","primary_cat":"cs.CV","submitted_at":"2026-06-05T17:57:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StreamForce presents a unified causal model for force-controllable streaming video generation using a new force representation and distillation pipeline, claiming SOTA force adherence and 16.6 FPS performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15141","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Causal Forcing++: Scalable Few-Step Autoregressive Diffusion Distillation for Real-Time Interactive Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:46:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Causal Forcing++ applies causal consistency distillation to enable scalable frame-wise 1-2 step autoregressive video generation, outperforming prior 4-step chunk-wise methods on quality metrics while halving first-frame latency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14487","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Head Forcing: Long Autoregressive Video Generation via Head Heterogeneity","primary_cat":"cs.CV","submitted_at":"2026-05-14T07:27:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Head Forcing assigns tailored KV cache strategies to local, anchor, and memory attention heads plus head-wise RoPE re-encoding to extend autoregressive video generation from seconds to minutes without training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.07775","ref_index":94,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rolling Sink: Bridging Limited-Horizon Training and Open-Ended Testing in Autoregressive Video Diffusion","primary_cat":"cs.CV","submitted_at":"2026-02-08T02:16:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Rolling Sink is a training-free cache adjustment technique that maintains visual consistency in autoregressive video diffusion models for ultra-long open-ended generation beyond training horizons.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.02214","ref_index":39,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Causal Forcing: Autoregressive Diffusion Distillation Done Right for High-Quality Real-Time Interactive Video Generation","primary_cat":"cs.CV","submitted_at":"2026-02-02T15:19:22+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.25161","ref_index":98,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rolling Forcing: Autoregressive Long Video Diffusion in Real Time","primary_cat":"cs.CV","submitted_at":"2025-09-29T17:57:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Rolling Forcing generates multi-minute videos in real time by jointly denoising frames at increasing noise levels, anchoring attention to early frames, and using windowed distillation to limit error accumulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.21996","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VRAG: Learning World Models for Interactive Video Generation","primary_cat":"cs.CV","submitted_at":"2025-05-28T05:55:44+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.00200","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Unified Video Action Model","primary_cat":"cs.RO","submitted_at":"2025-02-28T21:38:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UVA learns a joint video-action latent representation with decoupled diffusion decoding heads, enabling a single model to perform accurate fast policy learning, forward/inverse dynamics, and video generation without performance loss versus task-specific methods.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"mad Taghi Saffar, Santiago Castro, Julius Kunze, and Dumitru Erhan. Phenaki: Variable Length Video Gen- eration from Open Domain Textual Descriptions. In International Conference on Learning Representations , 2022. [46] Dirk Weissenborn, Oscar T ¨ackstr¨om, and Jakob Uszkor- eit. Scaling Autoregressive Video Models. arXiv preprint arXiv:1906.02634, 2019. [47] Wenming Weng, Ruoyu Feng, Yanhui Wang, Qi Dai, Chunyu Wang, Dacheng Yin, Zhiyuan Zhao, Kai Qiu, Jianmin Bao, Yuhui Yuan, et al. ART-V: Auto-Regressive Text-to-Video Generation with Diffusion Models. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pages 7395-7405, 2024. [48] Philipp Wu, Arjun Majumdar, Kevin Stone, Yixin Lin,"},{"citing_arxiv_id":"2310.06114","ref_index":121,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning Interactive Real-World Simulators","primary_cat":"cs.AI","submitted_at":"2023-10-09T19:42:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UniSim learns a universal real-world simulator from orchestrated diverse datasets, enabling zero-shot deployment of policies trained purely in simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2211.13221","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Latent Video Diffusion Models for High-Fidelity Long Video Generation","primary_cat":"cs.CV","submitted_at":"2022-11-23T18:58:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Latent-space hierarchical diffusion models with targeted error-correction techniques generate realistic videos exceeding 1000 frames while using less compute than prior pixel-space approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2207.05221","ref_index":116,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Language Models (Mostly) Know What They Know","primary_cat":"cs.CL","submitted_at":"2022-07-11T22:59:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Language models show good calibration when asked to estimate the probability that their own answers are correct, with performance improving as models get larger.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2205.15868","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers","primary_cat":"cs.CV","submitted_at":"2022-05-29T19:02:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CogVideo is a large-scale transformer pretrained for text-to-video generation that outperforms public models in evaluations.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"FVD testing is the reconstruction result of the tokenizer. Method IS ( ↑) FVD ( ↓) VideoGPT[36] 24.69 - DVD-GAN[4] 27.38 - TGANv2[20]* 28.87 1209 MoCoGAN-HD[24] 32.36 838 DIGAN[37]* 29.71 655 DIGAN[37] 32.70 577 TATS-base[9] 79.28 332 CogVideo (Ours) 50.46 626 CogVideo (Ours)** - 545 Method FVD( ↓) Latent Video Tranformer[17] 224.73 Video Transformer[33] 170 DVD-GAN-FP[4] 69.15 TriVD-GAN-FP[15] 25.74 CogVideo (Ours) 109.23 CogVideo (Ours)** 59.55 5 Experiments 5.1 Machine Evaluation Machine evaluation is conducted on two popular benchmarks for video generation, i.e., UCF101 [22] and Kinetics-600 [3]. Following Rakhimov et al. [17], Yu et al. [37], we use Fréchet Video Distance (FVD) [27] and Inception score (IS) [21] as metrics in the evaluation."},{"citing_arxiv_id":"2112.00861","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A General Language Assistant as a Laboratory for Alignment","primary_cat":"cs.CL","submitted_at":"2021-12-01T22:24:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Ranked preference modeling outperforms imitation learning for language model alignment and scales more favorably with model size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2104.10157","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VideoGPT: Video Generation using VQ-VAE and Transformers","primary_cat":"cs.CV","submitted_at":"2021-04-20T17:58:03+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VideoGPT generates competitive natural videos by learning discrete latents with VQ-VAE and modeling them autoregressively with a transformer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2102.01293","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Scaling Laws for Transfer","primary_cat":"cs.LG","submitted_at":"2021-02-02T04:07:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Effective data transferred from pre-training to fine-tuning is described by a power law in model parameter count and fine-tuning dataset size, acting like a multiplier on the fine-tuning data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2010.14701","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Scaling Laws for Autoregressive Generative Modeling","primary_cat":"cs.LG","submitted_at":"2020-10-28T02:17:24+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Autoregressive transformers follow power-law scaling laws for cross-entropy loss with nearly universal exponents relating optimal model size to compute budget across four domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1910.11215","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RoboNet: Large-Scale Multi-Robot Learning","primary_cat":"cs.RO","submitted_at":"2019-10-24T15:20:03+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RoboNet is a multi-robot video dataset that enables pre-training of vision-based manipulation models which, after fine-tuning on a new robot, outperform robot-specific training that uses 4-20 times more data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}