{"total":17,"items":[{"citing_arxiv_id":"2607.01918","ref_index":164,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Zeus: Towards Tuning-Free Foundation Model for Time Series Analysis","primary_cat":"cs.LG","submitted_at":"2026-07-02T09:16:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Zeus proposes a multi-scale Transformer with point-wise tokenization and Multi-Objective Temporal Masking to enable tuning-free performance on forecasting, interpolation, and other time series tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31734","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MemLearner: Learning to Query Context memory for Video World Models","primary_cat":"cs.CV","submitted_at":"2026-06-30T14:31:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MemLearner introduces a learning-based adaptive context query method using query tokens in video world models to improve long-term scene consistency over rule-based retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29570","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Hierarchical Policy Learning via Spectral Decomposition","primary_cat":"cs.RO","submitted_at":"2026-06-28T19:22:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Causal Spectral Policy decomposes actions spectrally into coarse motion from obs/language and conditional fine corrections, outperforming baselines on precision manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18390","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Vision Foundation Models as Generalist Tokenizers for Image Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T13:38:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VFMTok builds a generalist image tokenizer on frozen VFMs using adaptive quantization and semantic alignment, delivering gFID 1.36 for autoregressive and 1.25 for continuous generation on ImageNet with 3x faster convergence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18267","ref_index":23,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SRC-Flow: Compact Semantic Representations Enable Normalizing Flows for Image Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T12:03:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SRC-Flow compresses RAE features via a Semantic Representation Compressor into a low-dimensional space, enabling normalizing flows to reach gFID 1.65 on ImageNet 256x256 and 2.07 on 512x512 while retaining exact likelihoods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12011","ref_index":61,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CaloArt: Large-Patch x-Prediction Diffusion Transformers for High-Granularity Calorimeter Shower Generation","primary_cat":"physics.ins-det","submitted_at":"2026-05-12T12:00:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CaloArt achieves top FPD, high-level, and classifier metrics on CaloChallenge datasets 2 and 3 while keeping single-GPU generation at 9-11 ms per shower by combining large-patch tokenization, x-prediction, and conditional flow matching.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"pytorch. [59] T. Karras, M. Aittala, T. Aila and S. Laine,Elucidating the Design Space of Diffusion-Based Generative Models, inAdvances in Neural Information Processing Systems, vol. 35, pp. 26565-26577, 2022 arXiv:2206.00364. [60] J.C. Butcher,Numerical Methods for Ordinary Differential Equations, John Wiley & Sons, 2 ed. (2008), 10.1002/9780470753767. [61] T. Li, Y. Tian, H. Li, M. Deng and K. He,Autoregressive Image Generation without Vector Quantization, inAdvances in Neural Information Processing Systems, vol. 37, 2024, DOI arXiv:2406.11838. [62] K. Miettinen,Nonlinear Multiobjective Optimization, vol. 12 ofInternational Series in Operations Research & Management Science, Kluwer Academic Publishers (1999),"},{"citing_arxiv_id":"2605.16384","ref_index":90,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mutual Enhancement Between Global Tokens and Patch Tokens: From Theory to Practice","primary_cat":"cs.CV","submitted_at":"2026-05-11T10:51:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TaTok is a theoretically grounded adaptive tokenization method that uses global tokens and cumulative conditional entropy filtering to reduce redundancy while improving reconstruction quality over fixed-rate patch tokenization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07230","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CASCADE: Context-Aware Relaxation for Speculative Image Decoding","primary_cat":"cs.CV","submitted_at":"2026-05-08T04:32:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CASCADE formalizes semantic interchangeability and convergence in target model representations to enable context-aware acceptance relaxation in tree-based speculative decoding, delivering up to 3.6x speedup on text-to-image models without quality loss.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"to diffusion [17] and generative adversarial network [12] based approaches. Compared to diffusion models [ 17, 35], AR offers flexible resolution control and seamless multi-modality integration. To alleviate the sequential generation bottleneck of AR image generation models, prior works have explored coarse-to-fine decoding [41], as well as masked parallel decoding strategies such as [23, 5] for continuous and discrete image tokens, respectively. More recent approaches introduce parallelism by explicitly relaxing causal dependencies. [46] achieves this by breaking AR constraints for class- conditioned generation, but relies on training. Similarly, [14] departs from strict raster-order decoding using a heuristic windowing scheme, though its applicability to text-conditioned image generation"},{"citing_arxiv_id":"2604.18471","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"NI Sampling: Accelerating Discrete Diffusion Sampling by Token Order Optimization","primary_cat":"cs.LG","submitted_at":"2026-04-20T16:22:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NI Sampling accelerates discrete diffusion language models up to 14.3 times by training a neural indicator to select which tokens to sample at each step using a trajectory-preserving objective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09168","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ELT: Elastic Looped Transformers for Visual Generation","primary_cat":"cs.CV","submitted_at":"2026-04-10T09:53:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Elastic Looped Transformers share weights across recurrent blocks and apply intra-loop self-distillation to deliver 4x parameter reduction while matching competitive FID and FVD scores on ImageNet and UCF-101.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Ccvs: Context-aware controllable video synthesis. Advances in Neural Information Processing Systems, 34:14042-14055, 2021. [44] T. Li, Y. Tian, H. Li, M. Deng, and K. He. Autoregressive image generation without vector quantization, 2024. URLhttps://arxiv.org/abs/2406.11838. [45] Y. Li. Mor-vit: Efficient vision transformer with mixture-of-recursions, 2025. URLhttps: //arxiv.org/abs/2507.21761. [46] I. Loshchilov and F. Hutter. Decoupled weight decay regularization, 2019. URLhttps:// arxiv.org/abs/1711.05101. [47] S. McCallum, K. Arora, and J. Foster. Reversible deep equilibrium models, 2025. URLhttps: //arxiv.org/abs/2509.12917. [48] G. Menghani. Efficient deep learning: A survey on making deep learning models smaller, faster, and better.ACM Computing Surveys, 55(12):1-37, 2023."},{"citing_arxiv_id":"2506.15564","ref_index":61,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Show-o2: Improved Native Unified Multimodal Models","primary_cat":"cs.CV","submitted_at":"2025-06-18T15:39:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Show-o2 unifies text, image, and video understanding and generation in a single autoregressive-plus-flow-matching model built on 3D causal VAE representations.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Autoregressive image generation with randomized parallel decoding.arXiv preprint arXiv:2503.10568, 2025. [60] Qingyun Li, Zhe Chen, Weiyun Wang, Wenhai Wang, Shenglong Ye, Zhenjiang Jin, et al. Omnicorpus: A unified multimodal corpus of 10 billion-level images interleaved with text. InThe Thirteenth International Conference on Learning Representations, 2025. [61] Tianhong Li, Yonglong Tian, He Li, Mingyang Deng, and Kaiming He. Autoregressive image generation without vector quantization.arXiv preprint arXiv:2406.11838, 2024. [62] Xiaotong Li, Fan Zhang, Haiwen Diao, Yueze Wang, Xinlong Wang, and Ling-Yu Duan. Densefusion-1m: Merging vision experts for comprehensive multimodal perception.2407.08303, 2024. [63] Zijie Li, Henry Li, Yichun Shi, Amir Barati Farimani, Yuval Kluger, Linjie Yang, and Peng Wang."},{"citing_arxiv_id":"2505.11334","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MARRS: Masked Autoregressive Unit-based Reaction Synthesis","primary_cat":"cs.CV","submitted_at":"2025-05-16T15:00:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MARRS synthesizes fine-grained reaction motions via unit-distinguished VAE, masked action-conditioned fusion, mutual unit modulation, and compact MLP diffusion predictors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.00200","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Unified Video Action Model","primary_cat":"cs.RO","submitted_at":"2025-02-28T21:38:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UVA learns a joint video-action latent representation with decoupled diffusion decoding heads, enabling a single model to perform accurate fast policy learning, forward/inverse dynamics, and video generation without performance loss versus task-specific methods.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[26] Po-Chen Ko, Jiayuan Mao, Yilun Du, Shao-Hua Sun, and Joshua B Tenenbaum. Learning to act from actionless videos through dense correspondences. arXiv preprint arXiv:2310.08576, 2023. [27] Tianhong Li, Yonglong Tian, He Li, Mingyang Deng, and Kaiming He. Autoregressive Image Generation Without Vector Quantization. arXiv preprint arXiv:2406.11838 , 2024. [28] Junbang Liang, Ruoshi Liu, Ege Ozguroglu, Sruthi Sud- hakar, Achal Dave, Pavel Tokmakov, Shuran Song, and Carl V ondrick. Dreamitate: Real-World Visuomotor Pol- icy Learning via Video Generation. CoRL, 2024. [29] Fanqi Lin, Yingdong Hu, Pingyue Sheng, Chuan Wen, Jiacheng You, and Yang Gao. Data scaling laws in im- itation learning for robotic manipulation."},{"citing_arxiv_id":"2502.00816","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Sundial: A Family of Highly Capable Time Series Foundation Models","primary_cat":"cs.LG","submitted_at":"2025-02-02T14:52:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Sundial uses TimeFlow Loss for native pre-training of Transformers on continuous time series from TimeBench, achieving SOTA point and probabilistic forecasting with millisecond inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.17891","ref_index":150,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Scaling Diffusion Language Models via Adaptation from Autoregressive Models","primary_cat":"cs.CL","submitted_at":"2024-10-23T14:04:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Adapting autoregressive models via continual pre-training yields diffusion language models from 127M to 7B parameters that outperform prior diffusion models and compete with their autoregressive counterparts on language, reasoning, and commonsense benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.13848","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Janus: Decoupling Visual Encoding for Unified Multimodal Understanding and Generation","primary_cat":"cs.CV","submitted_at":"2024-10-17T17:58:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Janus decouples visual encoding into task-specific pathways inside a single autoregressive transformer to unify multimodal understanding and generation while outperforming earlier unified models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.06885","ref_index":117,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching","primary_cat":"eess.AS","submitted_at":"2024-10-09T13:46:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"F5-TTS generates natural speech from text via flow matching on DiT with simple text padding, ConvNeXt refinement, and sway sampling, trained on 100K hours multilingual data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}