{"total":28,"items":[{"citing_arxiv_id":"2606.29814","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Nemotron-Labs-Diffusion-Image: Advancing Masked Discrete Diffusion for High-Resolution Image Synthesis","primary_cat":"cs.CV","submitted_at":"2026-06-29T05:48:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A masked discrete diffusion model adds token editing at inference and grouped cross-entropy training to reach 0.90 GenEval, 86.9 DPG, and 10.76 HPSv3 scores.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25331","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Improved Large Language Diffusion Models","primary_cat":"cs.CL","submitted_at":"2026-06-24T02:51:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"iLLaDA is an 8B masked diffusion LM trained from scratch with bidirectional attention, reporting gains of 14-21 points on BBH, ARC, MATH and HumanEval over prior diffusion models while remaining competitive with Qwen2.5-7B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19534","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PerceptionDLM: Parallel Region Perception with Multimodal Diffusion Language Models","primary_cat":"cs.CV","submitted_at":"2026-06-17T19:27:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PerceptionDLM enables parallel region captioning in multimodal diffusion language models via prompting and attention masking, introduces ParaDLC-Bench, and claims first parallel region perception with DLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10537","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Prefilling-dLLM: Predictive Prefilling for Long-Context Inference in Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-09T08:06:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Prefilling-dLLM partitions prefixes into chunks, caches KV representations, and applies sparse top-K selection during decoding to cut dLLM inference complexity to quadratic in decode length only.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04535","ref_index":82,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Infilling Anchors for Format-Constrained Generation in Diffusion Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-03T07:18:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DIA is a training-free method that dynamically adjusts anchor positions in diffusion LLMs to improve format compliance and accuracy on reasoning benchmarks like GSM8K and MATH.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30876","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"dMoE: dLLMs with Learnable Block Experts","primary_cat":"cs.CL","submitted_at":"2026-05-29T06:03:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"dMoE aggregates token expert distributions to block level in dLLMs, cutting unique experts from 69.5 to 14.6, memory by 76-80%, and latency by 1.14-1.66x while retaining 99.11% performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29488","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AnyMo: Scaling Any-Modality Conditional Motion Generation with Masked Modeling","primary_cat":"cs.CV","submitted_at":"2026-05-28T07:15:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AnyMo is a masked-modeling framework for any-modality human motion generation trained on the new OmniHuMo dataset of 5,000+ hours of multimodal motion sequences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25820","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Visual-Redundancy-Controlled Parallel Decoding for Diffusion-Based Multimodal Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-25T13:16:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VRCD prioritizes visually complementary positions during parallel decoding in dMLLMs by measuring attention overlap with the new Visual Redundancy Index, yielding accuracy gains over confidence-based baselines on M^3CoT and MMBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23163","ref_index":23,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fast-dDrive: Efficient Block-Diffusion VLM for Autonomous Driving","primary_cat":"cs.CL","submitted_at":"2026-05-22T02:31:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fast-dDrive is a block-diffusion VLA that reports SOTA accuracy on WOD-E2E and nuScenes driving benchmarks together with 12x throughput over autoregressive baselines via section scaffolds and test-time averaging.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16842","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sketch Then Paint: Hierarchical Reinforcement Learning for Diffusion Multi-Modal Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-16T06:59:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes HT-GRPO with sketch-then-paint staged updates, prompt-conditioned importance ratios, and hierarchical credit assignment for dMLLMs, reporting gains on GenEval and DPG plus quality metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14530","ref_index":23,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mitigating Mask Prior Drift and Positional Attention Collapse in Large Diffusion Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-14T08:11:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Diagnoses mask prior drift and positional attention collapse in LDVLMs and introduces two plug-and-play decoding interventions that raise long-form generation quality without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12624","ref_index":60,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MindVLA-U1: VLA Beats VA with Unified Streaming Architecture for Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-05-12T18:09:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MindVLA-U1 is the first unified streaming VLA architecture that surpasses human drivers on WOD-E2E planning metrics while matching VA latency and preserving language interfaces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10938","ref_index":78,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ELF: Embedded Language Flows","primary_cat":"cs.CL","submitted_at":"2026-05-11T17:59:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ELF applies continuous-time flow matching in embedding space for language generation and reports outperforming prior discrete and continuous diffusion language models with fewer steps.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[76] Jiacheng Ye, Zhihui Xie, Lin Zheng, Jiahui Gao, Zirui Wu, Xin Jiang, Zhenguo Li, and Lingpeng Kong. Dream 7b: Diffusion large language models.arXiv preprint arXiv:2508.15487, 2025. 1, 3 [77] Jiasheng Ye, Zaixiang Zheng, Yu Bao, Lihua Qian, and Mingxuan Wang. DINOISER: Diffused conditional sequence learning by manipulating noises.Transactions of the Association for Computational Linguistics, 2024. 2, 15 [78] Zebin You, Shen Nie, Xiaolu Zhang, Jun Hu, Jun Zhou, Zhiwu Lu, Ji-Rong Wen, and Chongxuan Li. Llada-v: Large language diffusion models with visual instruction tuning.arXiv preprint arXiv:2505.16933, 2025. 3 [79] Hongyi Yuan, Zheng Yuan, Chuanqi Tan, Fei Huang, and Songfang Huang. Seqdiffuseq: Text diffusion with encoder-decoder transformers. InNAACL, 2024."},{"citing_arxiv_id":"2605.10218","ref_index":99,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Relative Score Policy Optimization for Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-11T08:58:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RSPO interprets reward advantages as targets for relative log-ratios in dLLMs, calibrating noisy estimates to stabilize RLVR training and achieve strong gains on planning tasks with competitive math reasoning performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07399","ref_index":5,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GPO-V: Jailbreak Diffusion Vision Language Model by Global Probability Optimization","primary_cat":"cs.CV","submitted_at":"2026-05-08T07:54:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GPO-V jailbreaks dVLMs by globally optimizing probabilities in the denoising process to bypass refusal patterns, achieving stealthy and transferable attacks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"paradigms to mitigate the unique risks of diffusion-based generation. Our code is available at: https://anonymous.4open.science/r/GPO-V-0250. 1 Introduction Diffusion Large Language Models (dLLMs) [ 1, 2] have emerged as a prominent generative paradigm parallel to traditional autoregressive models [3, 4]. Building upon this framework, Diffusion Vision-Language Models (dVLMs) [5] have achieved outstanding performance in sophisticated image understanding and multimodal tasks. However, a significant disparity exists in their safety development: while autoregressive models have undergone extensive safety alignment to mitigate adversarial risks, the security landscape of dVLMs remains largely unexplored. This reveals a critical research gap concerning the vulnerability of dVLMs to jailbreak attacks, representing an urgent security"},{"citing_arxiv_id":"2605.06548","ref_index":105,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Continuous Latent Diffusion Language Model","primary_cat":"cs.CL","submitted_at":"2026-05-07T16:44:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Cola DLM proposes a hierarchical latent diffusion model that learns a text-to-latent mapping, fits a global semantic prior in continuous space with a block-causal DiT, and performs conditional decoding, establishing latent prior modeling as an alternative to token-level autoregressive language model","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"autoregression: Discrete diffusion for complex reasoning and planning.arXiv preprint arXiv:2410.14157, 2024. [104] Haoran You, Yichao Fu, Zheng Wang, Amir Yazdanbakhsh, and Yingyan Celine Lin. When linear attention meets autoregressive decoding: Towards more effective and efficient linearized large language models.arXiv preprint arXiv:2406.07368, 2024. [105] Zebin You, Shen Nie, Xiaolu Zhang, Jun Hu, Jun Zhou, Zhiwu Lu, Ji-Rong Wen, and Chongxuan Li. Llada-v: Large language diffusion models with visual instruction tuning.arXiv preprint arXiv:2505.16933, 2025. [106] Runpeng Yu, Xinyin Ma, and Xinchao Wang. Dimple: Discrete diffusion multimodal large language model with parallel decoding. arXiv preprint arXiv:2505."},{"citing_arxiv_id":"2604.18839","ref_index":173,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"One Step Forward and K Steps Back: Better Reasoning with Denoising Recursion Models","primary_cat":"cs.LG","submitted_at":"2026-04-20T21:06:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Denoising Recursion Models train multi-step noise reversal in looped transformers and outperform the prior Tiny Recursion Model on ARC-AGI.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17068","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Stability-Weighted Decoding for Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-18T17:04:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Stability-Weighted Decoding improves diffusion LLM accuracy by modulating token scores with temporal stability from KL divergence between prediction steps.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"We now relate this information gain to the token's total dependency on the unknown contexts. Theorem A.2.The expected temporal instability of token xi 0 is a strict lower bound on its mutual information with the total masked contextU t+1. That is, high instability implies high dependency on the remaining unknowns. I(x i 0;U t+1 |x t+1)≥E h D(i) temp i .(16) Proof. The total set of unknowns at step t+ 1 , denoted Ut+1, consists of the information revealed in the current step (xt) and the information remaining masked (Ut). Information Content(Ut+1)≡Information Content(x t,U t)(17) By the Chain Rule of Mutual Information, we decompose the total dependency: I(x i 0;U t+1 |x t+1) =I(x i 0;x t,U t |x t+1)(18)"},{"citing_arxiv_id":"2604.16514","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"BARD: Bridging AutoRegressive and Diffusion Vision-Language Models Via Highly Efficient Progressive Block Merging and Stage-Wise Distillation","primary_cat":"cs.CV","submitted_at":"2026-04-15T09:17:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BARD bridges autoregressive and diffusion VLMs with progressive block merging plus stage-wise intra-diffusion distillation, delivering 3x speedup and new SOTA on open dVLMs using under 4.4M data points.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"Model Parameters MMMU val MMMU-Prostandard MMEsum RealWorldQA MMStar AI2D ChartQA AutoRegressive Vision-Language Models Qwen3-VL [4] 4B 47.9 35.0 2297 70.5 56.9 81.0 80.9 Qwen3-VL [4] 8B 53.0 36.0 2379 69.5 59.9 83.5 84.0 InternVL3.5 [19] 4B 57.4 38.2 2236 66.7 65.6 80.6 86.2 InternVL3.5 [19] 8B 57.2 41.0 2359 63.1 66.3 82.1 87.0 Diffusion Vision-Language Models LLaDA-V [26] 8B 48.8 35.4 1998 63.4 60.4 77.8 78.2 Dream-VL [25] 7B 51.6 25.0 2179 67.7 59.9 80.4 86.2 LaviDa [11] 8B 44.2 28.6 1711 40.3 47.0 70.1 64.6 SDAR-VL [5] 8B 44.0 28.2 2142 66.1 53.3 79.6 82.4 MMaDA [23] 8B 30.2 21.5 1287 28.2 25.7 54.9 43.2 Dimple-VL [27] 7B 46.4 24.1 1924 51.9 47.7 74.2 58.4 BARD-VL Converted from Qwen3-VL BARD-VL(𝐵=32)2B 42.0 27.9 2045 64."},{"citing_arxiv_id":"2604.13413","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Dataset-Level Metrics Attenuate Non-Determinism: A Fine-Grained Non-Determinism Evaluation in Diffusion Language Models","primary_cat":"cs.LG","submitted_at":"2026-04-15T02:31:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dataset-level metrics in diffusion language models mask substantial sample-level non-determinism that varies with model and system factors, which a new Factor Variance Attribution metric can decompose.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11052","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LaDA-Band: Language Diffusion Models for Vocal-to-Accompaniment Generation","primary_cat":"cs.SD","submitted_at":"2026-04-13T06:29:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LaDA-Band applies discrete masked diffusion with dual-track conditioning and progressive training to generate vocal-to-accompaniment tracks that improve acoustic authenticity, global coherence, and dynamic orchestration over prior baselines.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Haohe Liu, Yiming Liang, Wenye Ma, Xingjian Du, et al . 2025. Yue: Scal- ing open foundation models for long-form music generation.arXiv preprint arXiv:2503.08638(2025). [55] Junan Zhang, Yunjia Zhang, Xueyao Zhang, and Zhizheng Wu. 2025. AnyAccomp: Generalizable Accompaniment Generation via Quantized Melodic Bottleneck. arXiv preprint arXiv:2509.14052(2025). [56] Jingwei Zhao and Gus Xia. 2021. Accomontage: Accompaniment arrangement via phrase selection and style transfer.arXiv preprint arXiv:2108.11213(2021). [57] Jiaming Zhou, Hongjie Chen, Shiwan Zhao, Jian Kang, Jie Li, Enzhi Wang, Yujie Guo, Haoqin Sun, Hui Wang, Aobo Kong, et al. 2025. DIFFA: Large Language Diffusion Models Can Listen and Understand."},{"citing_arxiv_id":"2604.09450","ref_index":57,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ECHO: Efficient Chest X-ray Report Generation with One-step Block Diffusion","primary_cat":"cs.LG","submitted_at":"2026-04-10T16:07:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ECHO introduces one-step block diffusion via Direct Conditional Distillation and Response-Asymmetric Diffusion to generate chest X-ray reports faster than autoregressive models while improving clinical metrics.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[55] Jiacheng Ye, Zhihui Xie, Lin Zheng, Jiahui Gao, Zirui Wu, Xin Jiang, Zhenguo Li, and Lingpeng Kong. Dream 7b: Diffusion large language models.arXiv preprint arXiv:2508.15487, 2025. [56] Jaehoon Yoo, Wonjung Kim, and Seunghoon Hong. Redi: Rectified discrete flow. InThe Thirty-ninth Annual Conference on Neural Information Processing Systems, 2025. [57] Zebin You, Shen Nie, Xiaolu Zhang, Jun Hu, Jun Zhou, Zhiwu Lu, Ji-Rong Wen, and Chongxuan Li. Llada-v: Large language diffusion models with visual instruction tuning.arXiv preprint arXiv:2505.16933, 2025. [58] Runpeng Yu, Xinyin Ma, and Xinchao Wang. Dimple: Discrete diffusion multimodal large language model with parallel decoding.arXiv preprint arXiv:2505."},{"citing_arxiv_id":"2604.08302","ref_index":94,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DMax: Aggressive Parallel Decoding for dLLMs","primary_cat":"cs.LG","submitted_at":"2026-04-09T14:35:42+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DMax uses On-Policy Uniform Training and Soft Parallel Decoding to enable aggressive parallelism in dLLMs, raising TPF on GSM8K from 2.04 to 5.47 and on MBPP from 2.71 to 5.86 while preserving accuracy.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Building on this formulation, LLaDA [58] and Dream [92] scale MDLMs to the billion- parameter regime with large-scale pretraining, demonstrating their practical potential. LLaDA-2.0 [10] and LLaDA-MoE [110] further show that MDLMs can be effectively scaled with mixture-of- experts architectures. Beyond these developments, dLLMs are also attracting increasing attention in reasoning [109, 59, 86, 63, 74, 57, 103], multimodal tasks [94, 96, 90, 91, 48, 82, 97, 18], code generation [87, 24, 21], long-context modeling [47, 28, 106], and agent [104, 102]. Accelerating Diffusion Language Models.dLLMs are viewed as promising due to their potential for low-cost inference, yet their efficiency remains largely underexplored. Existing efforts improve efficiency from several perspectives. Some methods reduce the cost of each decoding step through"},{"citing_arxiv_id":"2604.05497","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Thinking Diffusion: Penalize and Guide Visual-Grounded Reasoning in Diffusion Multimodal Language Models","primary_cat":"cs.AI","submitted_at":"2026-04-07T06:41:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Position and step penalty plus visual reasoning guidance fix premature answering and weak visual grounding in diffusion MLLMs, delivering up to 7.5% accuracy gains and over 3x speedup.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[40] Ling Yang, Ye Tian, Bowen Li, Xinchen Zhang, Ke Shen, Yunhai Tong, and Mengdi Wang. Mmada: Mul- timodal large diffusion language models.arXiv preprint arXiv:2505.15809, 2025. 1, 6 [41] Jiacheng Ye, Zhihui Xie, Lin Zheng, Jiahui Gao, Zirui Wu, Xin Jiang, Zhenguo Li, and Lingpeng Kong. Dream 7b: Diffusion large language models.arXiv preprint arXiv:2508.15487, 2025. 1, 2 [42] Zebin You, Shen Nie, Xiaolu Zhang, Jun Hu, Jun Zhou, Zhiwu Lu, Ji-Rong Wen, and Chongxuan Li. Llada-v: Large language diffusion models with visual instruction tuning. arXiv preprint arXiv:2505.16933, 2025. 1 [43] Runpeng Yu, Xinyin Ma, and Xinchao Wang. Dimple: Dis- crete diffusion multimodal large language model with paral- lel decoding.arXiv preprint arXiv:2505."},{"citing_arxiv_id":"2512.19433","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"dMLLM-TTS: Self-Verified and Efficient Test-Time Scaling for Diffusion Multi-Modal Large Language Models","primary_cat":"cs.CV","submitted_at":"2025-12-22T14:31:58+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"dMLLM-TTS delivers up to 6x more efficient test-time scaling for diffusion MLLMs via O(N+T) hierarchical search and self-verified feedback, improving generation quality on GenEval across three models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.14067","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Efficient-DLM: From Autoregressive to Diffusion Language Models, and Beyond in Speed","primary_cat":"cs.CL","submitted_at":"2025-12-16T04:12:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Efficient-DLM converts AR models to dLMs via block-wise causal attention and position-dependent masking, yielding higher accuracy and 2.7-4.5x throughput than Dream 7B and Qwen3 4B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.06133","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CreditDecoding: Accelerating Parallel Decoding in Diffusion Large Language Models with Trace Credit","primary_cat":"cs.CL","submitted_at":"2025-10-07T17:08:33+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.22618","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Fast-dLLM: Training-free Acceleration of Diffusion LLM by Enabling KV Cache and Parallel Decoding","primary_cat":"cs.CL","submitted_at":"2025-05-28T17:39:15+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Fast-dLLM adds reusable KV cache blocks and selective parallel decoding to diffusion LLMs, closing most of the speed gap with autoregressive models without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}