{"total":23,"items":[{"citing_arxiv_id":"2606.29228","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Understanding Evaluation Illusion in Diffusion Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-28T06:31:36+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29094","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DiLaServe: High SLO Attainment Serving for Diffusion Language Models","primary_cat":"cs.LG","submitted_at":"2026-06-27T21:21:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DiLaServe improves SLO attainment for diffusion language models by up to 56.6 percentage points and reduces latency by up to 46% with less than 1% accuracy drop via deadline-aware scheduling and dynamic reconfiguration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02544","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SimSD: Simple Speculative Decoding in Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-01T17:46:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SimSD adds a masking strategy to enable speculative decoding in diffusion LLMs, delivering up to 7.46x throughput gains on SDAR models while preserving generation quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00724","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WaveFilter: Enhancing the Long-Context Capability of Diffusion LLMs via Wavelet-Guided KV Cache Filtering","primary_cat":"cs.CL","submitted_at":"2026-05-30T13:32:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"WaveFilter applies wavelet decomposition to filter critical tokens for sparse KV caching, improving long-context performance of diffusion LLMs as a plug-and-play addition to existing methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30753","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Efficient Diffusion LLMs via Temporal-Spatial Parallel Decoding and Confidence Extrapolation","primary_cat":"cs.CL","submitted_at":"2026-05-29T02:29:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces TSPD with a trajectory-feature controller and training-free CE to reduce denoising steps in dLLMs while aiming to preserve quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26120","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic-dLLM: Dynamic Cache-Budget and Adaptive Parallel Decoding for Training-Free Acceleration of Diffusion LLM","primary_cat":"cs.CL","submitted_at":"2026-05-27T02:47:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Dynamic-dLLM achieves over 3x average inference speedup on dLLMs like LLaDA-8B via adaptive cache budgets and decoding thresholds while preserving benchmark performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20813","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PulseCol: Periodically Refreshed Column-Sparse Attention for Accelerating Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-20T07:06:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PulseCol introduces periodically refreshed column-sparse attention to achieve up to 1.95x speedup over FlashAttention in diffusion LLMs with maintained model quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20022","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"FlexDraft: Flexible Speculative Decoding via Attention Tuning and Bonus-Guided Calibration","primary_cat":"cs.CL","submitted_at":"2026-05-19T15:48:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FlexDraft is a lossless speculative decoding framework that adapts to batch sizes via attention tuning on final layers, MLP-based bonus calibration, and dynamic parallel/sequential decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19470","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Drifting Objectives for Refining Discrete Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-19T07:22:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TokenDrift refines discrete diffusion language models by applying anti-symmetric drifting to soft-token features during training, yielding large reductions in generation perplexity at low NFEs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18165","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Elastic-dLLM: Position Preserving Context Compression and Augmentation of Diffusion LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-18T10:09:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Position-preserving MASK token compression reduces redundancy in diffusion LLMs to accelerate parallel decoding and enable context folding for longer sequences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16941","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Roll Out and Roll Back: Diffusion LLMs are Their Own Efficiency Teachers","primary_cat":"cs.CL","submitted_at":"2026-05-16T11:27:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Diffusion LLMs can act as their own efficiency teachers by using revokable parallel decoding to identify reliable token orders and then distilling those orders into the model parameters for faster inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13382","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"BlockVLA: Accelerating Autoregressive VLA via Block Diffusion Finetuning","primary_cat":"cs.RO","submitted_at":"2026-05-13T11:37:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BlockVLA accelerates autoregressive VLA models by 3.3x using block diffusion finetuning, with faster training convergence and better early performance on long-horizon robotic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09536","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TAD: Temporal-Aware Trajectory Self-Distillation for Fast and Accurate Diffusion LLM","primary_cat":"cs.CL","submitted_at":"2026-05-10T13:38:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TAD improves the accuracy-parallelism trade-off in diffusion LLMs via temporal-aware self-distillation that applies hard labels to soon-to-be-decoded tokens and soft supervision to future tokens.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"In this paper, we focus on further accelerating dLLM inference by increasing the parallelism of these models. 5.2 Inference Acceleration for dLLMs The inference speed of dLLMs is primarily hindered by the incompatibility of traditional KV caching with bidirectional attention and the severe quality degradation during highly parallel decoding [7]. To alleviate the caching bottleneck, recent studies [44, 45, 46, 7] exploit the temporal consistency of KV states across decoding iterations to develop approximate caching mechanisms, significantly reducing redundant computations. To enhance parallelism, current approaches are categorized into training- free [7, 11, 12, 47, 48, 49] and training-based [10, 15, 50, 9, 14, 51, 52, 53] methods. Training-free"},{"citing_arxiv_id":"2605.00161","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Consistent Diffusion Language Models","primary_cat":"cs.LG","submitted_at":"2026-04-30T19:31:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CDLM introduces MPDC training for discrete diffusion models, recovering prior methods as limits and claiming new SOTA text generation performance especially at low sampling budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18995","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"$R^2$-dLLM: Accelerating Diffusion Large Language Models via Spatio-Temporal Redundancy Reduction","primary_cat":"cs.CL","submitted_at":"2026-04-21T02:26:08+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18471","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"NI Sampling: Accelerating Discrete Diffusion Sampling by Token Order Optimization","primary_cat":"cs.LG","submitted_at":"2026-04-20T16:22:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NI Sampling accelerates discrete diffusion language models up to 14.3 times by training a neural indicator to select which tokens to sample at each step using a trajectory-preserving objective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15750","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DepCap: Adaptive Block-Wise Parallel Decoding for Efficient Diffusion LM Inference","primary_cat":"cs.LG","submitted_at":"2026-04-17T06:53:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DepCap accelerates diffusion LM inference up to 5.63x by using last-block influence for adaptive block boundaries and conflict-free token selection for parallel decoding, with negligible quality loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09450","ref_index":32,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ECHO: Efficient Chest X-ray Report Generation with One-step Block Diffusion","primary_cat":"cs.LG","submitted_at":"2026-04-10T16:07:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ECHO introduces one-step block diffusion via Direct Conditional Distillation and Response-Asymmetric Diffusion to generate chest X-ray reports faster than autoregressive models while improving clinical metrics.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Cd4lm: Consistency distillation and adaptive decoding for diffusion language models. arXiv preprint arXiv:2601.02236, 2026. [30] Chin-Yew Lin. Rouge: A package for automatic evaluation of summaries. InText summarization branches out, 2004. [31] Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. Visual instruction tuning.Advances in neural information processing systems, 2023. [32] Zhiyuan Liu, Yicun Yang, Yaojie Zhang, Junjie Chen, Chang Zou, Qingyuan Wei, Shaobo Wang, and Lin- feng Zhang. dllm-cache: Accelerating diffusion large language models with adaptive caching.arXiv preprint arXiv:2506.06295, 2025. [33] Shen Nie, Fengqi Zhu, Zebin You, Xiaolu Zhang, Jingyang Ou, Jun Hu, JUN ZHOU, Yankai Lin, Ji-Rong Wen, and Chongxuan Li."},{"citing_arxiv_id":"2604.08302","ref_index":49,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DMax: Aggressive Parallel Decoding for dLLMs","primary_cat":"cs.LG","submitted_at":"2026-04-09T14:35:42+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DMax uses On-Policy Uniform Training and Soft Parallel Decoding to enable aggressive parallelism in dLLMs, raising TPF on GSM8K from 2.04 to 5.47 and on MBPP from 2.71 to 5.86 while preserving accuracy.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"on Artificial Intelligence, volume 40, pages 32186-32194, 2026. [48] Yang Liu, Pengxiang Ding, Tengyue Jiang, Xudong Wang, Wenxuan Song, Minghui Lin, Han Zhao, Hongyin Zhang, Zifeng Zhuang, Wei Zhao, et al. Mmada-vla: Large diffusion vision-language-action model with unified multi-modal instruction and generation.arXiv preprint arXiv:2603.25406, 2026. [49] Zhiyuan Liu, Yicun Yang, Yaojie Zhang, Junjie Chen, Chang Zou, Qingyuan Wei, Shaobo Wang, and Linfeng Zhang. dllm-cache: Accelerating diffusion large language models with adaptive caching.arXiv preprint arXiv:2506.06295, 2025. [50] Lingkun Long, Yushi Huang, Shihao Bai, Ruihao Gong, Jun Zhang, Ao Zhou, and Jianlei Yang. Focus- dllm: Accelerating long-context diffusion llm inference via confidence-guided context focusing."},{"citing_arxiv_id":"2603.07475","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Comparative analysis of Layer-wise Representational Capacity in AR and Diffusion LLMs","primary_cat":"cs.CL","submitted_at":"2026-03-08T05:31:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Diffusion language models form more global representations with early-layer redundancy compared to autoregressive models, allowing layer skipping for up to 18.75% FLOP savings while maintaining over 90% performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.20216","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Locally Coherent Parallel Decoding in Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-03-03T09:56:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CoDiLA adds a compact auxiliary AR model on diffusion latents to enforce local sequential validity during parallel token sampling in discrete diffusion language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.14067","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Efficient-DLM: From Autoregressive to Diffusion Language Models, and Beyond in Speed","primary_cat":"cs.CL","submitted_at":"2025-12-16T04:12:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Efficient-DLM converts AR models to dLMs via block-wise causal attention and position-dependent masking, yielding higher accuracy and 2.7-4.5x throughput than Dream 7B and Qwen3 4B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.19982","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Diffusion Language Models Know the Answer Before Decoding","primary_cat":"cs.CL","submitted_at":"2025-08-27T15:40:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DLMs show early answer convergence allowing Prophet to cut decoding steps by up to 3.4x on LLaDA-8B and Dream-7B while keeping output quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}