{"total":16,"items":[{"citing_arxiv_id":"2605.12825","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Orthrus: Memory-Efficient Parallel Token Generation via Dual-View Diffusion","primary_cat":"cs.LG","submitted_at":"2026-05-12T23:47:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Orthrus unifies autoregressive and diffusion views on a shared KV cache to deliver lossless parallel token generation with up to 7.8x speedup and O(1) memory overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11577","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"BitLM: Unlocking Multi-Token Language Generation with Bitwise Continuous Diffusion","primary_cat":"cs.CL","submitted_at":"2026-05-12T06:02:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BitLM replaces per-token softmax with bitwise continuous diffusion inside causal blocks to generate multiple tokens in parallel while preserving autoregressive structure.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09430","ref_index":1,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FlashAR: Efficient Post-Training Acceleration for Autoregressive Image Generation","primary_cat":"cs.CV","submitted_at":"2026-05-10T09:07:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FlashAR accelerates autoregressive image generation up to 22.9x by post-training a pre-trained raster-scan model with a complementary vertical head and dynamic fusion for two-way next-token prediction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07748","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TextLDM: Language Modeling with Continuous Latent Diffusion","primary_cat":"cs.CL","submitted_at":"2026-05-08T13:54:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TextLDM applies DiT-style latent diffusion with flow matching to language modeling via a REPA-aligned VAE, outperforming prior diffusion LMs and matching GPT-2 when trained from scratch on OpenWebText2.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04647","ref_index":87,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReflectDrive-2: Reinforcement-Learning-Aligned Self-Editing for Discrete Diffusion Driving","primary_cat":"cs.RO","submitted_at":"2026-05-06T08:52:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReflectDrive-2 combines masked discrete diffusion with RL-aligned self-editing to generate and refine driving trajectories, reaching 91.0 PDMS on NAVSIM camera-only and 94.8 in best-of-6.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08134","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DARE: Diffusion Language Model Activation Reuse for Efficient 
Inference","primary_cat":"cs.LG","submitted_at":"2026-05-01T19:15:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DARE reuses up to 87% of attention activations in diffusion LLMs through KV caching and output reuse, delivering 1.2x per-layer latency gains with average performance drops of 1.2-2.0%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18471","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"NI Sampling: Accelerating Discrete Diffusion Sampling by Token Order Optimization","primary_cat":"cs.LG","submitted_at":"2026-04-20T16:22:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NI Sampling accelerates discrete diffusion language models up to 14.3 times by training a neural indicator to select which tokens to sample at each step using a trajectory-preserving objective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18344","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"One Pass for All: A Discrete Diffusion Model for Knowledge Graph Triple Set Prediction","primary_cat":"cs.AI","submitted_at":"2026-04-20T14:41:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DiffTSP applies discrete diffusion to knowledge graph triple set prediction, recovering all missing triples simultaneously via edge-masking noise reversal and a structure-aware transformer, achieving SOTA on three datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17068","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Stability-Weighted Decoding for Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-18T17:04:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Stability-Weighted Decoding improves diffusion LLM accuracy by modulating token scores with temporal stability from KL divergence between prediction steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16514","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"BARD: Bridging AutoRegressive and Diffusion Vision-Language Models Via Highly Efficient Progressive Block Merging and Stage-Wise Distillation","primary_cat":"cs.CV","submitted_at":"2026-04-15T09:17:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BARD bridges autoregressive and diffusion VLMs with progressive block merging plus stage-wise intra-diffusion distillation, delivering 3x speedup and new SOTA on open dVLMs using under 4.4M data points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11748","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LangFlow: Continuous Diffusion Rivals Discrete in Language 
Modeling","primary_cat":"cs.CL","submitted_at":"2026-04-13T17:21:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LangFlow is the first continuous diffusion language model to rival discrete diffusion on perplexity and generative perplexity while exceeding autoregressive baselines on several zero-shot tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08302","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DMax: Aggressive Parallel Decoding for dLLMs","primary_cat":"cs.LG","submitted_at":"2026-04-09T14:35:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DMax enables faster parallel decoding in diffusion language models by using on-policy training to recover from errors and soft embedding interpolations for iterative revision, boosting tokens per forward pass roughly 2-3x on benchmarks while preserving accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02718","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Generative Frontiers: Why Evaluation Matters for Diffusion Language Models","primary_cat":"cs.LG","submitted_at":"2026-04-03T04:21:20+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Generative perplexity and entropy are shown to be the two additive components of KL divergence to a reference distribution, motivating generative frontiers as a principled evaluation method for diffusion language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.15745","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLaDA2.0: Scaling Up Diffusion Language Models to 100B","primary_cat":"cs.LG","submitted_at":"2025-12-10T09:26:18+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLaDA2.0 scales discrete diffusion language models to 100B parameters via systematic conversion from autoregressive models using a 3-phase WSD training scheme and releases open-source 16B and 100B MoE variants.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.15487","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Dream 7B: Diffusion Large Language Models","primary_cat":"cs.CL","submitted_at":"2025-08-21T12:09:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dream 7B is a 7B diffusion LLM that refines sequences in parallel via denoising and outperforms prior diffusion models on general, mathematical, and coding benchmarks with added flexibility in generation order and quality-speed tradeoffs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.09992","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Large Language Diffusion 
Models","primary_cat":"cs.CL","submitted_at":"2025-02-14T08:23:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"LLaDA is a scalable diffusion-based language model that matches autoregressive LLMs like LLaMA3 8B on tasks and surpasses GPT-4o on reversal poem completion.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}