{"total":58,"items":[{"citing_arxiv_id":"2605.13190","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"N-vium: Mixture-of-Exits Transformer for Accelerated Exact Generation","primary_cat":"cs.LG","submitted_at":"2026-05-13T08:46:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"N-vium achieves 57.9% wall-clock speedup over matched standard transformers at no perplexity cost by mixing exact predictions from multiple model depths.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11582","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Efficient LLM-based Advertising via Model Compression and Parallel Verification","primary_cat":"cs.CL","submitted_at":"2026-05-12T06:04:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"An Efficient Generative Targeting framework accelerates LLM inference in advertising via adaptive group quantization, layer-adaptive hierarchical sparsification, and prefix-tree parallel verification while accepting limited quality degradation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11186","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CATS: Cascaded Adaptive Tree Speculation for Memory-Limited LLM Inference Acceleration","primary_cat":"cs.LG","submitted_at":"2026-05-11T19:50:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CATS achieves up to 5.08x wall-clock speedup for LLM generation on edge devices via memory-matched cascaded tree speculation, outperforming prior methods by 1.45x with no quality loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10933","ref_index":71,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DECO: Sparse Mixture-of-Experts with Dense-Comparable Performance on End-Side Devices","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:58:28+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DECO matches dense model performance at 20% expert activation via ReLU-based routing with learnable scaling and the NormSiLU activation, plus a 3x real-hardware speedup.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10453","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SlimSpec: Low-Rank Draft LM-Head for Accelerated Speculative Decoding","primary_cat":"cs.LG","submitted_at":"2026-05-11T12:22:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SlimSpec replaces the standard LM-head in draft models with a low-rank version to deliver 4-5x faster speculative decoding while preserving full vocabulary and competitive acceptance rates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10195","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Breaking the Reward Barrier: Accelerating Tree-of-Thought Reasoning via Speculative Exploration","primary_cat":"cs.LG","submitted_at":"2026-05-11T08:45:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPEX accelerates Tree-of-Thought LLM reasoning 1.2-3x via speculative path selection, dynamic budget allocation across queries, and adaptive early termination, with up to 4.1x when combined with token speculative decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10124","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GELATO: Generative Entropy- and Lyapunov-based Adaptive Token Offloading for Device-Edge Speculative LLM Inference","primary_cat":"cs.NI","submitted_at":"2026-05-11T07:38:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GELATO combines drift-plus-penalty Lyapunov control with generative entropy early exiting to adaptively offload tokens in device-edge speculative decoding, delivering higher throughput and lower energy use than prior distributed SD systems while preserving output quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09992","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Attention Drift: What Autoregressive Speculative Decoding Models Learn","primary_cat":"cs.LG","submitted_at":"2026-05-11T05:08:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Drafter models in speculative decoding suffer progressive attention drift caused by monotonically growing hidden-state magnitudes along the residual path; post-norm plus per-state RMSNorm reduces this drift and improves acceptance length up to 2x on perturbed templates and 1.18x on long-context data","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09603","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Edit-Based Refinement for Parallel Masked Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-10T15:31:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ME-DLM augments parallel masked diffusion models with edit-distance-supervised refinements to raise quality on coding and math benchmarks while using far fewer diffusion steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09329","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Test-Time Speculation","primary_cat":"cs.CL","submitted_at":"2026-05-10T05:02:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Test-Time Speculation adapts draft models online via target-model verifications to sustain high acceptance lengths during long LLM generations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08632","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PARD-2: Target-Aligned Parallel Draft Model for Dual-Mode Speculative Decoding","primary_cat":"cs.CL","submitted_at":"2026-05-09T02:50:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PARD-2 uses Confidence-Adaptive Token optimization to align draft model training with acceptance length in speculative decoding, enabling dual-mode operation and up to 6.94x lossless speedup on Llama3.1-8B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07698","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Future Validity is the Missing Statistic: From Impossibility to $\\Phi$-Estimation for Grammar-Faithful Speculative Decoding","primary_cat":"cs.LG","submitted_at":"2026-05-08T13:08:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Speculative decoding under local grammar masking samples from the projected distribution μ^proj instead of the grammar-conditional μ*, and the future-validity function Φ corrects it via a Doob transform to achieve exact sampling from μ*.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07686","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The Coupling Tax: How Shared Token Budgets Undermine Visible Chain-of-Thought Under Fixed Output Limits","primary_cat":"cs.LG","submitted_at":"2026-05-08T12:54:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Shared token budgets between visible chain-of-thought and answers create a coupling tax that makes non-thinking competitive on math benchmarks, with a truncation decomposition predicting the crossover and split budgets improving results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07307","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rethinking Dense Sequential Chains: Reasoning Language Models Can Extract Answers from Sparse, Order-Shuffling Chain-of-Thoughts","primary_cat":"cs.CL","submitted_at":"2026-05-08T06:15:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reasoning language models extract answers from sparse, order-shuffled chain-of-thought traces with little accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07243","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SpecBlock: Block-Iterative Speculative Decoding with Dynamic Tree Drafting","primary_cat":"cs.CL","submitted_at":"2026-05-08T04:59:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SpecBlock achieves 8-19% higher speedup than EAGLE-3 in LLM speculative decoding by using repeated block expansions with hidden-state inheritance, a dynamic rank head, and a valid-prefix training mask.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07230","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CASCADE: Context-Aware Relaxation for Speculative Image Decoding","primary_cat":"cs.CV","submitted_at":"2026-05-08T04:32:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CASCADE formalizes semantic interchangeability and convergence in target model representations to enable context-aware acceptance relaxation in tree-based speculative decoding, delivering up to 3.6x speedup on text-to-image models without quality loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06116","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Policy-Guided Stepwise Model Routing for Cost-Effective Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-07T12:26:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A small RL-trained policy for stepwise model routing between LLM sizes improves the accuracy-cost tradeoff on math benchmarks over handcrafted strategies and matches large process reward model methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05802","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Selective Rollout: Mid-Trajectory Termination for Multi-Sample Agent RL","primary_cat":"cs.LG","submitted_at":"2026-05-07T07:41:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A one-parameter early-termination gate based on mean pairwise prefix edit distance reduces wall-clock time by 10.7% and raises held-out success by 2.5 pp in GRPO on ALFWorld by cutting zero-advantage batch dilution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08221","ref_index":91,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"NoisyCoconut: Counterfactual Consensus via Latent Space Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-06T13:58:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Injecting noise into LLM latent trajectories creates diverse reasoning paths whose agreement acts as a confidence signal for selective abstention, cutting error rates from 40-70% to under 15% on math tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04543","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniVer: A Unified Perspective for Multi-step and Multi-draft Speculative Decoding","primary_cat":"cs.CL","submitted_at":"2026-05-06T06:42:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UniVer frames tree-based speculative decoding as conditional optimal transport, proving it is lossless with optimal acceptance rates and delivering 4.2-8.5% longer accepted sequences than standard rejection sampling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02888","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SpecKV: Adaptive Speculative Decoding with Compression-Aware Gamma Selection","primary_cat":"cs.LG","submitted_at":"2026-05-04T17:55:05+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SpecKV uses a small MLP trained on draft model confidence and entropy to dynamically choose the optimal speculation length gamma, achieving 56% better performance than fixed gamma=4 across various tasks and compression levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08151","ref_index":24,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPECTRE: Hybrid Ordinary-Parallel Speculative Serving for Resource-Efficient LLM Inference","primary_cat":"cs.DC","submitted_at":"2026-05-04T01:27:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPECTRE achieves up to 2.28x speedup for large-model LLM serving by running speculative draft generation and target verification in parallel using idle tail-model services.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01111","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Less is Enough: Efficient Inference via Collaborative Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-01T21:31:59+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A large model generates a compact reasoning signal that a small model uses to solve tasks, reducing the large model's output tokens by up to 60% on benchmarks like AIME and GPQA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01106","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Component-Aware Self-Speculative Decoding in Hybrid Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-01T21:25:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Component-aware self-speculative decoding achieves high acceptance rates in parallel hybrid models like Falcon-H1 but fails in sequential ones like Qwen3.5, with the gap tied to how components are integrated.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27747","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Position-Aware Drafting for Inference Acceleration in LLM-Based Generative List-Wise Recommendation","primary_cat":"cs.IR","submitted_at":"2026-04-30T11:37:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PAD-Rec augments standard draft models with item-position and step-position embeddings plus learnable gates, delivering up to 3.1x wall-clock speedup and 5% average gain over strong speculative-decoding baselines on four datasets while largely preserving recommendation quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27476","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EdgeFM: Efficient Edge Inference for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-30T06:18:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EdgeFM is an agent-driven framework that strips non-essential features from VLMs and packages reusable optimized kernels, achieving up to 1.49x speedup over TensorRT-Edge-LLM on NVIDIA Orin while enabling first end-to-end deployment on Horizon Journey hardware.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26940","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Select to Think: Unlocking SLM Potential with Local Sufficiency","primary_cat":"cs.CL","submitted_at":"2026-04-29T17:51:39+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Small language models can achieve near large-model reasoning performance by learning to re-rank their own top-K token predictions after distilling selection from the large model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26837","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Unifying Sparse Attention with Hierarchical Memory for Scalable Long-Context LLM Serving","primary_cat":"cs.LG","submitted_at":"2026-04-29T16:02:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPIN co-designs sparse attention with hierarchical memory to achieve 1.66-5.66x higher throughput, 7-9x lower TTFT, and up to 58% lower TPOT than vLLM and original sparse implementations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26469","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"An Empirical Study of Speculative Decoding on Software Engineering Tasks","primary_cat":"cs.SE","submitted_at":"2026-04-29T09:26:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Speculative decoding accelerates LLM inference on SE tasks without accuracy loss, with model-based methods suiting code generation and model-free methods suiting repository-level repair and editing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26412","ref_index":1,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Hidden States Drift: Can KV Caches Rescue Long-Range Speculative Decoding?","primary_cat":"cs.CL","submitted_at":"2026-04-29T08:25:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"KV cache reuse improves long-range draft acceptance in speculative decoding but delivers only marginal end-to-end speedups due to drafter limitations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25777","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SpecFed: Accelerating Federated LLM Inference with Speculative Decoding and Compressed Transmission","primary_cat":"eess.SP","submitted_at":"2026-04-28T15:44:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SpecFed accelerates federated LLM inference via speculative decoding for parallel processing and top-K compression with server-side reconstruction, achieving high fidelity with reduced communication overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24110","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Latency and Cost of Multi-Agent Intelligent Tutoring at Scale","primary_cat":"cs.CY","submitted_at":"2026-04-27T07:07:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Priority PayGo keeps multi-agent tutoring responses under 4 seconds even at 50 concurrent users, while costs stay below textbook prices per student.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23623","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tandem: Riding Together with Large and Small Language Models for Efficient Reasoning","primary_cat":"cs.AI","submitted_at":"2026-04-26T09:33:51+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Tandem lets a large model supply compact strategic guidance to a small model for reasoning tasks, achieving similar or better performance at about 40 percent lower cost through adaptive early stopping.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23585","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ComplianceNLP: Knowledge-Graph-Augmented RAG for Multi-Framework Regulatory Gap Detection","primary_cat":"cs.CL","submitted_at":"2026-04-26T07:44:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ComplianceNLP integrates knowledge-graph-augmented RAG, multi-task legal text extraction, and gap analysis to detect regulatory compliance gaps, reporting 87.7 F1 and real-world efficiency gains over GPT-4o baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21072","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Distributed Generative Inference of LLM at Internet Scales with Multi-Dimensional Communication Optimization","primary_cat":"cs.DC","submitted_at":"2026-04-22T20:36:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BloomBee is a distributed LLM inference system that achieves up to 1.76x higher throughput and 43.2% lower latency than prior decentralized systems by optimizing communication across multiple dimensions in low-bandwidth internet settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21026","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MCAP: Deployment-Time Layer Profiling for Memory-Constrained LLM Inference","primary_cat":"cs.LG","submitted_at":"2026-04-22T19:18:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MCAP uses load-time Monte Carlo profiling to estimate layer importance, enabling dynamic quantization (W4A8 vs W4A16) and memory tiering (GPU/RAM/SSD) that delivers 1.5-1.8x higher decode throughput than llama-cpp Q4_0 on NVIDIA T4 while fitting models into previously infeasible memory budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20503","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"FASER: Fine-Grained Phase Management for Speculative Decoding in Dynamic LLM Serving","primary_cat":"cs.DC","submitted_at":"2026-04-22T12:44:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FASER delivers up to 53% higher throughput and 1.92x lower latency in dynamic LLM serving by adjusting speculative lengths per request, early pruning of rejects, and overlapping draft/verification phases via frontiers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20919","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DiP-SD: Distributed Pipelined Speculative Decoding for Efficient LLM Inference at the Edge","primary_cat":"cs.IT","submitted_at":"2026-04-22T04:02:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DiP-SD jointly optimizes batch count, user-to-batch assignment, and per-user draft lengths to deliver up to 17.89x throughput over autoregressive decoding and 1.93x over greedy batching in a device-edge Qwen deployment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18170","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Copy-as-Decode: Grammar-Constrained Parallel Prefill for LLM Editing","primary_cat":"cs.CL","submitted_at":"2026-04-20T12:29:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Copy-as-Decode recasts LLM editing as grammar-constrained decoding over copy and generate primitives, delivering closed-form upper-bound speedups of 13x pooled on editing benchmarks via parallel prefill without any training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17701","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"WISV: Wireless-Informed Semantic Verification for Distributed Speculative Decoding in Device-Edge LLM Inference","primary_cat":"cs.IT","submitted_at":"2026-04-20T01:29:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"WISV uses a channel-aware semantic acceptance policy on hidden representations to boost accepted sequence length by up to 60.8% and cut interaction rounds by 37.3% in distributed speculative decoding, with under 1% accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17397","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Speculative Decoding for Autoregressive Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-19T12:01:57+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A training-free speculative decoding method for block-based autoregressive video diffusion uses a quality router on worst-frame ImageReward scores to accept drafter proposals, achieving up to 2.09x speedup at 95.7% quality retention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15244","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Tokens to Steps: Verification-Aware Speculative Decoding for Efficient Multi-Step Reasoning","primary_cat":"cs.CL","submitted_at":"2026-04-16T17:20:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SpecGuard adds step-level verification to speculative decoding via attention grounding and log-probability scores, yielding 3.6% higher accuracy and 11% lower latency on reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14682","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Acceptance Dynamics Across Cognitive Domains in Speculative Decoding","primary_cat":"cs.AI","submitted_at":"2026-04-16T06:38:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Empirical measurements across four NLP domains show task type is a stronger predictor of speculative decoding acceptance than tree depth, with chat uniquely achieving expected accepted length over 1 token per step.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12989","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Accelerating Speculative Decoding with Block Diffusion Draft Trees","primary_cat":"cs.CL","submitted_at":"2026-04-14T17:23:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DDTree builds a draft tree from a block diffusion drafter using a best-first heap on its output probabilities and verifies the tree in one target-model pass via an ancestor-only attention mask, increasing average accepted tokens per round.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12503","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Topology-Aware Reasoning over Incomplete Knowledge Graph with Graph-Based Soft Prompting","primary_cat":"cs.CL","submitted_at":"2026-04-14T09:27:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A GNN-encoded subgraph soft prompting method lets LLMs perform topology-aware reasoning over incomplete KGs for KBQA, reaching SOTA on three of four benchmarks via a two-stage LLM pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09752","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A-IO: Adaptive Inference Orchestration for Memory-Bound NPUs","primary_cat":"cs.DC","submitted_at":"2026-04-10T14:34:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A-IO adaptively orchestrates LLM inference on NPUs to address memory bottlenecks, model scaling paradoxes, and synchronization costs in speculative decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09731","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SMART: When is it Actually Worth Expanding a Speculative Tree?","primary_cat":"cs.DC","submitted_at":"2026-04-09T13:17:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SMART uses marginal benefit-cost analysis to dynamically build efficient speculative trees, achieving 15-20% additional speedup in LLM and MLLM inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09722","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ConfigSpec: Profiling-Based Configuration Selection for Distributed Edge--Cloud Speculative LLM Serving","primary_cat":"cs.DC","submitted_at":"2026-04-08T21:54:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ConfigSpec shows that optimal configurations for speculative LLM inference conflict across goodput (favoring smallest drafters at device-specific K=2-10), cost (favoring largest drafters at K=2), and energy (favoring smallest drafters at K=2), requiring profiling-based selection instead of fixed or ","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07023","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MARS: Enabling Autoregressive Models Multi-Token Generation","primary_cat":"cs.CL","submitted_at":"2026-04-08T12:41:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MARS fine-tunes autoregressive models to predict multiple tokens per step via continued training on instruction data, achieving 1.5-1.7x throughput while matching baseline accuracy and supporting real-time speed adjustment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05250","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DualDiffusion: A Speculative Decoding Strategy for Masked Diffusion Models","primary_cat":"cs.LG","submitted_at":"2026-04-06T23:23:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DualDiffusion combines a lightweight drafter using approximations with a full verifier to reduce generation steps in masked diffusion models while keeping accuracy on MMLU and GSM8K.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}