{"total":30,"items":[{"citing_arxiv_id":"2605.13111","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pyramid Forcing: Head-Aware Pyramid KV Cache Policy for High-Quality Long Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-13T07:23:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pyramid Forcing classifies attention heads into Anchor, Wave, and Veil types and applies type-specific KV cache policies to improve long-horizon autoregressive video generation quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12471","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"KV-Fold: One-Step KV-Cache Recurrence for Long-Context Inference","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:53:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"KV-Fold turns frozen transformers into stable long-context models by folding the KV cache across sequence chunks in repeated forward passes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11478","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FibQuant: Universal Vector Quantization for Random-Access KV-Cache Compression","primary_cat":"cs.AI","submitted_at":"2026-05-12T03:45:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FibQuant is a universal fixed-rate vector quantizer for KV-cache compression that uses a radial-angular codebook matched to the spherical-Beta source after Haar rotation and strictly outperforms scalar quantization at matched rates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09735","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"KV-RM: Regularizing KV-Cache Movement for Static-Graph LLM Serving","primary_cat":"cs.AR","submitted_at":"2026-05-10T20:10:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"KV-RM regularizes KV-cache movement in static-graph LLM serving via block paging and merge-staged transport to improve throughput, tail latency, and memory use for variable-length decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09649","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Make Each Token Count: Towards Improving Long-Context Performance with KV Cache Eviction","primary_cat":"cs.LG","submitted_at":"2026-05-10T16:47:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A unified learnable KV eviction policy with cross-layer calibration reduces memory and matches or exceeds full-cache performance on long-context tasks by retaining useful tokens and limiting attention dilution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09490","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Not All Thoughts Need HBM: Semantics-Aware Memory Hierarchy for LLM 
Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-10T11:54:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A semantics-aware KV cache hierarchy offloads tokens to slower memory with zero approximation error, demonstrating that LLM reasoning accuracy depends only on the permanent eviction ratio and not on HBM residency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08840","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ReST-KV: Robust KV Cache Eviction with Layer-wise Output Reconstruction and Spatial-Temporal Smoothing","primary_cat":"cs.CL","submitted_at":"2026-05-09T09:49:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReST-KV formulates KV eviction as layer-wise output reconstruction optimization with spatial-temporal smoothing, outperforming baselines by 2.58% on LongBench and 15.2% on RULER while cutting decoding latency by 10.61x at 128k context.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08580","ref_index":64,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Slipstream: Trajectory-Grounded Compaction Validation for Long-Horizon Agents","primary_cat":"cs.MA","submitted_at":"2026-05-09T00:47:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Slipstream uses asynchronous compaction with trajectory-grounded judge validation to improve long-horizon agent accuracy by up to 8.8 percentage points and reduce latency by up to 39.7%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08317","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RDKV: Rate-Distortion Bit Allocation for Joint Eviction and Quantization of the KV Cache","primary_cat":"cs.LG","submitted_at":"2026-05-08T15:15:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RDKV derives per-token and per-channel weights from attention distortion, then uses reverse water-filling to assign bit-widths from full precision to zero after prefilling, recovering 97.81% accuracy with 2.48% cache retention on LongBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07719","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"An Efficient Hybrid Sparse Attention with CPU-GPU Parallelism for Long-Context Inference","primary_cat":"cs.LG","submitted_at":"2026-05-08T13:24:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Fluxion achieves 1.5x-3.7x speedup in long-context LLM inference with CPU KV caches while limiting accuracy degradation to at most 0.26 relative to full attention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07234","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reformulating KV Cache Eviction Problem for Long-Context LLM 
Inference","primary_cat":"cs.CL","submitted_at":"2026-05-08T04:37:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LaProx reformulates KV cache eviction as an output-aware matrix approximation, enabling a unified global token selection strategy that preserves LLM performance at 5% cache size across long-context benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06763","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Sparse Attention as a Range Searching Problem: Towards an Inference-Efficient Index for KV Cache","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:37:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Louver is a new index for LLM KV caches that guarantees zero false negatives for keys above a relevance threshold, runs faster than prior sparse and some dense attention methods, and integrates lightly into existing pipelines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06105","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Shallow Prefill, Deep Decoding: Efficient Long-Context Inference via Layer-Asymmetric KV Visibility","primary_cat":"cs.AI","submitted_at":"2026-05-07T12:21:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPEED uses layer-asymmetric KV visibility to process non-anchor prompt tokens only in lower layers during prefill, achieving near-baseline quality on Llama-3.1-8B with 33% better TTFT and 25% lower active KV memory at 128K context.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24820","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Salca: A Sparsity-Aware Hardware Accelerator for Efficient Long-Context Attention Decoding","primary_cat":"cs.AR","submitted_at":"2026-04-27T14:06:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Salca is a new ASIC accelerator that achieves 3.82× speedup and 74.19× energy efficiency over A100 for long-context attention via dual-compression dynamic sparse attention and pipelined hardware.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19351","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DASH-KV: Accelerating Long-Context LLM Inference via Asymmetric KV Cache Hashing","primary_cat":"cs.CL","submitted_at":"2026-04-21T11:33:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DASH-KV accelerates long-context LLM inference to linear complexity via asymmetric KV cache hashing and mixed-precision retention, matching full attention performance on LongBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17935","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"How Much Cache Does Reasoning Need? 
Depth-Cache Tradeoffs in KV-Compressed Transformers","primary_cat":"cs.LG","submitted_at":"2026-04-20T08:15:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Transformers need depth scaling as the product of ceil(k/s) and log n terms for k-hop pointer chasing under cache size s, with a conjectured lower bound, proved upper bound via windowed pointer doubling, and an adaptive-oblivious error separation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16983","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Graph-Guided Adaptive Channel Elimination for KV Cache Compression","primary_cat":"eess.SP","submitted_at":"2026-04-18T12:55:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GRACE reframes KV cache channel pruning as graph optimization to find a near-optimal subset, achieving 60% compression with negligible degradation and outperforming prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16864","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HieraSparse: Hierarchical Semi-Structured Sparse KV Attention","primary_cat":"cs.DC","submitted_at":"2026-04-18T06:28:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HieraSparse delivers a hierarchical semi-structured sparse KV attention system that achieves 1.2x KV compression and 4.57x decode attention speedup versus prior unstructured sparsity methods at equivalent sparsity, plus up to 1.85x prefill speedup and 1.37x/1.77x speedups with magnitude pruning and ","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05219","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Sparse Prefix Caching for Hybrid and Recurrent LLM Serving","primary_cat":"cs.LG","submitted_at":"2026-04-17T09:24:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Sparse prefix caching via dynamic programming for optimal checkpoint placement under overlap distributions improves the Pareto frontier for recurrent and hybrid LLM serving on shared-prefix data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04075","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RetentiveKV: State-Space Memory for Uncertainty-Aware Multimodal KV Cache Eviction","primary_cat":"cs.LG","submitted_at":"2026-04-14T08:17:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RetentiveKV uses entropy to drive state-space model transitions that retain and reactivate low-attention visual tokens in a continuous memory instead of pruning them, delivering 5x KV cache compression and 1.5x faster decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11288","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Transactional Attention: Semantic Sponsorship for KV-Cache 
Retention","primary_cat":"cs.CL","submitted_at":"2026-04-13T10:51:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Transactional Attention uses semantic sponsorship from anchor patterns to retain dormant critical tokens in KV caches, achieving 100% credential retrieval at 16 tokens where all prior methods fail.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10098","ref_index":148,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Attention Sink in Transformers: A Survey on Utilization, Interpretation, and Mitigation","primary_cat":"cs.LG","submitted_at":"2026-04-11T08:41:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The first survey on Attention Sink in Transformers structures the literature around fundamental utilization, mechanistic interpretation, and strategic mitigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08603","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Business Events to Auditable Decisions: Ontology-Governed Graph Simulation for Enterprise AI","primary_cat":"cs.AI","submitted_at":"2026-04-08T06:07:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LOM-action uses business events to drive ontology-governed graph simulations that generate auditable decisions, reporting 93.82% accuracy and 98.74% tool-chain F1 versus 24-36% F1 for frontier LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06694","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AudioKV: KV Cache Eviction in Efficient Large Audio Language Models","primary_cat":"cs.SD","submitted_at":"2026-04-08T05:20:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AudioKV prioritizes audio-critical attention heads identified via ASR analysis and applies spectral score smoothing to evict KV cache tokens, achieving high compression with minimal accuracy loss in LALMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06036","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CodecSight: Leveraging Video Codec Signals for Efficient Streaming VLM Inference","primary_cat":"cs.DC","submitted_at":"2026-04-07T16:31:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CodecSight reuses video codec signals for online patch pruning before the vision transformer and selective KV-cache refresh in the LLM, delivering up to 3x higher throughput and 87% lower GPU compute than prior baselines with 0-8% F1 drop.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04921","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TriAttention: Efficient Long Reasoning with Trigonometric KV 
Compression","primary_cat":"cs.CL","submitted_at":"2026-04-06T17:58:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TriAttention compresses KV cache by exploiting stable pre-RoPE Q/K concentration and trigonometric distance preferences to match full-attention reasoning accuracy with far lower memory and higher speed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02905","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"eOptShrinkQ: Near-Lossless KV Cache Compression Through Optimal Spectral Denoising and Quantization","primary_cat":"cs.LG","submitted_at":"2026-04-06T02:05:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"eOptShrinkQ compresses KV caches to ~2.2 bits per entry via optimal spectral shrinkage and quantization, outperforming prior methods on LongBench while matching FP16 on multi-needle retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03679","ref_index":64,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LightThinker++: From Reasoning Compression to Memory Management","primary_cat":"cs.CL","submitted_at":"2026-04-04T10:46:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LightThinker++ adds explicit adaptive memory management and a trajectory synthesis pipeline to LLM reasoning, cutting peak token use by ~70% while gaining accuracy in standard and long-horizon agent tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22782","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Stochastic KV Routing: Enabling Adaptive Depth-Wise Cache Sharing","primary_cat":"cs.LG","submitted_at":"2026-04-03T14:56:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Stochastic training with random cross-layer KV attention enables depth-wise cache sharing in transformers, cutting memory footprint while preserving or improving performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.02922","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RetroInfer: A Vector Storage Engine for Scalable Long-Context LLM Inference","primary_cat":"cs.LG","submitted_at":"2025-05-05T18:01:17+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}