{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:BWIAXPTYFCWBSFU2474RYSMEMR","short_pith_number":"pith:BWIAXPTY","schema_version":"1.0","canonical_sha256":"0d900bbe7828ac19169ae7f91c49846449e053ca56781d809c9ea37308fe874f","source":{"kind":"arxiv","id":"2406.10774","version":2},"attestation_state":"computed","paper":{"title":"Quest: Query-Aware Sparsity for Efficient Long-Context LLM Inference","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention.","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Baris Kasikci, Guangxuan Xiao, Jiaming Tang, Kan Zhu, Song Han, Yilong Zhao","submitted_at":"2024-06-16T01:33:02Z","abstract_excerpt":"As the demand for long-context large language models (LLMs) increases, models with context windows of up to 128K or 1M tokens are becoming increasingly prevalent. However, long-context LLM inference is challenging since the inference speed decreases significantly as the sequence length grows. This slowdown is primarily caused by loading a large KV cache during self-attention. Previous works have shown that a small portion of critical tokens will dominate the attention outcomes. However, we observe the criticality of a token highly depends on the query. To this end, we propose Quest, a query-aw"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2406.10774","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-06-16T01:33:02Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"d3fe8b74b07aadd7ad629906ecb00045739da53c44bcccc04a230ee451b8b27b","abstract_canon_sha256":"f1e057c54ef5bad63ddb13e05dfd763685b60792f4462cce20f8292321c5e5bb"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:52.376057Z","signature_b64":"BwFxHeAIP7DRZH0tdrVZsaLkoq4zUH2mnfTEhLEOJGrNheGlsSGiN8wndHv+VgiKj/9tWwF8IZiPnDc/RNmsDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0d900bbe7828ac19169ae7f91c49846449e053ca56781d809c9ea37308fe874f","last_reissued_at":"2026-05-17T23:38:52.375607Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:52.375607Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Quest: Query-Aware Sparsity for Efficient Long-Context LLM Inference","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention.","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Baris Kasikci, Guangxuan Xiao, Jiaming Tang, Kan Zhu, Song Han, Yilong Zhao","submitted_at":"2024-06-16T01:33:02Z","abstract_excerpt":"As the demand for long-context large language models (LLMs) increases, models with context windows of up to 128K or 1M tokens are becoming increasingly prevalent. However, long-context LLM inference is challenging since the inference speed decreases significantly as the sequence length grows. This slowdown is primarily caused by loading a large KV cache during self-attention. Previous works have shown that a small portion of critical tokens will dominate the attention outcomes. However, we observe the criticality of a token highly depends on the query. To this end, we propose Quest, a query-aw"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"By only loading the Top-K critical KV cache pages for attention, Quest significantly speeds up self-attention without sacrificing accuracy. We show that Quest can achieve up to 2.23x self-attention speedup, which reduces inference latency by 7.03x while performing well on tasks with long dependencies with negligible accuracy loss.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the min/max key approximation per page, combined with query-vector scoring, reliably identifies the truly critical pages without dropping information that would change the final attention output on long-dependency tasks.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Quest speeds up long-context LLM self-attention by up to 2.23x via query-dependent selection of top-K critical KV cache pages, cutting overall latency by 7.03x with negligible accuracy loss.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"e17bf82a2fb63efa58f980c030e59b23bb06db738800a83ea02d5e926b469b52"},"source":{"id":"2406.10774","kind":"arxiv","version":2},"verdict":{"id":"8f5d9e8d-d7dd-47a6-91d0-2b9b803af69f","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T14:04:56.497780Z","strongest_claim":"By only loading the Top-K critical KV cache pages for attention, Quest significantly speeds up self-attention without sacrificing accuracy. We show that Quest can achieve up to 2.23x self-attention speedup, which reduces inference latency by 7.03x while performing well on tasks with long dependencies with negligible accuracy loss.","one_line_summary":"Quest speeds up long-context LLM self-attention by up to 2.23x via query-dependent selection of top-K critical KV cache pages, cutting overall latency by 7.03x with negligible accuracy loss.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the min/max key approximation per page, combined with query-vector scoring, reliably identifies the truly critical pages without dropping information that would change the final attention output on long-dependency tasks.","pith_extraction_headline":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention."},"references":{"count":72,"sample":[{"doi":"","year":2024,"title":"I ntroducing the next generation of C laude","work_id":"c7fa93db-1d56-45f0-9974-a574f30b2ab1","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Longbench: A bilingual, multitask benchmark for long context understanding, 2023","work_id":"f3203315-6a3d-43f9-bcf6-e6e35491b6d8","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Y., Ermon, S., Rudra, A., and Ré, C","work_id":"1a200b80-d0ee-4c53-b99a-b3acee1d6777","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"A., and Gardner, M","work_id":"5c114c15-2f4d-40c1-be92-e0aa233c38f2","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Model tells you what to discard: Adaptive kv cache compression for llms, 2024","work_id":"f3be5c3d-0cd9-483d-b8d7-8f79555b0294","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":72,"snapshot_sha256":"f73df1206bea4501fe32d4ec39db4fdd570bf4d28259bc5ec322c1b40005c80d","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2406.10774","created_at":"2026-05-17T23:38:52.375678+00:00"},{"alias_kind":"arxiv_version","alias_value":"2406.10774v2","created_at":"2026-05-17T23:38:52.375678+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.10774","created_at":"2026-05-17T23:38:52.375678+00:00"},{"alias_kind":"pith_short_12","alias_value":"BWIAXPTYFCWB","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"BWIAXPTYFCWBSFU2","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"BWIAXPTY","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":43,"internal_anchor_count":43,"sample":[{"citing_arxiv_id":"2502.01941","citing_title":"Semantic Integrity Matters: Benchmarking and Preserving High-Density Reasoning in KV Cache Compression","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2505.05772","citing_title":"Sparse Attention Remapping with Clustering for Efficient LLM Decoding on PIM","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2505.15269","citing_title":"LiveVLM: Efficient Online Video Understanding via Streaming-Oriented KV Cache and Retrieval","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2510.10129","citing_title":"CacheClip: Accelerating RAG with Effective KV Cache Reuse","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21603","citing_title":"DynaFlow: Transparent and Flexible Intra-Device Parallelism via Programmable Operator Scheduling","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22718","citing_title":"WorldKV: Efficient World Memory with World Retrieval and Compression","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2508.06526","citing_title":"PiKV: KV Cache Management System for Mixture of Experts","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2602.08686","citing_title":"CompilerKV: Risk-Adaptive KV Compression via Offline Experience Compilation","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2602.18196","citing_title":"RAT+: Train Dense, Infer Sparse -- Recurrence Augmented Attention for Dilated Inference","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18856","citing_title":"SPHERICAL KV: Angle-Domain Attention and Rate-Distortion Retention for Efficient Long-Context Inference","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18165","citing_title":"Elastic-dLLM: Position Preserving Context Compression and Augmentation of Diffusion LLMs","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18753","citing_title":"DashAttention: Differentiable and Adaptive Sparse Hierarchical Attention","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17613","citing_title":"VeriCache: Turning Lossy KV Cache into Lossless LLM Inference","ref_index":64,"is_internal_anchor":true},{"citing_arxiv_id":"2507.21433","citing_title":"ReasonCache: Accelerating Large Reasoning Model Serving through KV Cache Sharing","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2507.21526","citing_title":"Accelerating Prefilling via Decoding-time Contribution Sparsity","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2508.16703","citing_title":"ShadowNPU: System and Algorithm Co-design for NPU-Centric On-Device LLM Inference","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2509.21623","citing_title":"OjaKV: Context-Aware Online Low-Rank KV Cache Compression","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2510.09883","citing_title":"DELTA: Dynamic Layer-Aware Token Attention for Efficient Long-Context Reasoning","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2511.03092","citing_title":"SnapStream: Efficient Long Sequence Decoding on Dataflow Accelerators","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2502.11089","citing_title":"Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention","ref_index":80,"is_internal_anchor":true},{"citing_arxiv_id":"2512.12087","citing_title":"BLASST: Dynamic BLocked Attention Sparsity via Softmax Thresholding","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2601.13684","citing_title":"HeteroCache: A Dynamic Retrieval Approach to Heterogeneous KV Cache Compression for Long-Context LLM Inference","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2502.13189","citing_title":"MoBA: Mixture of Block Attention for Long-Context LLMs","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2602.18196","citing_title":"RAT+: Train Dense, Infer Sparse -- Recurrence Augmented Attention for Dilated Inference","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11864","citing_title":"Very Efficient Listwise Multimodal Reranking for Long Documents","ref_index":61,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR","json":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR.json","graph_json":"https://pith.science/api/pith-number/BWIAXPTYFCWBSFU2474RYSMEMR/graph.json","events_json":"https://pith.science/api/pith-number/BWIAXPTYFCWBSFU2474RYSMEMR/events.json","paper":"https://pith.science/paper/BWIAXPTY"},"agent_actions":{"view_html":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR","download_json":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR.json","view_paper":"https://pith.science/paper/BWIAXPTY","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2406.10774&json=true","fetch_graph":"https://pith.science/api/pith-number/BWIAXPTYFCWBSFU2474RYSMEMR/graph.json","fetch_events":"https://pith.science/api/pith-number/BWIAXPTYFCWBSFU2474RYSMEMR/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR/action/timestamp_anchor","attest_storage":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR/action/storage_attestation","attest_author":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR/action/author_attestation","sign_citation":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR/action/citation_signature","submit_replication":"https://pith.science/pith/BWIAXPTYFCWBSFU2474RYSMEMR/action/replication_record"}},"created_at":"2026-05-17T23:38:52.375678+00:00","updated_at":"2026-05-17T23:38:52.375678+00:00"}