{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:BWIAXPTYFCWBSFU2474RYSMEMR","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f1e057c54ef5bad63ddb13e05dfd763685b60792f4462cce20f8292321c5e5bb","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-06-16T01:33:02Z","title_canon_sha256":"d3fe8b74b07aadd7ad629906ecb00045739da53c44bcccc04a230ee451b8b27b"},"schema_version":"1.0","source":{"id":"2406.10774","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2406.10774","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"arxiv_version","alias_value":"2406.10774v2","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.10774","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"pith_short_12","alias_value":"BWIAXPTYFCWB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BWIAXPTYFCWBSFU2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BWIAXPTY","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:c0f1f83b3c42494387f45d8dde7a997e6dbacf0b2c659806288b1fcd2d3033e8","target":"graph","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"By only loading the Top-K critical KV cache pages for attention, Quest significantly speeds up self-attention without sacrificing accuracy. We show that Quest can achieve up to 2.23x self-attention speedup, which reduces inference latency by 7.03x while performing well on tasks with long dependencies with negligible accuracy loss."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the min/max key approximation per page, combined with query-vector scoring, reliably identifies the truly critical pages without dropping information that would change the final attention output on long-dependency tasks."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Quest speeds up long-context LLM self-attention by up to 2.23x via query-dependent selection of top-K critical KV cache pages, cutting overall latency by 7.03x with negligible accuracy loss."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention."}],"snapshot_sha256":"e17bf82a2fb63efa58f980c030e59b23bb06db738800a83ea02d5e926b469b52"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"As the demand for long-context large language models (LLMs) increases, models with context windows of up to 128K or 1M tokens are becoming increasingly prevalent. However, long-context LLM inference is challenging since the inference speed decreases significantly as the sequence length grows. This slowdown is primarily caused by loading a large KV cache during self-attention. Previous works have shown that a small portion of critical tokens will dominate the attention outcomes. However, we observe the criticality of a token highly depends on the query. To this end, we propose Quest, a query-aw","authors_text":"Baris Kasikci, Guangxuan Xiao, Jiaming Tang, Kan Zhu, Song Han, Yilong Zhao","cross_cats":["cs.LG"],"headline":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-06-16T01:33:02Z","title":"Quest: Query-Aware Sparsity for Efficient Long-Context LLM Inference"},"references":{"count":72,"internal_anchors":0,"resolved_work":72,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"I ntroducing the next generation of C laude","work_id":"c7fa93db-1d56-45f0-9974-a574f30b2ab1","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Longbench: A bilingual, multitask benchmark for long context understanding, 2023","work_id":"f3203315-6a3d-43f9-bcf6-e6e35491b6d8","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Y., Ermon, S., Rudra, A., and Ré, C","work_id":"1a200b80-d0ee-4c53-b99a-b3acee1d6777","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"A., and Gardner, M","work_id":"5c114c15-2f4d-40c1-be92-e0aa233c38f2","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Model tells you what to discard: Adaptive kv cache compression for llms, 2024","work_id":"f3be5c3d-0cd9-483d-b8d7-8f79555b0294","year":2024}],"snapshot_sha256":"f73df1206bea4501fe32d4ec39db4fdd570bf4d28259bc5ec322c1b40005c80d"},"source":{"id":"2406.10774","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T14:04:56.497780Z","id":"8f5d9e8d-d7dd-47a6-91d0-2b9b803af69f","model_set":{"reader":"grok-4.3"},"one_line_summary":"Quest speeds up long-context LLM self-attention by up to 2.23x via query-dependent selection of top-K critical KV cache pages, cutting overall latency by 7.03x with negligible accuracy loss.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Quest selects only the top-K critical KV cache pages using query vectors and min-max key bounds to accelerate long-context LLM attention.","strongest_claim":"By only loading the Top-K critical KV cache pages for attention, Quest significantly speeds up self-attention without sacrificing accuracy. We show that Quest can achieve up to 2.23x self-attention speedup, which reduces inference latency by 7.03x while performing well on tasks with long dependencies with negligible accuracy loss.","weakest_assumption":"That the min/max key approximation per page, combined with query-vector scoring, reliably identifies the truly critical pages without dropping information that would change the final attention output on long-dependency tasks."}},"verdict_id":"8f5d9e8d-d7dd-47a6-91d0-2b9b803af69f"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:c04ec6e98b1226e7d3bdc8e1986d1c02c415daff7dd2347ab6a4cc0f8585ed01","target":"record","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f1e057c54ef5bad63ddb13e05dfd763685b60792f4462cce20f8292321c5e5bb","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-06-16T01:33:02Z","title_canon_sha256":"d3fe8b74b07aadd7ad629906ecb00045739da53c44bcccc04a230ee451b8b27b"},"schema_version":"1.0","source":{"id":"2406.10774","kind":"arxiv","version":2}},"canonical_sha256":"0d900bbe7828ac19169ae7f91c49846449e053ca56781d809c9ea37308fe874f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0d900bbe7828ac19169ae7f91c49846449e053ca56781d809c9ea37308fe874f","first_computed_at":"2026-05-17T23:38:52.375607Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:52.375607Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"BwFxHeAIP7DRZH0tdrVZsaLkoq4zUH2mnfTEhLEOJGrNheGlsSGiN8wndHv+VgiKj/9tWwF8IZiPnDc/RNmsDA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:52.376057Z","signed_message":"canonical_sha256_bytes"},"source_id":"2406.10774","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:c04ec6e98b1226e7d3bdc8e1986d1c02c415daff7dd2347ab6a4cc0f8585ed01","sha256:c0f1f83b3c42494387f45d8dde7a997e6dbacf0b2c659806288b1fcd2d3033e8"],"state_sha256":"d94b3fef9fa292c03fdbcd401a9dfbdf31675489be87edbd278f9f59de9d0ab0"}