{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:P42XLZOB6RLJ2G7GSHYN375FG2","short_pith_number":"pith:P42XLZOB","schema_version":"1.0","canonical_sha256":"7f3575e5c1f4569d1be691f0ddffa53688acad4a13f73e1dd00d7f19ef719af9","source":{"kind":"arxiv","id":"2503.01840","version":3},"attestation_state":"computed","paper":{"title":"EAGLE-3: Scaling up Inference Acceleration of Large Language Models via Training-Time Test","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"By switching to direct token prediction and multi-layer feature fusion, EAGLE-3 enables draft models to improve with increased training data for faster LLM inference.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Chao Zhang, Fangyun Wei, Hongyang Zhang, Yuhui Li","submitted_at":"2025-03-03T18:59:04Z","abstract_excerpt":"The sequential nature of modern LLMs makes them expensive and slow, and speculative sampling has proven to be an effective solution to this problem. Methods like EAGLE perform autoregression at the feature level, reusing top-layer features from the target model to achieve better results than vanilla speculative sampling. A growing trend in the LLM community is scaling up training data to improve model intelligence without increasing inference costs. However, we observe that scaling up data provides limited improvements for EAGLE. We identify that this limitation arises from EAGLE's feature pre"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":true},"canonical_record":{"source":{"id":"2503.01840","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-03-03T18:59:04Z","cross_cats_sorted":[],"title_canon_sha256":"ed257265660a32c954a9acae0b22dea0f1f57b9ae342a852e7d66a928032e253","abstract_canon_sha256":"d96088f2e47dac06a0977b5581508b768372ecfe4e76df8f9b2ab36b7028e971"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:50.539795Z","signature_b64":"BI7Do+xY1kN7uhsdmx0Pjwuym3YBCHVMBAjq3UvlG8hWxW1pwYpKuD11dByz2A8c1Z42EC4wyWN4XHYr4CP3BQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7f3575e5c1f4569d1be691f0ddffa53688acad4a13f73e1dd00d7f19ef719af9","last_reissued_at":"2026-05-17T23:38:50.539247Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:50.539247Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"EAGLE-3: Scaling up Inference Acceleration of Large Language Models via Training-Time Test","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"By switching to direct token prediction and multi-layer feature fusion, EAGLE-3 enables draft models to improve with increased training data for faster LLM inference.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Chao Zhang, Fangyun Wei, Hongyang Zhang, Yuhui Li","submitted_at":"2025-03-03T18:59:04Z","abstract_excerpt":"The sequential nature of modern LLMs makes them expensive and slow, and speculative sampling has proven to be an effective solution to this problem. Methods like EAGLE perform autoregression at the feature level, reusing top-layer features from the target model to achieve better results than vanilla speculative sampling. A growing trend in the LLM community is scaling up training data to improve model intelligence without increasing inference costs. However, we observe that scaling up data provides limited improvements for EAGLE. We identify that this limitation arises from EAGLE's feature pre"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"EAGLE-3 abandons feature prediction in favor of direct token prediction and replaces reliance on top-layer features with multi-layer feature fusion via training-time test, significantly enhancing performance and enabling the draft model to fully benefit from scaling up training data, achieving up to 6.5x speedup and 1.4x over EAGLE-2.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That direct token prediction combined with multi-layer feature fusion via training-time test will remove the constraints that previously limited gains from scaling training data, without introducing new accuracy or stability issues in the draft model.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"EAGLE-3 reaches up to 6.5x LLM inference speedup by replacing feature prediction with direct token prediction and multi-layer fusion through training-time test.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"By switching to direct token prediction and multi-layer feature fusion, EAGLE-3 enables draft models to improve with increased training data for faster LLM inference.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"6cf639fb80120723be5de163add5a8aba12193891a4d2686572c5e14828ee6e6"},"source":{"id":"2503.01840","kind":"arxiv","version":3},"verdict":{"id":"f4dfb1d5-6f6f-44b5-abae-d7de670c8a0a","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T18:48:28.987599Z","strongest_claim":"EAGLE-3 abandons feature prediction in favor of direct token prediction and replaces reliance on top-layer features with multi-layer feature fusion via training-time test, significantly enhancing performance and enabling the draft model to fully benefit from scaling up training data, achieving up to 6.5x speedup and 1.4x over EAGLE-2.","one_line_summary":"EAGLE-3 reaches up to 6.5x LLM inference speedup by replacing feature prediction with direct token prediction and multi-layer fusion through training-time test.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That direct token prediction combined with multi-layer feature fusion via training-time test will remove the constraints that previously limited gains from scaling training data, without introducing new accuracy or stability issues in the draft model.","pith_extraction_headline":"By switching to direct token prediction and multi-layer feature fusion, EAGLE-3 enables draft models to improve with increased training data for faster LLM inference."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":1,"snapshot_sha256":"eea725488f826c92de03a8fb0152f69ddd473d002b240d7757c4797b32bd3efd"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2503.01840","created_at":"2026-05-17T23:38:50.539367+00:00"},{"alias_kind":"arxiv_version","alias_value":"2503.01840v3","created_at":"2026-05-17T23:38:50.539367+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2503.01840","created_at":"2026-05-17T23:38:50.539367+00:00"},{"alias_kind":"pith_short_12","alias_value":"P42XLZOB6RLJ","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"P42XLZOB6RLJ2G7G","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"P42XLZOB","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":34,"internal_anchor_count":34,"sample":[{"citing_arxiv_id":"2605.07243","citing_title":"SpecBlock: Block-Iterative Speculative Decoding with Dynamic Tree Drafting","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2601.07122","citing_title":"Enhancing Cloud Network Resilience via a Robust LLM-Empowered Multi-Agent Reinforcement Learning Framework","ref_index":60,"is_internal_anchor":true},{"citing_arxiv_id":"2601.21484","citing_title":"ETS: Energy-Guided Test-Time Scaling for Training-Free RL Alignment","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19893","citing_title":"SSV: Sparse Speculative Verification for Efficient LLM Inference","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09329","citing_title":"Test-Time Speculation","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12825","citing_title":"Orthrus: Memory-Efficient Parallel Token Generation via Dual-View Diffusion","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15609","citing_title":"PSD: Pushing the Pareto Frontier of Diffusion LLMs via Parallel Speculative Decoding","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16786","citing_title":"Lever: Speculative LLM Inference on Smartphones","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14978","citing_title":"Performance-Driven Policy Optimization for Speculative Decoding with Adaptive Windowing","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2507.01449","citing_title":"LogitSpec: Accelerating Retrieval-based Speculative Decoding via Next Next Token Speculation","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2511.13587","citing_title":"VVS: Accelerating Speculative Decoding for Visual Autoregressive Generation via Partial Verification Skipping","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2511.14617","citing_title":"Seer: Online Context Learning for Fast Synchronous LLM Reinforcement Learning","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2601.21484","citing_title":"ETS: Energy-Guided Test-Time Scaling for Training-Free RL Alignment","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2602.06019","citing_title":"Multi-Token Prediction via Self-Distillation","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2603.01581","citing_title":"KERV: Kinematic-Rectified Speculative Decoding for Embodied VLA Models","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2603.28049","citing_title":"Drift-AR: Single-Step Visual Autoregressive Generation via Anti-Symmetric Drifting","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12825","citing_title":"Orthrus: Memory-Efficient Parallel Token Generation via Dual-View Diffusion","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13778","citing_title":"Realtime-VLA FLASH: Speculative Inference Framework for Diffusion-based VLAs","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11093","citing_title":"Enabling Performant and Flexible Model-Internal Observability for LLM Inference","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11232","citing_title":"Rethinking LLMOps for Fraud and AML: Building a Compliance-Grade LLM Serving Stack","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26837","citing_title":"Unifying Sparse Attention with Hierarchical Memory for Scalable Long-Context LLM Serving","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26469","citing_title":"An Empirical Study of Speculative Decoding on Software Engineering Tasks","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08632","citing_title":"PARD-2: Target-Aligned Parallel Draft Model for Dual-Mode Speculative Decoding","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08862","citing_title":"BubbleSpec: Turning Long-Tail Bubbles into Speculative Rollout Drafts for Synchronous Reinforcement Learning","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10453","citing_title":"SlimSpec: Low-Rank Draft LM-Head for Accelerated Speculative Decoding","ref_index":9,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":1,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/P42XLZOB6RLJ2G7GSHYN375FG2","json":"https://pith.science/pith/P42XLZOB6RLJ2G7GSHYN375FG2.json","graph_json":"https://pith.science/api/pith-number/P42XLZOB6RLJ2G7GSHYN375FG2/graph.json","events_json":"https://pith.science/api/pith-number/P42XLZOB6RLJ2G7GSHYN375FG2/events.json","paper":"https://pith.science/paper/P42XLZOB"},"agent_actions":{"view_html":"https://pith.science/pith/P42XLZOB6RLJ2G7GSHYN375FG2","download_json":"https://pith.science/pith/P42XLZOB6RLJ2G7GSHYN375FG2.json","view_paper":"https://pith.science/paper/P42XLZOB","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2503.01840&json=true","fetch_graph":"https://pith.science/api/pith-number/P42XLZOB6RLJ2G7GSHYN375FG2/graph.json","fetch_events":"https://pith.science/api/pith-number/P42XLZOB6RLJ2G7GSHYN375FG2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/P42XLZOB6RLJ2G7GSHYN375FG2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/P42XLZOB6RLJ2G7GSHYN375FG2/action/storage_attestation","attest_author":"https://pith.science/pith/P42XLZOB6RLJ2G7GSHYN375FG2/action/author_attestation","sign_citation":"https://pith.science/pith/P42XLZOB6RLJ2G7GSHYN375FG2/action/citation_signature","submit_replication":"https://pith.science/pith/P42XLZOB6RLJ2G7GSHYN375FG2/action/replication_record"}},"created_at":"2026-05-17T23:38:50.539367+00:00","updated_at":"2026-05-17T23:38:50.539367+00:00"}