{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2016:XWBREOWAINH2367UQNKRHZIJLY","short_pith_number":"pith:XWBREOWA","schema_version":"1.0","canonical_sha256":"bd83123ac0434fadfbf4835513e5095e01fe56e1416e53ee9644751aafbbef6a","source":{"kind":"arxiv","id":"1610.02132","version":4},"attestation_state":"computed","paper":{"title":"QSGD: Communication-Efficient SGD via Gradient Quantization and Encoding","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.DS"],"primary_cat":"cs.LG","authors_text":"Dan Alistarh, Demjan Grubic, Jerry Li, Milan Vojnovic, Ryota Tomioka","submitted_at":"2016-10-07T03:44:34Z","abstract_excerpt":"Parallel implementations of stochastic gradient descent (SGD) have received significant research attention, thanks to excellent scalability properties of this algorithm, and to its efficiency in the context of training deep neural networks. A fundamental barrier for parallelizing large-scale SGD is the fact that the cost of communicating the gradient updates between nodes can be very large. Consequently, lossy compression heuristics have been proposed, by which nodes only communicate quantized gradients. Although effective in practice, these heuristics do not always provably converge, and it i"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1610.02132","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2016-10-07T03:44:34Z","cross_cats_sorted":["cs.DS"],"title_canon_sha256":"824ac99066239ccb15c2319a0bb9ae2c90ccdfeea423699f73873f9e92a6c22e","abstract_canon_sha256":"bbebbbde7593a0cb60d7e6baca2823a8cf12c720ed1654e841d7cfafbfcb807f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:28:41.216095Z","signature_b64":"0sisEuDzyw+GXQb0DhAf8LGs1mWudvyivNeUO60JTvhIqzIBUw2CE69umKT/8sBvOQYoFrrya15zPcRgH2ZCAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"bd83123ac0434fadfbf4835513e5095e01fe56e1416e53ee9644751aafbbef6a","last_reissued_at":"2026-05-18T00:28:41.215275Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:28:41.215275Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"QSGD: Communication-Efficient SGD via Gradient Quantization and Encoding","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.DS"],"primary_cat":"cs.LG","authors_text":"Dan Alistarh, Demjan Grubic, Jerry Li, Milan Vojnovic, Ryota Tomioka","submitted_at":"2016-10-07T03:44:34Z","abstract_excerpt":"Parallel implementations of stochastic gradient descent (SGD) have received significant research attention, thanks to excellent scalability properties of this algorithm, and to its efficiency in the context of training deep neural networks. A fundamental barrier for parallelizing large-scale SGD is the fact that the cost of communicating the gradient updates between nodes can be very large. Consequently, lossy compression heuristics have been proposed, by which nodes only communicate quantized gradients. Although effective in practice, these heuristics do not always provably converge, and it i"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1610.02132","kind":"arxiv","version":4},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1610.02132","created_at":"2026-05-18T00:28:41.215434+00:00"},{"alias_kind":"arxiv_version","alias_value":"1610.02132v4","created_at":"2026-05-18T00:28:41.215434+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1610.02132","created_at":"2026-05-18T00:28:41.215434+00:00"},{"alias_kind":"pith_short_12","alias_value":"XWBREOWAINH2","created_at":"2026-05-18T12:30:51.357362+00:00"},{"alias_kind":"pith_short_16","alias_value":"XWBREOWAINH2367U","created_at":"2026-05-18T12:30:51.357362+00:00"},{"alias_kind":"pith_short_8","alias_value":"XWBREOWA","created_at":"2026-05-18T12:30:51.357362+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":7,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2509.03472","citing_title":"DPQuant: Efficient and Differentially-Private Model Training via Dynamic Quantization Scheduling","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12396","citing_title":"NCCLZ: Compression-Enabled GPU Collectives with Decoupled Quantization and Entropy Coding","ref_index":24,"is_internal_anchor":false},{"citing_arxiv_id":"1610.05492","citing_title":"Federated Learning: Strategies for Improving Communication Efficiency","ref_index":2,"is_internal_anchor":false},{"citing_arxiv_id":"2604.25467","citing_title":"Subspace Optimization for Efficient Federated Learning under Heterogeneous Data","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2604.25550","citing_title":"Enhancing SignSGD: Small-Batch Convergence Analysis and a Hybrid Switching Strategy","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2605.01989","citing_title":"DBLP: Phase-Aware Bounded-Loss Transport for Burst-Resilient Distributed ML Training","ref_index":10,"is_internal_anchor":false},{"citing_arxiv_id":"2604.21428","citing_title":"Decoupled DiLoCo for Resilient Distributed Pre-training","ref_index":1,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XWBREOWAINH2367UQNKRHZIJLY","json":"https://pith.science/pith/XWBREOWAINH2367UQNKRHZIJLY.json","graph_json":"https://pith.science/api/pith-number/XWBREOWAINH2367UQNKRHZIJLY/graph.json","events_json":"https://pith.science/api/pith-number/XWBREOWAINH2367UQNKRHZIJLY/events.json","paper":"https://pith.science/paper/XWBREOWA"},"agent_actions":{"view_html":"https://pith.science/pith/XWBREOWAINH2367UQNKRHZIJLY","download_json":"https://pith.science/pith/XWBREOWAINH2367UQNKRHZIJLY.json","view_paper":"https://pith.science/paper/XWBREOWA","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1610.02132&json=true","fetch_graph":"https://pith.science/api/pith-number/XWBREOWAINH2367UQNKRHZIJLY/graph.json","fetch_events":"https://pith.science/api/pith-number/XWBREOWAINH2367UQNKRHZIJLY/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XWBREOWAINH2367UQNKRHZIJLY/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XWBREOWAINH2367UQNKRHZIJLY/action/storage_attestation","attest_author":"https://pith.science/pith/XWBREOWAINH2367UQNKRHZIJLY/action/author_attestation","sign_citation":"https://pith.science/pith/XWBREOWAINH2367UQNKRHZIJLY/action/citation_signature","submit_replication":"https://pith.science/pith/XWBREOWAINH2367UQNKRHZIJLY/action/replication_record"}},"created_at":"2026-05-18T00:28:41.215434+00:00","updated_at":"2026-05-18T00:28:41.215434+00:00"}