{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:JXMVLSVZ7B4A7F3JLM5PIYYILY","short_pith_number":"pith:JXMVLSVZ","schema_version":"1.0","canonical_sha256":"4dd955cab9f8780f97695b3af463085e19ab1f7da6f922959e527872d10dff1e","source":{"kind":"arxiv","id":"1711.00489","version":2},"attestation_state":"computed","paper":{"title":"Don't Decay the Learning Rate, Increase the Batch Size","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.DC","stat.ML"],"primary_cat":"cs.LG","authors_text":"Chris Ying, Pieter-Jan Kindermans, Quoc V. Le, Samuel L. Smith","submitted_at":"2017-11-01T18:04:31Z","abstract_excerpt":"It is common practice to decay the learning rate. Here we show one can usually obtain the same learning curve on both training and test sets by instead increasing the batch size during training. This procedure is successful for stochastic gradient descent (SGD), SGD with momentum, Nesterov momentum, and Adam. It reaches equivalent test accuracies after the same number of training epochs, but with fewer parameter updates, leading to greater parallelism and shorter training times. We can further reduce the number of parameter updates by increasing the learning rate $\\epsilon$ and scaling the bat"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1711.00489","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2017-11-01T18:04:31Z","cross_cats_sorted":["cs.CV","cs.DC","stat.ML"],"title_canon_sha256":"af88ba0d8150a55a7cb0eac82afb70a51ef80b534d39ccdf8425d27d0cd268aa","abstract_canon_sha256":"39621e4a5702f2758ab584ed0e53cf446646b46bdec75751d49d22f4da1a8cb5"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:22:39.281673Z","signature_b64":"1pP9i0ZXlAi88HeXvhwF0r66J0FAYJK9LthgXfiwaaVrGj8cAQL9ncn0PFjT0f8xdEppB/pTSBtXfx6DVpIMAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4dd955cab9f8780f97695b3af463085e19ab1f7da6f922959e527872d10dff1e","last_reissued_at":"2026-05-18T00:22:39.281193Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:22:39.281193Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Don't Decay the Learning Rate, Increase the Batch Size","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.DC","stat.ML"],"primary_cat":"cs.LG","authors_text":"Chris Ying, Pieter-Jan Kindermans, Quoc V. Le, Samuel L. Smith","submitted_at":"2017-11-01T18:04:31Z","abstract_excerpt":"It is common practice to decay the learning rate. Here we show one can usually obtain the same learning curve on both training and test sets by instead increasing the batch size during training. This procedure is successful for stochastic gradient descent (SGD), SGD with momentum, Nesterov momentum, and Adam. It reaches equivalent test accuracies after the same number of training epochs, but with fewer parameter updates, leading to greater parallelism and shorter training times. We can further reduce the number of parameter updates by increasing the learning rate $\\epsilon$ and scaling the bat"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1711.00489","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1711.00489","created_at":"2026-05-18T00:22:39.281267+00:00"},{"alias_kind":"arxiv_version","alias_value":"1711.00489v2","created_at":"2026-05-18T00:22:39.281267+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1711.00489","created_at":"2026-05-18T00:22:39.281267+00:00"},{"alias_kind":"pith_short_12","alias_value":"JXMVLSVZ7B4A","created_at":"2026-05-18T12:31:24.725408+00:00"},{"alias_kind":"pith_short_16","alias_value":"JXMVLSVZ7B4A7F3J","created_at":"2026-05-18T12:31:24.725408+00:00"},{"alias_kind":"pith_short_8","alias_value":"JXMVLSVZ","created_at":"2026-05-18T12:31:24.725408+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":15,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2212.08989","citing_title":"Deep learning applied to computational mechanics: A comprehensive review, state of the art, and the classics","ref_index":166,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21557","citing_title":"Scalable On-Policy Reinforcement Learning via Adaptive Batch Scaling","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"1904.00962","citing_title":"Large Batch Optimization for Deep Learning: Training BERT in 76 minutes","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2507.00432","citing_title":"Does Math Reasoning Improve General LLM Capabilities? Understanding Transferability of LLM Reasoning","ref_index":242,"is_internal_anchor":true},{"citing_arxiv_id":"2102.01293","citing_title":"Scaling Laws for Transfer","ref_index":153,"is_internal_anchor":true},{"citing_arxiv_id":"2602.04774","citing_title":"Theory of Optimal Learning Rate Schedules and Scaling Laws for a Random Feature Model","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2404.06395","citing_title":"MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies","ref_index":37,"is_internal_anchor":false},{"citing_arxiv_id":"2605.11255","citing_title":"HEBATRON: A Hebrew-Specialized Open-Weight Mixture-of-Experts Language Model","ref_index":36,"is_internal_anchor":false},{"citing_arxiv_id":"2605.10577","citing_title":"Training continuously-coupled reconfigurable photonic chips with quantum machine learning","ref_index":64,"is_internal_anchor":false},{"citing_arxiv_id":"2112.00861","citing_title":"A General Language Assistant as a Laboratory for Alignment","ref_index":195,"is_internal_anchor":false},{"citing_arxiv_id":"2401.02954","citing_title":"DeepSeek LLM: Scaling Open-Source Language Models with Longtermism","ref_index":159,"is_internal_anchor":false},{"citing_arxiv_id":"2405.04434","citing_title":"DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model","ref_index":157,"is_internal_anchor":false},{"citing_arxiv_id":"2604.04681","citing_title":"Batch Loss Score for Dynamic Data Pruning","ref_index":42,"is_internal_anchor":false},{"citing_arxiv_id":"2205.01068","citing_title":"OPT: Open Pre-trained Transformer Language Models","ref_index":88,"is_internal_anchor":false},{"citing_arxiv_id":"2207.05221","citing_title":"Language Models (Mostly) Know What They Know","ref_index":273,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/JXMVLSVZ7B4A7F3JLM5PIYYILY","json":"https://pith.science/pith/JXMVLSVZ7B4A7F3JLM5PIYYILY.json","graph_json":"https://pith.science/api/pith-number/JXMVLSVZ7B4A7F3JLM5PIYYILY/graph.json","events_json":"https://pith.science/api/pith-number/JXMVLSVZ7B4A7F3JLM5PIYYILY/events.json","paper":"https://pith.science/paper/JXMVLSVZ"},"agent_actions":{"view_html":"https://pith.science/pith/JXMVLSVZ7B4A7F3JLM5PIYYILY","download_json":"https://pith.science/pith/JXMVLSVZ7B4A7F3JLM5PIYYILY.json","view_paper":"https://pith.science/paper/JXMVLSVZ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1711.00489&json=true","fetch_graph":"https://pith.science/api/pith-number/JXMVLSVZ7B4A7F3JLM5PIYYILY/graph.json","fetch_events":"https://pith.science/api/pith-number/JXMVLSVZ7B4A7F3JLM5PIYYILY/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/JXMVLSVZ7B4A7F3JLM5PIYYILY/action/timestamp_anchor","attest_storage":"https://pith.science/pith/JXMVLSVZ7B4A7F3JLM5PIYYILY/action/storage_attestation","attest_author":"https://pith.science/pith/JXMVLSVZ7B4A7F3JLM5PIYYILY/action/author_attestation","sign_citation":"https://pith.science/pith/JXMVLSVZ7B4A7F3JLM5PIYYILY/action/citation_signature","submit_replication":"https://pith.science/pith/JXMVLSVZ7B4A7F3JLM5PIYYILY/action/replication_record"}},"created_at":"2026-05-18T00:22:39.281267+00:00","updated_at":"2026-05-18T00:22:39.281267+00:00"}