{"work":{"id":"69eba45e-8338-46b7-b28a-e99bb687af56","openalex_id":null,"doi":null,"arxiv_id":"1805.02867","raw_key":null,"title":"Online normalizer calculation for softmax","authors":null,"authors_text":"Maxim Milakov and Natalia Gimelshein","year":2018,"venue":"cs.PF","abstract":"The Softmax function is ubiquitous in machine learning, multiple previous works suggested faster alternatives for it. In this paper we propose a way to compute classical Softmax with fewer memory accesses and hypothesize that this reduction in memory accesses should improve Softmax performance on actual hardware. The benchmarks confirm this hypothesis: Softmax accelerates by up to 1.3x and Softmax+TopK combined and fused by up to 5x.","external_url":"https://arxiv.org/abs/1805.02867","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-23T16:58:11.923454+00:00","pith_arxiv_id":"1805.02867","created_at":"2026-05-10T07:52:14.201811+00:00","updated_at":"2026-06-05T21:23:00.469572+00:00","title_quality_ok":true,"display_title":"Online normalizer calculation for softmax","render_title":"Online normalizer calculation for softmax"},"hub":{"state":{"work_id":"69eba45e-8338-46b7-b28a-e99bb687af56","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":23,"external_cited_by_count":null,"distinct_field_count":6,"first_pith_cited_at":"2022-05-27T17:53:09+00:00","last_pith_cited_at":"2026-04-28T07:13:26+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-11T04:17:34.446569+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":6}],"polarity_counts":[{"context_polarity":"background","n":6}],"runs":{},"summary":{},"graph":{},"authors":[]}}