{"work":{"id":"e96730e3-129b-4db6-b981-15ab7932e297","openalex_id":null,"doi":null,"arxiv_id":"2010.11929","raw_key":null,"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","authors":null,"authors_text":"Dosovitskiy, A","year":2020,"venue":"cs.CV","abstract":"While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.","external_url":"https://arxiv.org/abs/2010.11929","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-14T20:39:26.614720+00:00","pith_arxiv_id":"2010.11929","created_at":"2026-05-08T23:09:28.608863+00:00","updated_at":"2026-05-14T20:39:26.614720+00:00","title_quality_ok":true,"display_title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","render_title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale"},"hub":{"state":{"work_id":"e96730e3-129b-4db6-b981-15ab7932e297","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":423,"external_cited_by_count":null,"distinct_field_count":41,"first_pith_cited_at":"2021-06-15T16:02:37+00:00","last_pith_cited_at":"2026-05-13T17:56:23+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-14T20:46:11.073542+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":6},{"context_role":"method","n":1}],"polarity_counts":[{"context_polarity":"background","n":6},{"context_polarity":"use_method","n":1}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","claims":[{"claim_text":"While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple m","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T18:23:29.967809+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"77befadc-4c56-4673-a37a-30b40d1d27bc","orcid":null,"display_name":"Dosovitskiy"}]},"error":null,"updated_at":"2026-05-13T18:23:29.965349+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T18:23:29.853712+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":55},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":53},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":30},{"title":"Very Deep Convolutional Networks for Large-Scale Image Recognition","work_id":"1c4b4409-c14b-488b-a086-c57a5aab8a29","shared_citers":25},{"title":"DINOv3","work_id":"c8b07deb-8fe7-4e18-9620-f3569d3529ce","shared_citers":22},{"title":"Attention Is All You Need","work_id":"baafb5a2-5272-43bc-932f-09fa9ffe5316","shared_citers":21},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":20},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":19},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":18},{"title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","work_id":"ed240a10-5b19-406c-baa5-30803f465785","shared_citers":17},{"title":"SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features","work_id":"50eec732-2d41-432f-9dcf-ac7fff235ea5","shared_citers":17},{"title":"Layer Normalization","work_id":"20a2d720-0046-4c7c-bcd6-327ec8143f69","shared_citers":16},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":16},{"title":"Auto-Encoding Variational Bayes","work_id":"97d95295-30e1-42b4-bbf6-85f0fa4edb44","shared_citers":15},{"title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces","work_id":"4ee75248-1199-492c-a52f-6661e0f4adff","shared_citers":15},{"title":"Representation Learning with Contrastive Predictive Coding","work_id":"7b08a1d4-d565-424e-9c86-6ef244b7b90a","shared_citers":14},{"title":"The Kinetics Human Action Video Dataset","work_id":"c8a3de61-cfd3-4aeb-bcf7-a0372c015748","shared_citers":14},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":13},{"title":"Vision Transformers Need Registers","work_id":"57106da4-5420-4778-94eb-e821589aa7a0","shared_citers":13},{"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","shared_citers":12},{"title":"Denoising Diffusion Implicit Models","work_id":"8fa2128b-d18c-405c-ac92-0e669cf89ac0","shared_citers":12},{"title":"Gaussian Error Linear Units (GELUs)","work_id":"0466fd22-03a1-4a61-af0a-a900e77bb023","shared_citers":12},{"title":"GLU Variants Improve Transformer","work_id":"17d0763c-1016-41ab-a478-478e890765eb","shared_citers":12},{"title":"Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model","work_id":"bd81352e-a64f-4720-9f76-ddda0ea9af83","shared_citers":12}],"time_series":[{"n":3,"year":2021},{"n":5,"year":2022},{"n":6,"year":2023},{"n":7,"year":2024},{"n":3,"year":2025},{"n":369,"year":2026}]},"error":null,"updated_at":"2026-05-13T17:25:56.007823+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T18:23:29.192533+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","claims":[{"claim_text":"While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple m","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T18:23:29.857242+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","claims":[{"claim_text":"While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple m","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:25:52.721283+00:00"}},"summary":{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","claims":[{"claim_text":"While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple m","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":55},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":53},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":30},{"title":"Very Deep Convolutional Networks for Large-Scale Image Recognition","work_id":"1c4b4409-c14b-488b-a086-c57a5aab8a29","shared_citers":25},{"title":"DINOv3","work_id":"c8b07deb-8fe7-4e18-9620-f3569d3529ce","shared_citers":22},{"title":"Attention Is All You Need","work_id":"baafb5a2-5272-43bc-932f-09fa9ffe5316","shared_citers":21},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":20},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":19},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":18},{"title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","work_id":"ed240a10-5b19-406c-baa5-30803f465785","shared_citers":17},{"title":"SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features","work_id":"50eec732-2d41-432f-9dcf-ac7fff235ea5","shared_citers":17},{"title":"Layer Normalization","work_id":"20a2d720-0046-4c7c-bcd6-327ec8143f69","shared_citers":16},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":16},{"title":"Auto-Encoding Variational Bayes","work_id":"97d95295-30e1-42b4-bbf6-85f0fa4edb44","shared_citers":15},{"title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces","work_id":"4ee75248-1199-492c-a52f-6661e0f4adff","shared_citers":15},{"title":"Representation Learning with Contrastive Predictive Coding","work_id":"7b08a1d4-d565-424e-9c86-6ef244b7b90a","shared_citers":14},{"title":"The Kinetics Human Action Video Dataset","work_id":"c8a3de61-cfd3-4aeb-bcf7-a0372c015748","shared_citers":14},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":13},{"title":"Vision Transformers Need Registers","work_id":"57106da4-5420-4778-94eb-e821589aa7a0","shared_citers":13},{"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","shared_citers":12},{"title":"Denoising Diffusion Implicit Models","work_id":"8fa2128b-d18c-405c-ac92-0e669cf89ac0","shared_citers":12},{"title":"Gaussian Error Linear Units (GELUs)","work_id":"0466fd22-03a1-4a61-af0a-a900e77bb023","shared_citers":12},{"title":"GLU Variants Improve Transformer","work_id":"17d0763c-1016-41ab-a478-478e890765eb","shared_citers":12},{"title":"Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model","work_id":"bd81352e-a64f-4720-9f76-ddda0ea9af83","shared_citers":12}],"time_series":[{"n":3,"year":2021},{"n":5,"year":2022},{"n":6,"year":2023},{"n":7,"year":2024},{"n":3,"year":2025},{"n":369,"year":2026}]},"authors":[{"id":"77befadc-4c56-4673-a37a-30b40d1d27bc","orcid":null,"display_name":"Dosovitskiy","source":"manual","import_confidence":0.72}]}}