{"work":{"id":"5baeaa33-5986-44a3-85a4-fcabd6fc1e8d","openalex_id":null,"doi":null,"arxiv_id":"2310.03744","raw_key":null,"title":"Improved Baselines with Visual Instruction Tuning","authors":null,"authors_text":"Haotian Liu, Chunyuan Li, Yuheng Li, Yong Jae Lee","year":2023,"venue":"cs.CV","abstract":"Large multimodal models (LMM) have recently shown encouraging progress with visual instruction tuning. In this note, we show that the fully-connected vision-language cross-modal connector in LLaVA is surprisingly powerful and data-efficient. With simple modifications to LLaVA, namely, using CLIP-ViT-L-336px with an MLP projection and adding academic-task-oriented VQA data with simple response formatting prompts, we establish stronger baselines that achieve state-of-the-art across 11 benchmarks. Our final 13B checkpoint uses merely 1.2M publicly available data, and finishes full training in ~1 day on a single 8-A100 node. We hope this can make state-of-the-art LMM research more accessible. Code and model will be publicly available.","external_url":"https://arxiv.org/abs/2310.03744","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-14T23:07:42.882851+00:00","pith_arxiv_id":"2310.03744","created_at":"2026-05-09T19:05:10.225736+00:00","updated_at":"2026-05-14T23:07:42.882851+00:00","title_quality_ok":true,"display_title":"Improved Baselines with Visual Instruction Tuning","render_title":"Improved Baselines with Visual Instruction Tuning"},"hub":{"state":{"work_id":"5baeaa33-5986-44a3-85a4-fcabd6fc1e8d","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":37,"external_cited_by_count":null,"distinct_field_count":5,"first_pith_cited_at":"2023-03-28T17:59:12+00:00","last_pith_cited_at":"2026-05-12T19:25:17+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-14T23:46:21.075985+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":4},{"context_role":"method","n":1}],"polarity_counts":[{"context_polarity":"background","n":4},{"context_polarity":"use_method","n":1}],"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T18:20:03.577515+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":14},{"title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models","work_id":"806d2e73-71b3-4d56-87e0-39d571cc15d6","shared_citers":13},{"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","shared_citers":11},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":10},{"title":"Visual Instruction Tuning","work_id":"68be622d-a6dc-4a13-82de-e3054a3dc509","shared_citers":10},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":9},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":9},{"title":"InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning","work_id":"f3aac728-ded0-4e55-aa9e-4a1635d4313d","shared_citers":9},{"title":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models","work_id":"a7e3a737-e007-42bc-be89-c4d34c5ee071","shared_citers":9},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":9},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":9},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":8},{"title":"BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models","work_id":"63d03f4d-15f4-4583-8286-913c19f02294","shared_citers":7},{"title":"MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities","work_id":"7f3bac41-a0a5-4a7a-bfd2-526b616db745","shared_citers":7},{"title":"ShareGPT4V: Improving Large Multi-Modal Models with Better Captions","work_id":"90e2b26a-3d27-4567-86b5-929b582a8034","shared_citers":7},{"title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","work_id":"80e3e977-f1bb-4c83-8d0c-1ab0a0c5c3f1","shared_citers":6},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":6},{"title":"Mitigating Hallucination in Large Multi-Modal Models via Robust Instruction Tuning","work_id":"ba8e8164-e47f-42d6-83ad-696cb57ee79a","shared_citers":6},{"title":"Monkey: Image resolution and text label are important things for large multi-modal models","work_id":"1b51b65b-5659-4d2a-b5b3-0a8ac7f88ed5","shared_citers":6},{"title":"Baichuan 2: Open large-scale language models","work_id":"9ba8f898-3900-4776-b82e-11e767a86ba9","shared_citers":5},{"title":"Cogvlm: Visual expert for pretrained language models","work_id":"0d81fb99-dae6-46d2-8bed-c01dcbd7d7cf","shared_citers":5},{"title":"How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites","work_id":"3714835e-c5a6-4d7e-950c-be44670ed9e6","shared_citers":5},{"title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks","work_id":"d9e035c7-9e23-4cc2-ad3e-be080fbbf2d9","shared_citers":5},{"title":"Llama-adapter v2: Parameter-efficient visual instruction model","work_id":"0fe2cfd8-d442-4ceb-b1a9-a465704f39b2","shared_citers":5}],"time_series":[{"n":5,"year":2023},{"n":9,"year":2024},{"n":3,"year":2025},{"n":17,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T18:19:55.114242+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T18:19:55.068834+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Improved Baselines with Visual Instruction Tuning","claims":[{"claim_text":"Large multimodal models (LMM) have recently shown encouraging progress with visual instruction tuning. In this note, we show that the fully-connected vision-language cross-modal connector in LLaVA is surprisingly powerful and data-efficient. With simple modifications to LLaVA, namely, using CLIP-ViT-L-336px with an MLP projection and adding academic-task-oriented VQA data with simple response formatting prompts, we establish stronger baselines that achieve state-of-the-art across 11 benchmarks. Our final 13B checkpoint uses merely 1.2M publicly available data, and finishes full training in ~1 ","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Improved Baselines with Visual Instruction Tuning because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:19:59.691603+00:00"}},"summary":{"title":"Improved Baselines with Visual Instruction Tuning","claims":[{"claim_text":"Large multimodal models (LMM) have recently shown encouraging progress with visual instruction tuning. In this note, we show that the fully-connected vision-language cross-modal connector in LLaVA is surprisingly powerful and data-efficient. With simple modifications to LLaVA, namely, using CLIP-ViT-L-336px with an MLP projection and adding academic-task-oriented VQA data with simple response formatting prompts, we establish stronger baselines that achieve state-of-the-art across 11 benchmarks. Our final 13B checkpoint uses merely 1.2M publicly available data, and finishes full training in ~1 ","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Improved Baselines with Visual Instruction Tuning because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":14},{"title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models","work_id":"806d2e73-71b3-4d56-87e0-39d571cc15d6","shared_citers":13},{"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","shared_citers":11},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":10},{"title":"Visual Instruction Tuning","work_id":"68be622d-a6dc-4a13-82de-e3054a3dc509","shared_citers":10},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":9},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":9},{"title":"InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning","work_id":"f3aac728-ded0-4e55-aa9e-4a1635d4313d","shared_citers":9},{"title":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models","work_id":"a7e3a737-e007-42bc-be89-c4d34c5ee071","shared_citers":9},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":9},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":9},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":8},{"title":"BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models","work_id":"63d03f4d-15f4-4583-8286-913c19f02294","shared_citers":7},{"title":"MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities","work_id":"7f3bac41-a0a5-4a7a-bfd2-526b616db745","shared_citers":7},{"title":"ShareGPT4V: Improving Large Multi-Modal Models with Better Captions","work_id":"90e2b26a-3d27-4567-86b5-929b582a8034","shared_citers":7},{"title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","work_id":"80e3e977-f1bb-4c83-8d0c-1ab0a0c5c3f1","shared_citers":6},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":6},{"title":"Mitigating Hallucination in Large Multi-Modal Models via Robust Instruction Tuning","work_id":"ba8e8164-e47f-42d6-83ad-696cb57ee79a","shared_citers":6},{"title":"Monkey: Image resolution and text label are important things for large multi-modal models","work_id":"1b51b65b-5659-4d2a-b5b3-0a8ac7f88ed5","shared_citers":6},{"title":"Baichuan 2: Open large-scale language models","work_id":"9ba8f898-3900-4776-b82e-11e767a86ba9","shared_citers":5},{"title":"Cogvlm: Visual expert for pretrained language models","work_id":"0d81fb99-dae6-46d2-8bed-c01dcbd7d7cf","shared_citers":5},{"title":"How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites","work_id":"3714835e-c5a6-4d7e-950c-be44670ed9e6","shared_citers":5},{"title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks","work_id":"d9e035c7-9e23-4cc2-ad3e-be080fbbf2d9","shared_citers":5},{"title":"Llama-adapter v2: Parameter-efficient visual instruction model","work_id":"0fe2cfd8-d442-4ceb-b1a9-a465704f39b2","shared_citers":5}],"time_series":[{"n":5,"year":2023},{"n":9,"year":2024},{"n":3,"year":2025},{"n":17,"year":2026}],"dependency_candidates":[]},"authors":[]}}