{"work":{"id":"19069633-beb6-4e5b-a373-e4becafff7eb","openalex_id":null,"doi":null,"arxiv_id":"2506.20670","raw_key":null,"title":"MMSearch-R1: Incentivizing LMMs to Search","authors":null,"authors_text":"Jinming Wu, Zihao Deng, Wei Li, Yiding Liu, Bo You, Bo Li","year":2025,"venue":"cs.CV","abstract":"Robust deployment of large multimodal models (LMMs) in real-world scenarios requires access to external knowledge sources, given the complexity and dynamic nature of real-world information. Existing approaches such as retrieval-augmented generation (RAG) and prompt engineered search agents rely on rigid pipelines, often leading to inefficient or excessive search behaviors. We present MMSearch-R1, the first end-to-end reinforcement learning framework that enables LMMs to perform on-demand, multi-turn search in real-world Internet environments. Our framework integrates both image and text search tools, allowing the model to reason about when and how to invoke them guided by an outcome-based reward with a search penalty. To support training, We collect a multimodal search VQA dataset through a semi-automated pipeline that covers diverse visual and textual knowledge needs and curate a search-balanced subset with both search-required and search-free samples, which proves essential for shaping efficient and on-demand search behavior. Extensive experiments on knowledge-intensive and info-seeking VQA tasks show that our model not only outperforms RAG-based baselines of the same model size, but also matches the performance of a larger RAG-based model while reducing search calls by over 30%. \nWe further analyze key empirical findings to offer actionable insights for advancing research in multimodal search.","external_url":"https://arxiv.org/abs/2506.20670","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-21T08:49:53.432268+00:00","pith_arxiv_id":"2506.20670","created_at":"2026-05-09T05:55:31.265041+00:00","updated_at":"2026-05-21T08:49:53.432268+00:00","title_quality_ok":true,"display_title":"MMSearch-R1: Incentivizing LMMs to Search","render_title":"MMSearch-R1: Incentivizing LMMs to Search"},"hub":{"state":{"work_id":"19069633-beb6-4e5b-a373-e4becafff7eb","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":20,"external_cited_by_count":null,"distinct_field_count":5,"first_pith_cited_at":"2025-05-28T08:17:57+00:00","last_pith_cited_at":"2026-05-18T07:03:48+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-27T16:17:47.628087+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":8},{"context_role":"baseline","n":2},{"context_role":"dataset","n":2}],"polarity_counts":[{"context_polarity":"background","n":7},{"context_polarity":"baseline","n":2},{"context_polarity":"use_dataset","n":2},{"context_polarity":"unclear","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}