{"work":{"id":"78d498ce-11db-4f88-8eb0-40e0f86af615","openalex_id":null,"doi":null,"arxiv_id":"1611.09268","raw_key":null,"title":"MS MARCO: A Human Generated MAchine Reading COmprehension Dataset","authors":null,"authors_text":"Payal Bajaj, Daniel Campos, Nick Craswell, Li Deng, Jianfeng Gao, Xiaodong Liu","year":2016,"venue":"cs.CL","abstract":"We introduce a large scale MAchine Reading COmprehension dataset, which we name MS MARCO. The dataset comprises of 1,010,916 anonymized questions---sampled from Bing's search query logs---each with a human generated answer and 182,669 completely human rewritten generated answers. In addition, the dataset contains 8,841,823 passages---extracted from 3,563,535 web documents retrieved by Bing---that provide the information necessary for curating the natural language answers. A question in the MS MARCO dataset may have multiple answers or no answers at all. Using this dataset, we propose three different tasks with varying levels of difficulty: (i) predict if a question is answerable given a set of context passages, and extract and synthesize the answer as a human would (ii) generate a well-formed answer (if possible) based on the context passages that can be understood with the question and passage context, and finally (iii) rank a set of retrieved passages given a question. The size of the dataset and the fact that the questions are derived from real user search queries distinguishes MS MARCO from other well-known publicly available datasets for machine reading comprehension and question-answering. We believe that the scale and the real-world nature of this dataset makes it attractive for benchmarking machine reading comprehension and question-answering models.","external_url":"https://arxiv.org/abs/1611.09268","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-14T22:03:02.298399+00:00","pith_arxiv_id":"1611.09268","created_at":"2026-05-10T09:18:32.460137+00:00","updated_at":"2026-05-14T22:03:02.298399+00:00","title_quality_ok":true,"display_title":"MS MARCO: A Human Generated MAchine Reading COmprehension Dataset","render_title":"MS MARCO: A Human Generated MAchine Reading COmprehension Dataset"},"hub":{"state":{"work_id":"78d498ce-11db-4f88-8eb0-40e0f86af615","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":37,"external_cited_by_count":null,"distinct_field_count":5,"first_pith_cited_at":"2017-05-09T21:35:07+00:00","last_pith_cited_at":"2026-05-12T04:47:48+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-15T02:36:20.757324+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":1},{"context_role":"dataset","n":1}],"polarity_counts":[{"context_polarity":"background","n":1},{"context_polarity":"use_dataset","n":1}],"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T17:59:27.354904+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models","work_id":"bab684a8-d933-426c-a19e-2c855a0d1f59","shared_citers":6},{"title":"BEIR: A Heterogenous Benchmark for Zero-shot Evaluation of Information Retrieval Models","work_id":"c5f7f027-ac36-4b07-b824-0eca2f310641","shared_citers":5},{"title":"BERT : Pre-training of deep bidirectional transformers for language understanding","work_id":"3e3c8ac8-b858-4b22-af32-393d98c883e0","shared_citers":5},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":5},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":5},{"title":"Towards General Text Embeddings with Multi-stage Contrastive Learning","work_id":"861a61de-66fe-49d1-b1ab-11f8b082a4cc","shared_citers":5},{"title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","work_id":"ed240a10-5b19-406c-baa5-30803f465785","shared_citers":4},{"title":"Dai, Jakob Uszkoreit, Quoc Le, and Slav Petrov","work_id":"45551929-96dc-40f3-9f89-10e76731cc24","shared_citers":4},{"title":"Dense passage retrieval for open-domain question answering","work_id":"083391f8-812d-430f-8d08-89a03031ce6c","shared_citers":4},{"title":"FEVER: a large-scale dataset for Fact Extraction and VERification","work_id":"b696f75f-e5ad-4555-9c12-e292e77c388f","shared_citers":4},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":4},{"title":"Text Embeddings by Weakly-Supervised Contrastive Pre-training","work_id":"789cc674-467e-4f23-bb50-05c79fe8c4c2","shared_citers":4},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":3},{"title":"arXiv preprint arXiv:2508.21038 , year=","work_id":"60ea8489-5f5f-4e76-8d00-eba854bf33c9","shared_citers":3},{"title":"CoRRabs/2003.07820(2020), https://arxiv.org/ abs/2003.07820","work_id":"083b288a-95a1-4846-959c-e69b87d8885c","shared_citers":3},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":3},{"title":"Diffusion-pretrained dense and contextual embeddings","work_id":"f7af1c9e-b29d-4882-9d1d-54df00af72d7","shared_citers":3},{"title":"doi: 10.18653/v1/ 2024.findings-acl.586","work_id":"8d675bdd-79ca-48d6-9163-fc17ce0e8ece","shared_citers":3},{"title":"doi: 10.18653/v1/D16-1264","work_id":"8e6a63f7-90ad-4b5e-8493-c26145f74b69","shared_citers":3},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":3},{"title":"Joshi, E","work_id":"d05a9c57-9d88-473a-aa65-efb13f9dee25","shared_citers":3},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":3},{"title":"Passage Re-ranking with BERT","work_id":"562fbfab-d6fe-48e1-a06d-e5d078c70945","shared_citers":3},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":3}],"time_series":[{"n":1,"year":2017},{"n":2,"year":2019},{"n":1,"year":2020},{"n":1,"year":2021},{"n":1,"year":2022},{"n":1,"year":2023},{"n":2,"year":2024},{"n":1,"year":2025},{"n":26,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T17:59:32.320975+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T17:59:32.198325+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"MS MARCO: A Human Generated MAchine Reading COmprehension Dataset","claims":[{"claim_text":"We introduce a large scale MAchine Reading COmprehension dataset, which we name MS MARCO. The dataset comprises of 1,010,916 anonymized questions---sampled from Bing's search query logs---each with a human generated answer and 182,669 completely human rewritten generated answers. In addition, the dataset contains 8,841,823 passages---extracted from 3,563,535 web documents retrieved by Bing---that provide the information necessary for curating the natural language answers. A question in the MS MARCO dataset may have multiple answers or no answers at all. Using this dataset, we propose three dif","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks MS MARCO: A Human Generated MAchine Reading COmprehension Dataset because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:00:06.524980+00:00"}},"summary":{"title":"MS MARCO: A Human Generated MAchine Reading COmprehension Dataset","claims":[{"claim_text":"We introduce a large scale MAchine Reading COmprehension dataset, which we name MS MARCO. The dataset comprises of 1,010,916 anonymized questions---sampled from Bing's search query logs---each with a human generated answer and 182,669 completely human rewritten generated answers. In addition, the dataset contains 8,841,823 passages---extracted from 3,563,535 web documents retrieved by Bing---that provide the information necessary for curating the natural language answers. A question in the MS MARCO dataset may have multiple answers or no answers at all. Using this dataset, we propose three dif","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks MS MARCO: A Human Generated MAchine Reading COmprehension Dataset because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models","work_id":"bab684a8-d933-426c-a19e-2c855a0d1f59","shared_citers":6},{"title":"BEIR: A Heterogenous Benchmark for Zero-shot Evaluation of Information Retrieval Models","work_id":"c5f7f027-ac36-4b07-b824-0eca2f310641","shared_citers":5},{"title":"BERT : Pre-training of deep bidirectional transformers for language understanding","work_id":"3e3c8ac8-b858-4b22-af32-393d98c883e0","shared_citers":5},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":5},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":5},{"title":"Towards General Text Embeddings with Multi-stage Contrastive Learning","work_id":"861a61de-66fe-49d1-b1ab-11f8b082a4cc","shared_citers":5},{"title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","work_id":"ed240a10-5b19-406c-baa5-30803f465785","shared_citers":4},{"title":"Dai, Jakob Uszkoreit, Quoc Le, and Slav Petrov","work_id":"45551929-96dc-40f3-9f89-10e76731cc24","shared_citers":4},{"title":"Dense passage retrieval for open-domain question answering","work_id":"083391f8-812d-430f-8d08-89a03031ce6c","shared_citers":4},{"title":"FEVER: a large-scale dataset for Fact Extraction and VERification","work_id":"b696f75f-e5ad-4555-9c12-e292e77c388f","shared_citers":4},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":4},{"title":"Text Embeddings by Weakly-Supervised Contrastive Pre-training","work_id":"789cc674-467e-4f23-bb50-05c79fe8c4c2","shared_citers":4},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":3},{"title":"arXiv preprint arXiv:2508.21038 , year=","work_id":"60ea8489-5f5f-4e76-8d00-eba854bf33c9","shared_citers":3},{"title":"CoRRabs/2003.07820(2020), https://arxiv.org/ abs/2003.07820","work_id":"083b288a-95a1-4846-959c-e69b87d8885c","shared_citers":3},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":3},{"title":"Diffusion-pretrained dense and contextual embeddings","work_id":"f7af1c9e-b29d-4882-9d1d-54df00af72d7","shared_citers":3},{"title":"doi: 10.18653/v1/ 2024.findings-acl.586","work_id":"8d675bdd-79ca-48d6-9163-fc17ce0e8ece","shared_citers":3},{"title":"doi: 10.18653/v1/D16-1264","work_id":"8e6a63f7-90ad-4b5e-8493-c26145f74b69","shared_citers":3},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":3},{"title":"Joshi, E","work_id":"d05a9c57-9d88-473a-aa65-efb13f9dee25","shared_citers":3},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":3},{"title":"Passage Re-ranking with BERT","work_id":"562fbfab-d6fe-48e1-a06d-e5d078c70945","shared_citers":3},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":3}],"time_series":[{"n":1,"year":2017},{"n":2,"year":2019},{"n":1,"year":2020},{"n":1,"year":2021},{"n":1,"year":2022},{"n":1,"year":2023},{"n":2,"year":2024},{"n":1,"year":2025},{"n":26,"year":2026}],"dependency_candidates":[]},"authors":[]}}