{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:HLF232VGEZJWT5RA77GESCH7GS","short_pith_number":"pith:HLF232VG","canonical_record":{"source":{"id":"2606.28551","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-06-26T19:11:29Z","cross_cats_sorted":["cs.CL","cs.LG"],"title_canon_sha256":"59a6bb6b0e124f4bf4ee62186971ec51a1cdc066348c6a548222a2cd0d5789ea","abstract_canon_sha256":"5f85da6d14ae184609fd9774d0ec30483956962f754d434010552123fce9b018"},"schema_version":"1.0"},"canonical_sha256":"3acbadeaa6265369f620ffcc4908ff34aaee023861203f2867c6caefd47b16ed","source":{"kind":"arxiv","id":"2606.28551","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.28551","created_at":"2026-06-30T00:15:17Z"},{"alias_kind":"arxiv_version","alias_value":"2606.28551v1","created_at":"2026-06-30T00:15:17Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.28551","created_at":"2026-06-30T00:15:17Z"},{"alias_kind":"pith_short_12","alias_value":"HLF232VGEZJW","created_at":"2026-06-30T00:15:17Z"},{"alias_kind":"pith_short_16","alias_value":"HLF232VGEZJWT5RA","created_at":"2026-06-30T00:15:17Z"},{"alias_kind":"pith_short_8","alias_value":"HLF232VG","created_at":"2026-06-30T00:15:17Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:HLF232VGEZJWT5RA77GESCH7GS","target":"record","payload":{"canonical_record":{"source":{"id":"2606.28551","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-06-26T19:11:29Z","cross_cats_sorted":["cs.CL","cs.LG"],"title_canon_sha256":"59a6bb6b0e124f4bf4ee62186971ec51a1cdc066348c6a548222a2cd0d5789ea","abstract_canon_sha256":"5f85da6d14ae184609fd9774d0ec30483956962f754d434010552123fce9b018"},"schema_version":"1.0"},"canonical_sha256":"3acbadeaa6265369f620ffcc4908ff34aaee023861203f2867c6caefd47b16ed","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T00:15:17.829795Z","signature_b64":"Q57k57HnZdX1HgEMbM0V7Iw3EK5Nnt+FTr12XVNl/C1p/Sy9VKoJl0GEKfe7zcStDLkCagJjusIjGO7prHV2AA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"3acbadeaa6265369f620ffcc4908ff34aaee023861203f2867c6caefd47b16ed","last_reissued_at":"2026-06-30T00:15:17.829387Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T00:15:17.829387Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2606.28551","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-30T00:15:17Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"zHY+q5NezHsJ91Kj7EFPmwFNGQ5llelKkKCCT0+niqSxMnBiDNoDF2b41IbLaLPOl4aC3x3++Frwj676s9Z5DQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-30T09:51:32.149107Z"},"content_sha256":"6ffeb6bf8ce75f3f456b546cce90b145cb0433d849eff7df102b1b0478526de2","schema_version":"1.0","event_id":"sha256:6ffeb6bf8ce75f3f456b546cce90b145cb0433d849eff7df102b1b0478526de2"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:HLF232VGEZJWT5RA77GESCH7GS","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"DataComp-VLM: Improved Open Datasets for Vision-Language Models","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.CL","cs.LG"],"primary_cat":"cs.CV","authors_text":"Adhiraj Ghosh, Alessio Tonioni, Ameya Prabhu, Ana Klimovic, Andreas Hochlehnert, Bernt Schiele, Dhruba Ghosh, Elaine Sui, Elisa Ricci, Federico Tombari, Hasan Hammoud, Hilde Kuehne, Jehanzeb Mirza, Jenia Jitsev, Joschka Struber, Karsten Roth, Ludwig Schmidt, Marianna Nezhurina, Massimiliano Mancini, Matteo Farina, Matthias Bethge, Maximilian B\\\"other, Mehdi Cherti, Muhammad Ferjad Naeem, Nikhil Parthasarathy, Sebastian Dziadzio, Sedrick Keh, Selim Kuzucu, Serena Yeung-Levy, Sewoong Oh, Simone Caldarella, Soumya Jahagirdar, Thao Nguyen, Thomas De Min, Vishaal Udandarao, Yuhui Zhang","submitted_at":"2026-06-26T19:11:29Z","abstract_excerpt":"Building performant Vision-Language Models (VLMs) requires carefully curating large-scale training datasets, yet the community lacks systematic benchmarks for evaluating such curation strategies. We introduce DataComp for VLMs (DCVLM), a benchmark for controlled data-centric experiments to improve VLM training. As part of DCVLM, we collect 160 datasets spanning four data types -- image-caption pairs, multimodal interleaved documents, text-only, and instruction-tuning data -- into a corpus of 6T multimodal tokens. DCVLM allows participants to test curation strategies (filtering, mixing, formatt"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.28551","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.28551/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-30T00:15:17Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"akxSNeBsjkkXUgbtABMkbXsf1xWPHsrEsilrEX3cxIfC1JzQ08tyzAdWGFwUw6dheBWqmnYndoa9MgEw9xDKDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-30T09:51:32.149493Z"},"content_sha256":"5faa256bc360977dfb4f497d5bd50b77759edc0c0c568c37847fdbe7b1779d38","schema_version":"1.0","event_id":"sha256:5faa256bc360977dfb4f497d5bd50b77759edc0c0c568c37847fdbe7b1779d38"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/HLF232VGEZJWT5RA77GESCH7GS/bundle.json","state_url":"https://pith.science/pith/HLF232VGEZJWT5RA77GESCH7GS/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/HLF232VGEZJWT5RA77GESCH7GS/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-30T09:51:32Z","links":{"resolver":"https://pith.science/pith/HLF232VGEZJWT5RA77GESCH7GS","bundle":"https://pith.science/pith/HLF232VGEZJWT5RA77GESCH7GS/bundle.json","state":"https://pith.science/pith/HLF232VGEZJWT5RA77GESCH7GS/state.json","well_known_bundle":"https://pith.science/.well-known/pith/HLF232VGEZJWT5RA77GESCH7GS/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:HLF232VGEZJWT5RA77GESCH7GS","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5f85da6d14ae184609fd9774d0ec30483956962f754d434010552123fce9b018","cross_cats_sorted":["cs.CL","cs.LG"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-06-26T19:11:29Z","title_canon_sha256":"59a6bb6b0e124f4bf4ee62186971ec51a1cdc066348c6a548222a2cd0d5789ea"},"schema_version":"1.0","source":{"id":"2606.28551","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.28551","created_at":"2026-06-30T00:15:17Z"},{"alias_kind":"arxiv_version","alias_value":"2606.28551v1","created_at":"2026-06-30T00:15:17Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.28551","created_at":"2026-06-30T00:15:17Z"},{"alias_kind":"pith_short_12","alias_value":"HLF232VGEZJW","created_at":"2026-06-30T00:15:17Z"},{"alias_kind":"pith_short_16","alias_value":"HLF232VGEZJWT5RA","created_at":"2026-06-30T00:15:17Z"},{"alias_kind":"pith_short_8","alias_value":"HLF232VG","created_at":"2026-06-30T00:15:17Z"}],"graph_snapshots":[{"event_id":"sha256:5faa256bc360977dfb4f497d5bd50b77759edc0c0c568c37847fdbe7b1779d38","target":"graph","created_at":"2026-06-30T00:15:17Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.28551/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Building performant Vision-Language Models (VLMs) requires carefully curating large-scale training datasets, yet the community lacks systematic benchmarks for evaluating such curation strategies. We introduce DataComp for VLMs (DCVLM), a benchmark for controlled data-centric experiments to improve VLM training. As part of DCVLM, we collect 160 datasets spanning four data types -- image-caption pairs, multimodal interleaved documents, text-only, and instruction-tuning data -- into a corpus of 6T multimodal tokens. DCVLM allows participants to test curation strategies (filtering, mixing, formatt","authors_text":"Adhiraj Ghosh, Alessio Tonioni, Ameya Prabhu, Ana Klimovic, Andreas Hochlehnert, Bernt Schiele, Dhruba Ghosh, Elaine Sui, Elisa Ricci, Federico Tombari, Hasan Hammoud, Hilde Kuehne, Jehanzeb Mirza, Jenia Jitsev, Joschka Struber, Karsten Roth, Ludwig Schmidt, Marianna Nezhurina, Massimiliano Mancini, Matteo Farina, Matthias Bethge, Maximilian B\\\"other, Mehdi Cherti, Muhammad Ferjad Naeem, Nikhil Parthasarathy, Sebastian Dziadzio, Sedrick Keh, Selim Kuzucu, Serena Yeung-Levy, Sewoong Oh, Simone Caldarella, Soumya Jahagirdar, Thao Nguyen, Thomas De Min, Vishaal Udandarao, Yuhui Zhang","cross_cats":["cs.CL","cs.LG"],"headline":"","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-06-26T19:11:29Z","title":"DataComp-VLM: Improved Open Datasets for Vision-Language Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.28551","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:6ffeb6bf8ce75f3f456b546cce90b145cb0433d849eff7df102b1b0478526de2","target":"record","created_at":"2026-06-30T00:15:17Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5f85da6d14ae184609fd9774d0ec30483956962f754d434010552123fce9b018","cross_cats_sorted":["cs.CL","cs.LG"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-06-26T19:11:29Z","title_canon_sha256":"59a6bb6b0e124f4bf4ee62186971ec51a1cdc066348c6a548222a2cd0d5789ea"},"schema_version":"1.0","source":{"id":"2606.28551","kind":"arxiv","version":1}},"canonical_sha256":"3acbadeaa6265369f620ffcc4908ff34aaee023861203f2867c6caefd47b16ed","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"3acbadeaa6265369f620ffcc4908ff34aaee023861203f2867c6caefd47b16ed","first_computed_at":"2026-06-30T00:15:17.829387Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-30T00:15:17.829387Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"Q57k57HnZdX1HgEMbM0V7Iw3EK5Nnt+FTr12XVNl/C1p/Sy9VKoJl0GEKfe7zcStDLkCagJjusIjGO7prHV2AA==","signature_status":"signed_v1","signed_at":"2026-06-30T00:15:17.829795Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.28551","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:6ffeb6bf8ce75f3f456b546cce90b145cb0433d849eff7df102b1b0478526de2","sha256:5faa256bc360977dfb4f497d5bd50b77759edc0c0c568c37847fdbe7b1779d38"],"state_sha256":"0e49444cea744fbe0e80d31976cbd89af261b351edce097d3994a3c2de3743ac"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"8Zs4kFcnaqxqA3qBZboZZ/bw0ZiI+/qB6t7UKH2j10X8T3kHOAGDFsVA25tpz6c3bFCbcInCs/7UeCt5Oa6fDA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-30T09:51:32.151446Z","bundle_sha256":"0a1bc0b14b7fb8c4df0b730f394579e53482c5a921ac792c6a64edd679671ca1"}}