{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:VIEYAONDM5LUTKLMMNKATGG3N7","short_pith_number":"pith:VIEYAOND","canonical_record":{"source":{"id":"2404.12390","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-18T17:59:54Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"4d8fd9e1fea6457fae3bc1f04cdd373d055d3fb0b8cdf6f80054724814cfc882","abstract_canon_sha256":"dd25bcb3e35202474023a787b0b9d122840766b9a54178a832f88e9f180d9e66"},"schema_version":"1.0"},"canonical_sha256":"aa098039a3675749a96c63540998db6fc6907ba0875170782140cef6079be0de","source":{"kind":"arxiv","id":"2404.12390","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2404.12390","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2404.12390v4","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2404.12390","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"VIEYAONDM5LU","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"VIEYAONDM5LUTKLM","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"VIEYAOND","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:VIEYAONDM5LUTKLMMNKATGG3N7","target":"record","payload":{"canonical_record":{"source":{"id":"2404.12390","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-18T17:59:54Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"4d8fd9e1fea6457fae3bc1f04cdd373d055d3fb0b8cdf6f80054724814cfc882","abstract_canon_sha256":"dd25bcb3e35202474023a787b0b9d122840766b9a54178a832f88e9f180d9e66"},"schema_version":"1.0"},"canonical_sha256":"aa098039a3675749a96c63540998db6fc6907ba0875170782140cef6079be0de","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:50.298491Z","signature_b64":"eqzLJzOtimHKDjaODKdcnBkdv5u3lrlkcWSoKFU0UHn202N2RdZrXAWToiUAjJ/k3S7augmwop5OGA7QIVtPAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"aa098039a3675749a96c63540998db6fc6907ba0875170782140cef6079be0de","last_reissued_at":"2026-05-17T23:38:50.297986Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:50.297986Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2404.12390","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cAt9NlR4ch++gI6ZeXFbA1nOxLuoXmRv4BSD5PM6YTWeWu+KbmH2oNnnsl5Z7fuPVG6kd0kJvXciCs/YDDpKBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T07:09:59.614798Z"},"content_sha256":"908f1f39c87ebe1984ec2d4afe1ad99bc633697fbeb35bc55211c6c59130db03","schema_version":"1.0","event_id":"sha256:908f1f39c87ebe1984ec2d4afe1ad99bc633697fbeb35bc55211c6c59130db03"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:VIEYAONDM5LUTKLMMNKATGG3N7","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"BLINK: Multimodal Large Language Models Can See but Not Perceive","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"Multimodal LLMs like GPT-4V reach only 51% accuracy on visual perception tasks that humans solve at 96%.","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.CV","authors_text":"Bangzheng Li, Dan Roth, Haoyu Wang, Noah A. Smith, Ranjay Krishna, Wei-Chiu Ma, Xingyu Fu, Xudong Lin, Yu Feng, Yushi Hu","submitted_at":"2024-04-18T17:59:54Z","abstract_excerpt":"We introduce Blink, a new benchmark for multimodal language models (LLMs) that focuses on core visual perception abilities not found in other evaluations. Most of the Blink tasks can be solved by humans \"within a blink\" (e.g., relative depth estimation, visual correspondence, forensics detection, and multi-view reasoning). However, we find these perception-demanding tasks cast significant challenges for current multimodal LLMs because they resist mediation through natural language. Blink reformats 14 classic computer vision tasks into 3,807 multiple-choice questions, paired with single or mult"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"even the best-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only 13.17% and 7.63% higher than random guessing, indicating that such perception abilities have not emerged yet in recent multimodal LLMs","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the selected tasks genuinely require visual perception that cannot be solved through language patterns or statistical shortcuts in the training data.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"BLINK benchmark shows multimodal LLMs reach only 45-51 percent accuracy on core visual perception tasks where humans achieve 95 percent, indicating these abilities have not emerged.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Multimodal LLMs like GPT-4V reach only 51% accuracy on visual perception tasks that humans solve at 96%.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"2917149b3d2dde9ce6f177db5480221f03644168e304eef6bdde5e78bf6798a4"},"source":{"id":"2404.12390","kind":"arxiv","version":4},"verdict":{"id":"8e751e7c-b924-40d4-95b3-a023e85b974f","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T20:14:26.616449Z","strongest_claim":"even the best-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only 13.17% and 7.63% higher than random guessing, indicating that such perception abilities have not emerged yet in recent multimodal LLMs","one_line_summary":"BLINK benchmark shows multimodal LLMs reach only 45-51 percent accuracy on core visual perception tasks where humans achieve 95 percent, indicating these abilities have not emerged.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the selected tasks genuinely require visual perception that cannot be solved through language patterns or statistical shortcuts in the training data.","pith_extraction_headline":"Multimodal LLMs like GPT-4V reach only 51% accuracy on visual perception tasks that humans solve at 96%."},"references":{"count":90,"sample":[{"doi":"","year":2024,"title":"Introducing the next generation of claude.https://www.anthropic.com/news/ claude-3-family (March 2024) 11, 12, 23, 24","work_id":"f8bea833-ebe2-4366-aa34-0ae3433f6dc7","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2019,"title":"In: AAAI (2019) 10","work_id":"365c08d1-df9d-4bc5-859b-42eb662a253d","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Advances in Neural Information Processing Systems35, 23716–23736 (2022) 2, 4, 22","work_id":"88ba30a1-83ea-4142-9669-25cd7d6dffa5","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2015,"title":"In: Proceedings of the IEEE international conference on computer vision","work_id":"a8727b08-59c4-43f9-b90a-a33fa63ffb0b","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive Vision-Language Models","work_id":"87bfa84a-e663-4165-806f-93ef439d88d0","ref_index":5,"cited_arxiv_id":"2308.01390","is_internal_anchor":true}],"resolved_work":90,"snapshot_sha256":"0ff96068e0e13eefed46d8bbcc5691567336e60abbd40ed9842bf8fa961171ee","internal_anchors":20},"formal_canon":{"evidence_count":2,"snapshot_sha256":"b1aea25256854ddbeeb2d9df367d438391cb3695d8baf28238790b470a4e8fb0"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"8e751e7c-b924-40d4-95b3-a023e85b974f"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"0lhWSlzMB3XEzj1C0PksyuiF2ojirz4qc4x+5NaU+ZFhL3gdZdIFkL/mrICtbqXKSnlXW4RIyiSkg6JRz3LOAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T07:09:59.615977Z"},"content_sha256":"ee10ffa15956469f2017986423b6e15f069218bce13116f2a602ff2a09786430","schema_version":"1.0","event_id":"sha256:ee10ffa15956469f2017986423b6e15f069218bce13116f2a602ff2a09786430"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/VIEYAONDM5LUTKLMMNKATGG3N7/bundle.json","state_url":"https://pith.science/pith/VIEYAONDM5LUTKLMMNKATGG3N7/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/VIEYAONDM5LUTKLMMNKATGG3N7/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-24T07:09:59Z","links":{"resolver":"https://pith.science/pith/VIEYAONDM5LUTKLMMNKATGG3N7","bundle":"https://pith.science/pith/VIEYAONDM5LUTKLMMNKATGG3N7/bundle.json","state":"https://pith.science/pith/VIEYAONDM5LUTKLMMNKATGG3N7/state.json","well_known_bundle":"https://pith.science/.well-known/pith/VIEYAONDM5LUTKLMMNKATGG3N7/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:VIEYAONDM5LUTKLMMNKATGG3N7","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"dd25bcb3e35202474023a787b0b9d122840766b9a54178a832f88e9f180d9e66","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-18T17:59:54Z","title_canon_sha256":"4d8fd9e1fea6457fae3bc1f04cdd373d055d3fb0b8cdf6f80054724814cfc882"},"schema_version":"1.0","source":{"id":"2404.12390","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2404.12390","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2404.12390v4","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2404.12390","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"VIEYAONDM5LU","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"VIEYAONDM5LUTKLM","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"VIEYAOND","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:ee10ffa15956469f2017986423b6e15f069218bce13116f2a602ff2a09786430","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"even the best-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only 13.17% and 7.63% higher than random guessing, indicating that such perception abilities have not emerged yet in recent multimodal LLMs"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the selected tasks genuinely require visual perception that cannot be solved through language patterns or statistical shortcuts in the training data."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"BLINK benchmark shows multimodal LLMs reach only 45-51 percent accuracy on core visual perception tasks where humans achieve 95 percent, indicating these abilities have not emerged."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Multimodal LLMs like GPT-4V reach only 51% accuracy on visual perception tasks that humans solve at 96%."}],"snapshot_sha256":"2917149b3d2dde9ce6f177db5480221f03644168e304eef6bdde5e78bf6798a4"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"b1aea25256854ddbeeb2d9df367d438391cb3695d8baf28238790b470a4e8fb0"},"paper":{"abstract_excerpt":"We introduce Blink, a new benchmark for multimodal language models (LLMs) that focuses on core visual perception abilities not found in other evaluations. Most of the Blink tasks can be solved by humans \"within a blink\" (e.g., relative depth estimation, visual correspondence, forensics detection, and multi-view reasoning). However, we find these perception-demanding tasks cast significant challenges for current multimodal LLMs because they resist mediation through natural language. Blink reformats 14 classic computer vision tasks into 3,807 multiple-choice questions, paired with single or mult","authors_text":"Bangzheng Li, Dan Roth, Haoyu Wang, Noah A. Smith, Ranjay Krishna, Wei-Chiu Ma, Xingyu Fu, Xudong Lin, Yu Feng, Yushi Hu","cross_cats":["cs.AI","cs.CL"],"headline":"Multimodal LLMs like GPT-4V reach only 51% accuracy on visual perception tasks that humans solve at 96%.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-18T17:59:54Z","title":"BLINK: Multimodal Large Language Models Can See but Not Perceive"},"references":{"count":90,"internal_anchors":20,"resolved_work":90,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Introducing the next generation of claude.https://www.anthropic.com/news/ claude-3-family (March 2024) 11, 12, 23, 24","work_id":"f8bea833-ebe2-4366-aa34-0ae3433f6dc7","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"In: AAAI (2019) 10","work_id":"365c08d1-df9d-4bc5-859b-42eb662a253d","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Advances in Neural Information Processing Systems35, 23716–23736 (2022) 2, 4, 22","work_id":"88ba30a1-83ea-4142-9669-25cd7d6dffa5","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"In: Proceedings of the IEEE international conference on computer vision","work_id":"a8727b08-59c4-43f9-b90a-a33fa63ffb0b","year":2015},{"cited_arxiv_id":"2308.01390","doi":"","is_internal_anchor":true,"ref_index":5,"title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive Vision-Language Models","work_id":"87bfa84a-e663-4165-806f-93ef439d88d0","year":2023}],"snapshot_sha256":"0ff96068e0e13eefed46d8bbcc5691567336e60abbd40ed9842bf8fa961171ee"},"source":{"id":"2404.12390","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-15T20:14:26.616449Z","id":"8e751e7c-b924-40d4-95b3-a023e85b974f","model_set":{"reader":"grok-4.3"},"one_line_summary":"BLINK benchmark shows multimodal LLMs reach only 45-51 percent accuracy on core visual perception tasks where humans achieve 95 percent, indicating these abilities have not emerged.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Multimodal LLMs like GPT-4V reach only 51% accuracy on visual perception tasks that humans solve at 96%.","strongest_claim":"even the best-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only 13.17% and 7.63% higher than random guessing, indicating that such perception abilities have not emerged yet in recent multimodal LLMs","weakest_assumption":"That the selected tasks genuinely require visual perception that cannot be solved through language patterns or statistical shortcuts in the training data."}},"verdict_id":"8e751e7c-b924-40d4-95b3-a023e85b974f"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:908f1f39c87ebe1984ec2d4afe1ad99bc633697fbeb35bc55211c6c59130db03","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"dd25bcb3e35202474023a787b0b9d122840766b9a54178a832f88e9f180d9e66","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-18T17:59:54Z","title_canon_sha256":"4d8fd9e1fea6457fae3bc1f04cdd373d055d3fb0b8cdf6f80054724814cfc882"},"schema_version":"1.0","source":{"id":"2404.12390","kind":"arxiv","version":4}},"canonical_sha256":"aa098039a3675749a96c63540998db6fc6907ba0875170782140cef6079be0de","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"aa098039a3675749a96c63540998db6fc6907ba0875170782140cef6079be0de","first_computed_at":"2026-05-17T23:38:50.297986Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.297986Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"eqzLJzOtimHKDjaODKdcnBkdv5u3lrlkcWSoKFU0UHn202N2RdZrXAWToiUAjJ/k3S7augmwop5OGA7QIVtPAQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.298491Z","signed_message":"canonical_sha256_bytes"},"source_id":"2404.12390","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:908f1f39c87ebe1984ec2d4afe1ad99bc633697fbeb35bc55211c6c59130db03","sha256:ee10ffa15956469f2017986423b6e15f069218bce13116f2a602ff2a09786430"],"state_sha256":"f39cf396f4c9cb390d935b454208372a72d8cb0c26258e78f57eff68281f07f0"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"GKuylKE8GWAMA2PtfAwA3PJAiqKjWY4i7QXyxVmlhgNsYo9iYuIi9Izi8uTLkYEZqbx6vpYrjA4W2ss9co8ICA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-24T07:09:59.620673Z","bundle_sha256":"284c35c78361e3e319b7c8a77a4c229927b530f7bd7e75ab5ad84b69d9aa6815"}}