{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:7L2LCIDIL22PSJFQG4OCMYKWBU","short_pith_number":"pith:7L2LCIDI","canonical_record":{"source":{"id":"2307.06942","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-13T17:58:32Z","cross_cats_sorted":[],"title_canon_sha256":"ed56d583d0a3dc7471844fffe8f1ec2c462996e58f23d6c646e929ebea61ff5e","abstract_canon_sha256":"66fa1d6696b6cee9acb169649d8eae60a2ccf732088357b099f5377e1f284a88"},"schema_version":"1.0"},"canonical_sha256":"faf4b120685eb4f924b0371c2661560d14b994c3b8f88f9e5423bea254dd3710","source":{"kind":"arxiv","id":"2307.06942","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2307.06942","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2307.06942v2","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2307.06942","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"7L2LCIDIL22P","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"7L2LCIDIL22PSJFQ","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"7L2LCIDI","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:7L2LCIDIL22PSJFQG4OCMYKWBU","target":"record","payload":{"canonical_record":{"source":{"id":"2307.06942","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-13T17:58:32Z","cross_cats_sorted":[],"title_canon_sha256":"ed56d583d0a3dc7471844fffe8f1ec2c462996e58f23d6c646e929ebea61ff5e","abstract_canon_sha256":"66fa1d6696b6cee9acb169649d8eae60a2ccf732088357b099f5377e1f284a88"},"schema_version":"1.0"},"canonical_sha256":"faf4b120685eb4f924b0371c2661560d14b994c3b8f88f9e5423bea254dd3710","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.259998Z","signature_b64":"ATooeu/8BbPC/QpN1+J2HXZ7GzHc2yopAGc/l569k14FCex/NFhC1S9UD9o0SbcqGBkRDIyXfljABWvO5PYoBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"faf4b120685eb4f924b0371c2661560d14b994c3b8f88f9e5423bea254dd3710","last_reissued_at":"2026-05-17T23:38:53.259393Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.259393Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2307.06942","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"DMRLZ5/l71LuxAFCfeSFItWRMOdEMoH+i5AMwhPc+VYtUdSUa/UV7PoT6FfIJJpiORhP3/000KUu8CuXB6KQBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T02:04:01.854395Z"},"content_sha256":"a8d77be41af20339cbb1f3a2865cf8650e90c0c27acdb62f82040fd54eeb2584","schema_version":"1.0","event_id":"sha256:a8d77be41af20339cbb1f3a2865cf8650e90c0c27acdb62f82040fd54eeb2584"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:7L2LCIDIL22PSJFQG4OCMYKWBU","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"InternVid: A Large-scale Video-Text Dataset for Multimodal Understanding and Generation","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"A scalable LLM-based method creates a 7 million video dataset that trains models with leading zero-shot action recognition.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Conghui He, Guo Chen, Jiashuo Yu, Kunchang Li, Limin Wang, Ping Luo, Xinhao Li, Xin Ma, Xinyuan Chen, Yali Wang, Yaohui Wang, Yinan He, Yi Wang, Yizhuo Li, Yu Qiao, Ziwei Liu","submitted_at":"2023-07-13T17:58:32Z","abstract_excerpt":"This paper introduces InternVid, a large-scale video-centric multimodal dataset that enables learning powerful and transferable video-text representations for multimodal understanding and generation. The InternVid dataset contains over 7 million videos lasting nearly 760K hours, yielding 234M video clips accompanied by detailed descriptions of total 4.1B words. Our core contribution is to develop a scalable approach to autonomously build a high-quality video-text dataset with large language models (LLM), thereby showcasing its efficacy in learning video-language representation at scale. Specif"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Learned on InternVid via contrastive learning, this model demonstrates leading zero-shot action recognition and competitive video retrieval performance.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The multi-scale LLM-generated descriptions are sufficiently accurate and diverse to produce transferable video-text representations without introducing systematic biases or hallucinations that degrade downstream performance.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"InternVid supplies 7M videos and LLM captions to train ViCLIP, which reaches leading zero-shot action recognition and competitive retrieval performance.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A scalable LLM-based method creates a 7 million video dataset that trains models with leading zero-shot action recognition.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"8b2cd0744bf0778d07ceacb7d7a7aa71e00906bb41fbb6ecccde7bc4a5c493f8"},"source":{"id":"2307.06942","kind":"arxiv","version":2},"verdict":{"id":"d1526bfd-4cd9-4a2d-b042-3a4fe0929739","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T06:25:54.338059Z","strongest_claim":"Learned on InternVid via contrastive learning, this model demonstrates leading zero-shot action recognition and competitive video retrieval performance.","one_line_summary":"InternVid supplies 7M videos and LLM captions to train ViCLIP, which reaches leading zero-shot action recognition and competitive retrieval performance.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The multi-scale LLM-generated descriptions are sufficiently accurate and diverse to produce transferable video-text representations without introducing systematic biases or hallucinations that degrade downstream performance.","pith_extraction_headline":"A scalable LLM-based method creates a 7 million video dataset that trains models with leading zero-shot action recognition."},"references":{"count":82,"sample":[{"doi":"","year":1901,"title":"Language models are few-shot learners","work_id":"b50c9b32-76fe-43d5-b25e-cb27d397e9fd","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2019,"title":"Howto100m: Learning a text-video embedding by watching hundred million narrated video clips","work_id":"4dbd9fea-5e22-428e-af10-fffaa570fb86","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Advancing high-resolution video-language representation with large-scale video transcriptions","work_id":"555e923f-999c-40c6-858a-04eb595d89e2","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Merlot: Multimodal neural script knowledge models","work_id":"cb6de4ab-804d-4ccc-af6b-0ccd5fcd09c9","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Merlot reserve: Neural script knowledge through vision and language and sound","work_id":"6b00c0e3-0506-4e5c-a0fb-2a1e20c58aef","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":82,"snapshot_sha256":"a88efe24071128222f0819d789630685a1cab4688d2d5cc5735df371f8f7842e","internal_anchors":13},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f7f29e802f0c2d67d186bd138e08c6024a5b4da3f85136781187be6252d36b1b"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"d1526bfd-4cd9-4a2d-b042-3a4fe0929739"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"m8Jx0Jz/olTfNU9KBljiyB9dooqkpdX79zzwVh05EpFcAZeH44yxym9lX9+Jao3o0W/qS83JVXRzD9ZNZMvOAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T02:04:01.855536Z"},"content_sha256":"03619f6e29c0eef38b6eb2bd2c700549b1c9f3bd253bef133b4c3a0a3608f293","schema_version":"1.0","event_id":"sha256:03619f6e29c0eef38b6eb2bd2c700549b1c9f3bd253bef133b4c3a0a3608f293"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU/bundle.json","state_url":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/7L2LCIDIL22PSJFQG4OCMYKWBU/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-01T02:04:01Z","links":{"resolver":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU","bundle":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU/bundle.json","state":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU/state.json","well_known_bundle":"https://pith.science/.well-known/pith/7L2LCIDIL22PSJFQG4OCMYKWBU/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:7L2LCIDIL22PSJFQG4OCMYKWBU","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"66fa1d6696b6cee9acb169649d8eae60a2ccf732088357b099f5377e1f284a88","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-13T17:58:32Z","title_canon_sha256":"ed56d583d0a3dc7471844fffe8f1ec2c462996e58f23d6c646e929ebea61ff5e"},"schema_version":"1.0","source":{"id":"2307.06942","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2307.06942","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2307.06942v2","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2307.06942","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"7L2LCIDIL22P","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"7L2LCIDIL22PSJFQ","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"7L2LCIDI","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:03619f6e29c0eef38b6eb2bd2c700549b1c9f3bd253bef133b4c3a0a3608f293","target":"graph","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Learned on InternVid via contrastive learning, this model demonstrates leading zero-shot action recognition and competitive video retrieval performance."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The multi-scale LLM-generated descriptions are sufficiently accurate and diverse to produce transferable video-text representations without introducing systematic biases or hallucinations that degrade downstream performance."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"InternVid supplies 7M videos and LLM captions to train ViCLIP, which reaches leading zero-shot action recognition and competitive retrieval performance."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A scalable LLM-based method creates a 7 million video dataset that trains models with leading zero-shot action recognition."}],"snapshot_sha256":"8b2cd0744bf0778d07ceacb7d7a7aa71e00906bb41fbb6ecccde7bc4a5c493f8"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f7f29e802f0c2d67d186bd138e08c6024a5b4da3f85136781187be6252d36b1b"},"paper":{"abstract_excerpt":"This paper introduces InternVid, a large-scale video-centric multimodal dataset that enables learning powerful and transferable video-text representations for multimodal understanding and generation. The InternVid dataset contains over 7 million videos lasting nearly 760K hours, yielding 234M video clips accompanied by detailed descriptions of total 4.1B words. Our core contribution is to develop a scalable approach to autonomously build a high-quality video-text dataset with large language models (LLM), thereby showcasing its efficacy in learning video-language representation at scale. Specif","authors_text":"Conghui He, Guo Chen, Jiashuo Yu, Kunchang Li, Limin Wang, Ping Luo, Xinhao Li, Xin Ma, Xinyuan Chen, Yali Wang, Yaohui Wang, Yinan He, Yi Wang, Yizhuo Li, Yu Qiao, Ziwei Liu","cross_cats":[],"headline":"A scalable LLM-based method creates a 7 million video dataset that trains models with leading zero-shot action recognition.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-13T17:58:32Z","title":"InternVid: A Large-scale Video-Text Dataset for Multimodal Understanding and Generation"},"references":{"count":82,"internal_anchors":13,"resolved_work":82,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Language models are few-shot learners","work_id":"b50c9b32-76fe-43d5-b25e-cb27d397e9fd","year":1901},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Howto100m: Learning a text-video embedding by watching hundred million narrated video clips","work_id":"4dbd9fea-5e22-428e-af10-fffaa570fb86","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Advancing high-resolution video-language representation with large-scale video transcriptions","work_id":"555e923f-999c-40c6-858a-04eb595d89e2","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Merlot: Multimodal neural script knowledge models","work_id":"cb6de4ab-804d-4ccc-af6b-0ccd5fcd09c9","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Merlot reserve: Neural script knowledge through vision and language and sound","work_id":"6b00c0e3-0506-4e5c-a0fb-2a1e20c58aef","year":2022}],"snapshot_sha256":"a88efe24071128222f0819d789630685a1cab4688d2d5cc5735df371f8f7842e"},"source":{"id":"2307.06942","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T06:25:54.338059Z","id":"d1526bfd-4cd9-4a2d-b042-3a4fe0929739","model_set":{"reader":"grok-4.3"},"one_line_summary":"InternVid supplies 7M videos and LLM captions to train ViCLIP, which reaches leading zero-shot action recognition and competitive retrieval performance.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A scalable LLM-based method creates a 7 million video dataset that trains models with leading zero-shot action recognition.","strongest_claim":"Learned on InternVid via contrastive learning, this model demonstrates leading zero-shot action recognition and competitive video retrieval performance.","weakest_assumption":"The multi-scale LLM-generated descriptions are sufficiently accurate and diverse to produce transferable video-text representations without introducing systematic biases or hallucinations that degrade downstream performance."}},"verdict_id":"d1526bfd-4cd9-4a2d-b042-3a4fe0929739"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a8d77be41af20339cbb1f3a2865cf8650e90c0c27acdb62f82040fd54eeb2584","target":"record","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"66fa1d6696b6cee9acb169649d8eae60a2ccf732088357b099f5377e1f284a88","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-13T17:58:32Z","title_canon_sha256":"ed56d583d0a3dc7471844fffe8f1ec2c462996e58f23d6c646e929ebea61ff5e"},"schema_version":"1.0","source":{"id":"2307.06942","kind":"arxiv","version":2}},"canonical_sha256":"faf4b120685eb4f924b0371c2661560d14b994c3b8f88f9e5423bea254dd3710","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"faf4b120685eb4f924b0371c2661560d14b994c3b8f88f9e5423bea254dd3710","first_computed_at":"2026-05-17T23:38:53.259393Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:53.259393Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"ATooeu/8BbPC/QpN1+J2HXZ7GzHc2yopAGc/l569k14FCex/NFhC1S9UD9o0SbcqGBkRDIyXfljABWvO5PYoBQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:53.259998Z","signed_message":"canonical_sha256_bytes"},"source_id":"2307.06942","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a8d77be41af20339cbb1f3a2865cf8650e90c0c27acdb62f82040fd54eeb2584","sha256:03619f6e29c0eef38b6eb2bd2c700549b1c9f3bd253bef133b4c3a0a3608f293"],"state_sha256":"d4dbea28b53ce724b78741d560d31fb575e7541027694bc9901d1f71a92bc41d"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"1WxieTjjVPwJRaXyOVbCEQOLhBmcV07emaCNO6S0DbMHNorSj/x9Kh6OyxRAr8EGVqTuWRPa4iUhImVs6PC8Bw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-01T02:04:01.860851Z","bundle_sha256":"242e94f7ec9d4c6c2fd697acf302546016bff757c38764d6bfd5d5dff5fd38a3"}}