{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:QT2OC6LCZXLTZ47EOLBAFJSMSP","short_pith_number":"pith:QT2OC6LC","canonical_record":{"source":{"id":"2310.01852","kind":"arxiv","version":7},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-10-03T07:33:27Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"7e6e9944d2ba0e8600d523445e1059096ea4896ecd671d4aa18fb88ebe4f6991","abstract_canon_sha256":"a89d3f1b0310aacfd85d336f1ea4af2f879bd89dae84a6100731d407e7c398cc"},"schema_version":"1.0"},"canonical_sha256":"84f4e17962cdd73cf3e472c202a64c93dc2fd8ff7032b1327a7be4af869fdfc7","source":{"kind":"arxiv","id":"2310.01852","version":7},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2310.01852","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"arxiv_version","alias_value":"2310.01852v7","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2310.01852","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"pith_short_12","alias_value":"QT2OC6LCZXLT","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"QT2OC6LCZXLTZ47E","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"QT2OC6LC","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:QT2OC6LCZXLTZ47EOLBAFJSMSP","target":"record","payload":{"canonical_record":{"source":{"id":"2310.01852","kind":"arxiv","version":7},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-10-03T07:33:27Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"7e6e9944d2ba0e8600d523445e1059096ea4896ecd671d4aa18fb88ebe4f6991","abstract_canon_sha256":"a89d3f1b0310aacfd85d336f1ea4af2f879bd89dae84a6100731d407e7c398cc"},"schema_version":"1.0"},"canonical_sha256":"84f4e17962cdd73cf3e472c202a64c93dc2fd8ff7032b1327a7be4af869fdfc7","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:15.256300Z","signature_b64":"yjwH6st+4N/Po4Wr1RrfXKYTaCWB1bUBg1meR6gPj6OIbipkDYUcOProk607GjQ1XZXu7BR4KvOewjYhzH/kDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"84f4e17962cdd73cf3e472c202a64c93dc2fd8ff7032b1327a7be4af869fdfc7","last_reissued_at":"2026-05-17T23:38:15.255731Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:15.255731Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2310.01852","source_version":7,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:15Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"3A6qcw2BvI0Jy9Z6A15yTZg+xFQXy+/YQkbghZPKD0Eb1YGcYyqxFW14EbGXTeW/M/vrDrcaa6ETa3moGXG2AQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T21:55:02.845961Z"},"content_sha256":"8049511d5f2209b67e58f6533cfb3d8095911fee4de948704db2523a535b49e4","schema_version":"1.0","event_id":"sha256:8049511d5f2209b67e58f6533cfb3d8095911fee4de948704db2523a535b49e4"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:QT2OC6LCZXLTZ47EOLBAFJSMSP","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Language serves as a semantic anchor to align video, audio, depth, and infrared into one shared feature space.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Bin Lin, Bin Zhu, Hongfa Wang, Jiaxi Cui, Junwu Zhang, Li Yuan, Munan Ning, Wancai Zhang, Wei Liu, Wenhao Jiang, Yang Yan, Yatian Pang, ZhiFeng Li, Zongwei Li","submitted_at":"2023-10-03T07:33:27Z","abstract_excerpt":"The video-language (VL) pretraining has achieved remarkable improvement in multiple downstream tasks. However, the current VL pretraining framework is hard to extend to multiple modalities (N modalities, N>=3) beyond vision and language. We thus propose LanguageBind, taking the language as the bind across different modalities because the language modality is well-explored and contains rich semantics. Specifically, we freeze the language encoder acquired by VL pretraining, then train encoders for other modalities with contrastive learning. As a result, all modalities are mapped to a shared feat"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"LanguageBind has achieved superior performance on a wide range of 15 benchmarks covering video, audio, depth, and infrared. Moreover, multiple experiments have provided evidence for the effectiveness of LanguageBind in achieving indirect alignment and complementarity among diverse modalities.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That a language encoder trained only on video-text pairs already contains sufficiently rich semantics to serve as an effective binding anchor for infrared, depth, and audio without direct cross-modal supervision between those modalities.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"LanguageBind aligns video, infrared, depth, and audio to a frozen language encoder via contrastive learning on the new VIDAL-10M dataset, extending video-language pretraining to N modalities.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Language serves as a semantic anchor to align video, audio, depth, and infrared into one shared feature space.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"3291f03733b970be4d19dec471342430ac0561520121797ae1bc1d137f83eee2"},"source":{"id":"2310.01852","kind":"arxiv","version":7},"verdict":{"id":"e202374a-68cb-4ef4-8e7f-a7458b2a1fca","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T03:22:41.992852Z","strongest_claim":"LanguageBind has achieved superior performance on a wide range of 15 benchmarks covering video, audio, depth, and infrared. Moreover, multiple experiments have provided evidence for the effectiveness of LanguageBind in achieving indirect alignment and complementarity among diverse modalities.","one_line_summary":"LanguageBind aligns video, infrared, depth, and audio to a frozen language encoder via contrastive learning on the new VIDAL-10M dataset, extending video-language pretraining to N modalities.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That a language encoder trained only on video-text pairs already contains sufficiently rich semantics to serve as an effective binding anchor for infrared, depth, and audio without direct cross-modal supervision between those modalities.","pith_extraction_headline":"Language serves as a semantic anchor to align video, audio, depth, and infrared into one shared feature space."},"references":{"count":202,"sample":[{"doi":"","year":2017,"title":"Localizing moments in video with natural language","work_id":"60648aa4-9c56-4c4e-965b-63ce097f94a6","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2018,"title":"Convolutional neural networks for static and dynamic breast infrared imaging classification","work_id":"7aa7caf9-15c9-48f1-84eb-0a01dfc3274b","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2014,"title":"Interactive intrinsic video editing","work_id":"1091ffc8-2e17-4d60-ba40-1cfec22d8ae8","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2015,"title":"Activitynet: A large-scale video benchmark for human activity understanding","work_id":"da056c16-524b-48ee-8932-184520fa61cc","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2017,"title":"Estimating depth from monocular images as classification using deep fully convolutional residual networks","work_id":"133f4176-a408-468c-b4ee-d39cccb97f9a","ref_index":6,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":202,"snapshot_sha256":"6d8055846ff12dd16c3472988b600c5720281ba362e926f713461c60240adc2b","internal_anchors":13},"formal_canon":{"evidence_count":2,"snapshot_sha256":"319357faa4f43578d3c7d4828459bdafe42a36857d37b028fdb784975e3da570"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"e202374a-68cb-4ef4-8e7f-a7458b2a1fca"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:15Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"GMXMS6LnCZPV15dmYiGziqkux2EzrOu4sSv9q5INkqVfRzzjJo89bTjZquwdjI7yywrBJCUVhykmaHsYJU7uAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T21:55:02.846870Z"},"content_sha256":"14dd9b44b33a3065bc252eee0fed022d25933913ab42777e1b6a8b591f4e5f04","schema_version":"1.0","event_id":"sha256:14dd9b44b33a3065bc252eee0fed022d25933913ab42777e1b6a8b591f4e5f04"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/QT2OC6LCZXLTZ47EOLBAFJSMSP/bundle.json","state_url":"https://pith.science/pith/QT2OC6LCZXLTZ47EOLBAFJSMSP/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/QT2OC6LCZXLTZ47EOLBAFJSMSP/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-18T21:55:02Z","links":{"resolver":"https://pith.science/pith/QT2OC6LCZXLTZ47EOLBAFJSMSP","bundle":"https://pith.science/pith/QT2OC6LCZXLTZ47EOLBAFJSMSP/bundle.json","state":"https://pith.science/pith/QT2OC6LCZXLTZ47EOLBAFJSMSP/state.json","well_known_bundle":"https://pith.science/.well-known/pith/QT2OC6LCZXLTZ47EOLBAFJSMSP/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:QT2OC6LCZXLTZ47EOLBAFJSMSP","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"a89d3f1b0310aacfd85d336f1ea4af2f879bd89dae84a6100731d407e7c398cc","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-10-03T07:33:27Z","title_canon_sha256":"7e6e9944d2ba0e8600d523445e1059096ea4896ecd671d4aa18fb88ebe4f6991"},"schema_version":"1.0","source":{"id":"2310.01852","kind":"arxiv","version":7}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2310.01852","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"arxiv_version","alias_value":"2310.01852v7","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2310.01852","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"pith_short_12","alias_value":"QT2OC6LCZXLT","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"QT2OC6LCZXLTZ47E","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"QT2OC6LC","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:14dd9b44b33a3065bc252eee0fed022d25933913ab42777e1b6a8b591f4e5f04","target":"graph","created_at":"2026-05-17T23:38:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"LanguageBind has achieved superior performance on a wide range of 15 benchmarks covering video, audio, depth, and infrared. Moreover, multiple experiments have provided evidence for the effectiveness of LanguageBind in achieving indirect alignment and complementarity among diverse modalities."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That a language encoder trained only on video-text pairs already contains sufficiently rich semantics to serve as an effective binding anchor for infrared, depth, and audio without direct cross-modal supervision between those modalities."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"LanguageBind aligns video, infrared, depth, and audio to a frozen language encoder via contrastive learning on the new VIDAL-10M dataset, extending video-language pretraining to N modalities."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Language serves as a semantic anchor to align video, audio, depth, and infrared into one shared feature space."}],"snapshot_sha256":"3291f03733b970be4d19dec471342430ac0561520121797ae1bc1d137f83eee2"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"319357faa4f43578d3c7d4828459bdafe42a36857d37b028fdb784975e3da570"},"paper":{"abstract_excerpt":"The video-language (VL) pretraining has achieved remarkable improvement in multiple downstream tasks. However, the current VL pretraining framework is hard to extend to multiple modalities (N modalities, N>=3) beyond vision and language. We thus propose LanguageBind, taking the language as the bind across different modalities because the language modality is well-explored and contains rich semantics. Specifically, we freeze the language encoder acquired by VL pretraining, then train encoders for other modalities with contrastive learning. As a result, all modalities are mapped to a shared feat","authors_text":"Bin Lin, Bin Zhu, Hongfa Wang, Jiaxi Cui, Junwu Zhang, Li Yuan, Munan Ning, Wancai Zhang, Wei Liu, Wenhao Jiang, Yang Yan, Yatian Pang, ZhiFeng Li, Zongwei Li","cross_cats":["cs.AI"],"headline":"Language serves as a semantic anchor to align video, audio, depth, and infrared into one shared feature space.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-10-03T07:33:27Z","title":"LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment"},"references":{"count":202,"internal_anchors":13,"resolved_work":202,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Localizing moments in video with natural language","work_id":"60648aa4-9c56-4c4e-965b-63ce097f94a6","year":2017},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Convolutional neural networks for static and dynamic breast infrared imaging classification","work_id":"7aa7caf9-15c9-48f1-84eb-0a01dfc3274b","year":2018},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Interactive intrinsic video editing","work_id":"1091ffc8-2e17-4d60-ba40-1cfec22d8ae8","year":2014},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Activitynet: A large-scale video benchmark for human activity understanding","work_id":"da056c16-524b-48ee-8932-184520fa61cc","year":2015},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":6,"title":"Estimating depth from monocular images as classification using deep fully convolutional residual networks","work_id":"133f4176-a408-468c-b4ee-d39cccb97f9a","year":2017}],"snapshot_sha256":"6d8055846ff12dd16c3472988b600c5720281ba362e926f713461c60240adc2b"},"source":{"id":"2310.01852","kind":"arxiv","version":7},"verdict":{"created_at":"2026-05-17T03:22:41.992852Z","id":"e202374a-68cb-4ef4-8e7f-a7458b2a1fca","model_set":{"reader":"grok-4.3"},"one_line_summary":"LanguageBind aligns video, infrared, depth, and audio to a frozen language encoder via contrastive learning on the new VIDAL-10M dataset, extending video-language pretraining to N modalities.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Language serves as a semantic anchor to align video, audio, depth, and infrared into one shared feature space.","strongest_claim":"LanguageBind has achieved superior performance on a wide range of 15 benchmarks covering video, audio, depth, and infrared. Moreover, multiple experiments have provided evidence for the effectiveness of LanguageBind in achieving indirect alignment and complementarity among diverse modalities.","weakest_assumption":"That a language encoder trained only on video-text pairs already contains sufficiently rich semantics to serve as an effective binding anchor for infrared, depth, and audio without direct cross-modal supervision between those modalities."}},"verdict_id":"e202374a-68cb-4ef4-8e7f-a7458b2a1fca"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:8049511d5f2209b67e58f6533cfb3d8095911fee4de948704db2523a535b49e4","target":"record","created_at":"2026-05-17T23:38:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"a89d3f1b0310aacfd85d336f1ea4af2f879bd89dae84a6100731d407e7c398cc","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-10-03T07:33:27Z","title_canon_sha256":"7e6e9944d2ba0e8600d523445e1059096ea4896ecd671d4aa18fb88ebe4f6991"},"schema_version":"1.0","source":{"id":"2310.01852","kind":"arxiv","version":7}},"canonical_sha256":"84f4e17962cdd73cf3e472c202a64c93dc2fd8ff7032b1327a7be4af869fdfc7","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"84f4e17962cdd73cf3e472c202a64c93dc2fd8ff7032b1327a7be4af869fdfc7","first_computed_at":"2026-05-17T23:38:15.255731Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:15.255731Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"yjwH6st+4N/Po4Wr1RrfXKYTaCWB1bUBg1meR6gPj6OIbipkDYUcOProk607GjQ1XZXu7BR4KvOewjYhzH/kDA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:15.256300Z","signed_message":"canonical_sha256_bytes"},"source_id":"2310.01852","source_kind":"arxiv","source_version":7}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:8049511d5f2209b67e58f6533cfb3d8095911fee4de948704db2523a535b49e4","sha256:14dd9b44b33a3065bc252eee0fed022d25933913ab42777e1b6a8b591f4e5f04"],"state_sha256":"f35bf84d46a002c156d08f6522096399fa60ca77c464156be041297d7e17fe7b"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"QwQjba62OTYxr1mh89YG74oFDmHZtdUM4zSEIsCz2iNVlifQmR168n81eOihjXZ7Qir3Dxk44oQxvMU7YbhCBA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-18T21:55:02.849559Z","bundle_sha256":"f451ecc7c6b470cc5f567ebfafafb863b3b2db58da66de37dd63234d85965f4b"}}