{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:VBOLZ6LWIHUSFUUQ7VAST4C4HO","short_pith_number":"pith:VBOLZ6LW","canonical_record":{"source":{"id":"2607.01667","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-07-02T03:47:45Z","cross_cats_sorted":[],"title_canon_sha256":"cc21ae0aafc7e948bdf0d59d58d4671a3003a9bf2b2b60757e8e73f5f8982a4e","abstract_canon_sha256":"460c2823235c4ba5d3da36c63230908b72eeacff019b867ab230d01c7c7295e9"},"schema_version":"1.0"},"canonical_sha256":"a85cbcf97641e922d290fd4129f05c3b8659a155f1825de51c232973c0ee6713","source":{"kind":"arxiv","id":"2607.01667","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2607.01667","created_at":"2026-07-03T01:17:26Z"},{"alias_kind":"arxiv_version","alias_value":"2607.01667v1","created_at":"2026-07-03T01:17:26Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2607.01667","created_at":"2026-07-03T01:17:26Z"},{"alias_kind":"pith_short_12","alias_value":"VBOLZ6LWIHUS","created_at":"2026-07-03T01:17:26Z"},{"alias_kind":"pith_short_16","alias_value":"VBOLZ6LWIHUSFUUQ","created_at":"2026-07-03T01:17:26Z"},{"alias_kind":"pith_short_8","alias_value":"VBOLZ6LW","created_at":"2026-07-03T01:17:26Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:VBOLZ6LWIHUSFUUQ7VAST4C4HO","target":"record","payload":{"canonical_record":{"source":{"id":"2607.01667","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-07-02T03:47:45Z","cross_cats_sorted":[],"title_canon_sha256":"cc21ae0aafc7e948bdf0d59d58d4671a3003a9bf2b2b60757e8e73f5f8982a4e","abstract_canon_sha256":"460c2823235c4ba5d3da36c63230908b72eeacff019b867ab230d01c7c7295e9"},"schema_version":"1.0"},"canonical_sha256":"a85cbcf97641e922d290fd4129f05c3b8659a155f1825de51c232973c0ee6713","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-03T01:17:26.140858Z","signature_b64":"FiV4eeyg6dQfW31hldNFu0sxfUROwLo0wVXotlVDln87YOsqrc9LSCIW78bl9UrPogm+NQyMO5s/qEfdavPqDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a85cbcf97641e922d290fd4129f05c3b8659a155f1825de51c232973c0ee6713","last_reissued_at":"2026-07-03T01:17:26.140261Z","signature_status":"signed_v1","first_computed_at":"2026-07-03T01:17:26.140261Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2607.01667","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-07-03T01:17:26Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"HrgpjLC1McLN3c8rNN17YxigEiPsmk/EaV4YocGK87HAcbZYeyPUlcSTR0wKexWFOFcnLorM5j/hKLg9TjKmCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-04T07:57:12.068131Z"},"content_sha256":"abd9849457482ad2198c1328da0fc2c157ce14ed06eefd72b56b347948fc836c","schema_version":"1.0","event_id":"sha256:abd9849457482ad2198c1328da0fc2c157ce14ed06eefd72b56b347948fc836c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:VBOLZ6LWIHUSFUUQ7VAST4C4HO","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Temporal and Cross-Modal Alignment for Enhanced Audiovisual Video Captioning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chen Zhao, Hongyu Li, Jiajun Ma, Jian Yang, Qilong Huang, Tiehan Fan, Xiaoming Wei, Ying Tai, Zhuoliang Kang","submitted_at":"2026-07-02T03:47:45Z","abstract_excerpt":"While Multimodal Large Language Models (MLLMs) have advanced video understanding, achieving precise temporal and cross-modal alignment in audiovisual video captioning remains a formidable challenge. Most existing approaches suffer from modality detachment and temporal incoherence, failing to accurately bind auditory events to visual entities or capture complex causal dynamics. To address these deficiencies, we propose TCA-Captioner, a framework specifically engineered to enhance Temporal and Cross-Modal Alignment for audiovisual video captioning. We first introduce the Observer-Checker-Correct"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2607.01667","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2607.01667/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-07-03T01:17:26Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"tO7NzOfEt/29rU+IhQ8pW0akWahm1I5FixKohcBu2y2b1zd3dR8WH8zpaMgWzrYNI4ukIcn3YjYAAy522KmPCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-04T07:57:12.068528Z"},"content_sha256":"ff176374dc032812516928122bc4086fcac286807a70c13f9beceb098fc5ec9c","schema_version":"1.0","event_id":"sha256:ff176374dc032812516928122bc4086fcac286807a70c13f9beceb098fc5ec9c"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/VBOLZ6LWIHUSFUUQ7VAST4C4HO/bundle.json","state_url":"https://pith.science/pith/VBOLZ6LWIHUSFUUQ7VAST4C4HO/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/VBOLZ6LWIHUSFUUQ7VAST4C4HO/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-07-04T07:57:12Z","links":{"resolver":"https://pith.science/pith/VBOLZ6LWIHUSFUUQ7VAST4C4HO","bundle":"https://pith.science/pith/VBOLZ6LWIHUSFUUQ7VAST4C4HO/bundle.json","state":"https://pith.science/pith/VBOLZ6LWIHUSFUUQ7VAST4C4HO/state.json","well_known_bundle":"https://pith.science/.well-known/pith/VBOLZ6LWIHUSFUUQ7VAST4C4HO/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:VBOLZ6LWIHUSFUUQ7VAST4C4HO","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"460c2823235c4ba5d3da36c63230908b72eeacff019b867ab230d01c7c7295e9","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-07-02T03:47:45Z","title_canon_sha256":"cc21ae0aafc7e948bdf0d59d58d4671a3003a9bf2b2b60757e8e73f5f8982a4e"},"schema_version":"1.0","source":{"id":"2607.01667","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2607.01667","created_at":"2026-07-03T01:17:26Z"},{"alias_kind":"arxiv_version","alias_value":"2607.01667v1","created_at":"2026-07-03T01:17:26Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2607.01667","created_at":"2026-07-03T01:17:26Z"},{"alias_kind":"pith_short_12","alias_value":"VBOLZ6LWIHUS","created_at":"2026-07-03T01:17:26Z"},{"alias_kind":"pith_short_16","alias_value":"VBOLZ6LWIHUSFUUQ","created_at":"2026-07-03T01:17:26Z"},{"alias_kind":"pith_short_8","alias_value":"VBOLZ6LW","created_at":"2026-07-03T01:17:26Z"}],"graph_snapshots":[{"event_id":"sha256:ff176374dc032812516928122bc4086fcac286807a70c13f9beceb098fc5ec9c","target":"graph","created_at":"2026-07-03T01:17:26Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2607.01667/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"While Multimodal Large Language Models (MLLMs) have advanced video understanding, achieving precise temporal and cross-modal alignment in audiovisual video captioning remains a formidable challenge. Most existing approaches suffer from modality detachment and temporal incoherence, failing to accurately bind auditory events to visual entities or capture complex causal dynamics. To address these deficiencies, we propose TCA-Captioner, a framework specifically engineered to enhance Temporal and Cross-Modal Alignment for audiovisual video captioning. We first introduce the Observer-Checker-Correct","authors_text":"Chen Zhao, Hongyu Li, Jiajun Ma, Jian Yang, Qilong Huang, Tiehan Fan, Xiaoming Wei, Ying Tai, Zhuoliang Kang","cross_cats":[],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-07-02T03:47:45Z","title":"Temporal and Cross-Modal Alignment for Enhanced Audiovisual Video Captioning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2607.01667","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:abd9849457482ad2198c1328da0fc2c157ce14ed06eefd72b56b347948fc836c","target":"record","created_at":"2026-07-03T01:17:26Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"460c2823235c4ba5d3da36c63230908b72eeacff019b867ab230d01c7c7295e9","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-07-02T03:47:45Z","title_canon_sha256":"cc21ae0aafc7e948bdf0d59d58d4671a3003a9bf2b2b60757e8e73f5f8982a4e"},"schema_version":"1.0","source":{"id":"2607.01667","kind":"arxiv","version":1}},"canonical_sha256":"a85cbcf97641e922d290fd4129f05c3b8659a155f1825de51c232973c0ee6713","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"a85cbcf97641e922d290fd4129f05c3b8659a155f1825de51c232973c0ee6713","first_computed_at":"2026-07-03T01:17:26.140261Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-07-03T01:17:26.140261Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"FiV4eeyg6dQfW31hldNFu0sxfUROwLo0wVXotlVDln87YOsqrc9LSCIW78bl9UrPogm+NQyMO5s/qEfdavPqDQ==","signature_status":"signed_v1","signed_at":"2026-07-03T01:17:26.140858Z","signed_message":"canonical_sha256_bytes"},"source_id":"2607.01667","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:abd9849457482ad2198c1328da0fc2c157ce14ed06eefd72b56b347948fc836c","sha256:ff176374dc032812516928122bc4086fcac286807a70c13f9beceb098fc5ec9c"],"state_sha256":"3369c16450a7468aacc44c03aa5abc84e0fc0cbe164d1695ac70085b7d20aca2"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"sfbmGdvZCbiowqrKGHehuUtVpD7fAXRbefpCOvdlwvVZV1NSRBw6npcCMnY48o0oQgwMst6qzXotprHUxp8XAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-07-04T07:57:12.070616Z","bundle_sha256":"f02fee6b4d77f0b0398fa26559ddad13fe9d7d655fe7327608317f6aca8e1e97"}}