{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:XAKILETQRV726EYMPKFVLI6OAD","short_pith_number":"pith:XAKILETQ","canonical_record":{"source":{"id":"2606.01207","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-31T12:55:17Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"77d0c9ccb80785d6fcd0cb1377a8c3416888c53693df635c4768187ce6004dc8","abstract_canon_sha256":"140927cf0b3e8afc5e6ab361698df7be4fc6d0298a3535b1f4f8c98efdf962ed"},"schema_version":"1.0"},"canonical_sha256":"b8148592708d7faf130c7a8b55a3ce00eb174e461f10549c9b513a9c30058c07","source":{"kind":"arxiv","id":"2606.01207","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.01207","created_at":"2026-06-02T02:04:26Z"},{"alias_kind":"arxiv_version","alias_value":"2606.01207v1","created_at":"2026-06-02T02:04:26Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.01207","created_at":"2026-06-02T02:04:26Z"},{"alias_kind":"pith_short_12","alias_value":"XAKILETQRV72","created_at":"2026-06-02T02:04:26Z"},{"alias_kind":"pith_short_16","alias_value":"XAKILETQRV726EYM","created_at":"2026-06-02T02:04:26Z"},{"alias_kind":"pith_short_8","alias_value":"XAKILETQ","created_at":"2026-06-02T02:04:26Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:XAKILETQRV726EYMPKFVLI6OAD","target":"record","payload":{"canonical_record":{"source":{"id":"2606.01207","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-31T12:55:17Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"77d0c9ccb80785d6fcd0cb1377a8c3416888c53693df635c4768187ce6004dc8","abstract_canon_sha256":"140927cf0b3e8afc5e6ab361698df7be4fc6d0298a3535b1f4f8c98efdf962ed"},"schema_version":"1.0"},"canonical_sha256":"b8148592708d7faf130c7a8b55a3ce00eb174e461f10549c9b513a9c30058c07","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T02:04:26.818437Z","signature_b64":"TJuHgjAcifY90Mz5xtNVrOn3QyOeBNtYHhepVF+2WwMr/sjBHdrkKrv4z0FwEJE7oE7YTzcY0t8K+nfzDGaUAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b8148592708d7faf130c7a8b55a3ce00eb174e461f10549c9b513a9c30058c07","last_reissued_at":"2026-06-02T02:04:26.818042Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T02:04:26.818042Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2606.01207","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-02T02:04:26Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"pgkaWP1UELLlCFpohVCcxHpeUGE/+Qjvlv8haD/3o3neoHdY535wHbr6aT/c+aeRMjypc+JieGNvAF1cjz7GDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-30T02:44:46.796155Z"},"content_sha256":"91131e07f1bbfed37e55b8193cd791463421dcedcfd07d86084542d282f8cf6c","schema_version":"1.0","event_id":"sha256:91131e07f1bbfed37e55b8193cd791463421dcedcfd07d86084542d282f8cf6c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:XAKILETQRV726EYMPKFVLI6OAD","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Feature Alignment Determines Fusion Strategy: A Comparative Study of Cross-Attention and Concatenation in Multimodal Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CV","authors_text":"Xuezhen Xie, Zhiqiang Zhou","submitted_at":"2026-05-31T12:55:17Z","abstract_excerpt":"The choice between cross-attention and concatenation for multimodal fusion remains governed by practitioner intuition rather than principled understanding. In this paper, we demonstrate that feature alignment quality, not data scale alone, is the primary determinant of which fusion strategy excels. Through controlled experiments on Flickr8k using two feature extraction backbones (ResNet18 and CLIP ViT-B/32), we show that concatenation outperforms cross-attention by 4.1-5.1 percentage points across all tested scales (2048-16384 samples) when features are pre-aligned by a vision-language pretrai"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.01207","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.01207/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-02T02:04:26Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"dwOBqj9gs328oHWsLh0NZtVEANNU0x7rKqK85RDWK/IZQ8bU0YBDrwsSpUbzqS0AqlJNIeH+ymghJs8IVOpABQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-30T02:44:46.796548Z"},"content_sha256":"6d8226412fad89d7bb01fd1ac275d41138a088ed9fa40d9ad8218be84dbe7de3","schema_version":"1.0","event_id":"sha256:6d8226412fad89d7bb01fd1ac275d41138a088ed9fa40d9ad8218be84dbe7de3"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/XAKILETQRV726EYMPKFVLI6OAD/bundle.json","state_url":"https://pith.science/pith/XAKILETQRV726EYMPKFVLI6OAD/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/XAKILETQRV726EYMPKFVLI6OAD/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-30T02:44:46Z","links":{"resolver":"https://pith.science/pith/XAKILETQRV726EYMPKFVLI6OAD","bundle":"https://pith.science/pith/XAKILETQRV726EYMPKFVLI6OAD/bundle.json","state":"https://pith.science/pith/XAKILETQRV726EYMPKFVLI6OAD/state.json","well_known_bundle":"https://pith.science/.well-known/pith/XAKILETQRV726EYMPKFVLI6OAD/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:XAKILETQRV726EYMPKFVLI6OAD","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"140927cf0b3e8afc5e6ab361698df7be4fc6d0298a3535b1f4f8c98efdf962ed","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-31T12:55:17Z","title_canon_sha256":"77d0c9ccb80785d6fcd0cb1377a8c3416888c53693df635c4768187ce6004dc8"},"schema_version":"1.0","source":{"id":"2606.01207","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.01207","created_at":"2026-06-02T02:04:26Z"},{"alias_kind":"arxiv_version","alias_value":"2606.01207v1","created_at":"2026-06-02T02:04:26Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.01207","created_at":"2026-06-02T02:04:26Z"},{"alias_kind":"pith_short_12","alias_value":"XAKILETQRV72","created_at":"2026-06-02T02:04:26Z"},{"alias_kind":"pith_short_16","alias_value":"XAKILETQRV726EYM","created_at":"2026-06-02T02:04:26Z"},{"alias_kind":"pith_short_8","alias_value":"XAKILETQ","created_at":"2026-06-02T02:04:26Z"}],"graph_snapshots":[{"event_id":"sha256:6d8226412fad89d7bb01fd1ac275d41138a088ed9fa40d9ad8218be84dbe7de3","target":"graph","created_at":"2026-06-02T02:04:26Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.01207/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"The choice between cross-attention and concatenation for multimodal fusion remains governed by practitioner intuition rather than principled understanding. In this paper, we demonstrate that feature alignment quality, not data scale alone, is the primary determinant of which fusion strategy excels. Through controlled experiments on Flickr8k using two feature extraction backbones (ResNet18 and CLIP ViT-B/32), we show that concatenation outperforms cross-attention by 4.1-5.1 percentage points across all tested scales (2048-16384 samples) when features are pre-aligned by a vision-language pretrai","authors_text":"Xuezhen Xie, Zhiqiang Zhou","cross_cats":["cs.LG"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-31T12:55:17Z","title":"Feature Alignment Determines Fusion Strategy: A Comparative Study of Cross-Attention and Concatenation in Multimodal Learning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.01207","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:91131e07f1bbfed37e55b8193cd791463421dcedcfd07d86084542d282f8cf6c","target":"record","created_at":"2026-06-02T02:04:26Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"140927cf0b3e8afc5e6ab361698df7be4fc6d0298a3535b1f4f8c98efdf962ed","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-31T12:55:17Z","title_canon_sha256":"77d0c9ccb80785d6fcd0cb1377a8c3416888c53693df635c4768187ce6004dc8"},"schema_version":"1.0","source":{"id":"2606.01207","kind":"arxiv","version":1}},"canonical_sha256":"b8148592708d7faf130c7a8b55a3ce00eb174e461f10549c9b513a9c30058c07","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b8148592708d7faf130c7a8b55a3ce00eb174e461f10549c9b513a9c30058c07","first_computed_at":"2026-06-02T02:04:26.818042Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-02T02:04:26.818042Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"TJuHgjAcifY90Mz5xtNVrOn3QyOeBNtYHhepVF+2WwMr/sjBHdrkKrv4z0FwEJE7oE7YTzcY0t8K+nfzDGaUAQ==","signature_status":"signed_v1","signed_at":"2026-06-02T02:04:26.818437Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.01207","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:91131e07f1bbfed37e55b8193cd791463421dcedcfd07d86084542d282f8cf6c","sha256:6d8226412fad89d7bb01fd1ac275d41138a088ed9fa40d9ad8218be84dbe7de3"],"state_sha256":"53dced38d39dc84f88adb5c70980eb2d3bb1e8f4ad955b0232b3c9d8284c7f7f"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"kXF9e7o+Ut41EOvbXd97f2IrWyh+SWSjdwTUJXZjB6l3RN0j1zKkrb9WOSmqpZSceS0KJoBJJ1CjZOaI5Xy/Dw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-30T02:44:46.798787Z","bundle_sha256":"7ea8895d7f84806dab295e9a5939c87dbd48175d94a39dc6ab6ee648e64a2605"}}