{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:FMDGLBOPJP5MS3CXYU6Q2SPD6G","short_pith_number":"pith:FMDGLBOP","schema_version":"1.0","canonical_sha256":"2b066585cf4bfac96c57c53d0d49e3f180511f0b45c4b337b8744c523bc18e1f","source":{"kind":"arxiv","id":"1702.05658","version":3},"attestation_state":"computed","paper":{"title":"MAT: A Multimodal Attentive Translator for Image Captioning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Alan Yuille, Changhu Wang, Chang Liu, Feng Wang, Fuchun Sun","submitted_at":"2017-02-18T21:35:06Z","abstract_excerpt":"In this work we formulate the problem of image captioning as a multimodal translation task. Analogous to machine translation, we present a sequence-to-sequence recurrent neural networks (RNN) model for image caption generation. Different from most existing work where the whole image is represented by convolutional neural network (CNN) feature, we propose to represent the input image as a sequence of detected objects which feeds as the source sequence of the RNN model. In this way, the sequential representation of an image can be naturally translated to a sequence of words, as the target sequen"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1702.05658","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2017-02-18T21:35:06Z","cross_cats_sorted":[],"title_canon_sha256":"dfb218df281974dda68d9439f672e608e9d52b72ba0928012e34bb87fb16938a","abstract_canon_sha256":"af064ec8c942d79a508df2c76df3a28ccc0c63ccd2c72a6060c99635aace8aed"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:38:18.259767Z","signature_b64":"i6oxjZTwL5TqORrEDGR/vnQ3sRSmowkBc+KnOWBVQ9eyIsGMPUht7yRBcm45L5Yf3eDiOxzWni3B+FiOqpJRDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2b066585cf4bfac96c57c53d0d49e3f180511f0b45c4b337b8744c523bc18e1f","last_reissued_at":"2026-05-18T00:38:18.259042Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:38:18.259042Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"MAT: A Multimodal Attentive Translator for Image Captioning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Alan Yuille, Changhu Wang, Chang Liu, Feng Wang, Fuchun Sun","submitted_at":"2017-02-18T21:35:06Z","abstract_excerpt":"In this work we formulate the problem of image captioning as a multimodal translation task. Analogous to machine translation, we present a sequence-to-sequence recurrent neural networks (RNN) model for image caption generation. Different from most existing work where the whole image is represented by convolutional neural network (CNN) feature, we propose to represent the input image as a sequence of detected objects which feeds as the source sequence of the RNN model. In this way, the sequential representation of an image can be naturally translated to a sequence of words, as the target sequen"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1702.05658","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1702.05658","created_at":"2026-05-18T00:38:18.259173+00:00"},{"alias_kind":"arxiv_version","alias_value":"1702.05658v3","created_at":"2026-05-18T00:38:18.259173+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1702.05658","created_at":"2026-05-18T00:38:18.259173+00:00"},{"alias_kind":"pith_short_12","alias_value":"FMDGLBOPJP5M","created_at":"2026-05-18T12:31:15.632608+00:00"},{"alias_kind":"pith_short_16","alias_value":"FMDGLBOPJP5MS3CX","created_at":"2026-05-18T12:31:15.632608+00:00"},{"alias_kind":"pith_short_8","alias_value":"FMDGLBOP","created_at":"2026-05-18T12:31:15.632608+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"1907.08948","citing_title":"Hindi Visual Genome: A Dataset for Multimodal English-to-Hindi Machine Translation","ref_index":8,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/FMDGLBOPJP5MS3CXYU6Q2SPD6G","json":"https://pith.science/pith/FMDGLBOPJP5MS3CXYU6Q2SPD6G.json","graph_json":"https://pith.science/api/pith-number/FMDGLBOPJP5MS3CXYU6Q2SPD6G/graph.json","events_json":"https://pith.science/api/pith-number/FMDGLBOPJP5MS3CXYU6Q2SPD6G/events.json","paper":"https://pith.science/paper/FMDGLBOP"},"agent_actions":{"view_html":"https://pith.science/pith/FMDGLBOPJP5MS3CXYU6Q2SPD6G","download_json":"https://pith.science/pith/FMDGLBOPJP5MS3CXYU6Q2SPD6G.json","view_paper":"https://pith.science/paper/FMDGLBOP","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1702.05658&json=true","fetch_graph":"https://pith.science/api/pith-number/FMDGLBOPJP5MS3CXYU6Q2SPD6G/graph.json","fetch_events":"https://pith.science/api/pith-number/FMDGLBOPJP5MS3CXYU6Q2SPD6G/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/FMDGLBOPJP5MS3CXYU6Q2SPD6G/action/timestamp_anchor","attest_storage":"https://pith.science/pith/FMDGLBOPJP5MS3CXYU6Q2SPD6G/action/storage_attestation","attest_author":"https://pith.science/pith/FMDGLBOPJP5MS3CXYU6Q2SPD6G/action/author_attestation","sign_citation":"https://pith.science/pith/FMDGLBOPJP5MS3CXYU6Q2SPD6G/action/citation_signature","submit_replication":"https://pith.science/pith/FMDGLBOPJP5MS3CXYU6Q2SPD6G/action/replication_record"}},"created_at":"2026-05-18T00:38:18.259173+00:00","updated_at":"2026-05-18T00:38:18.259173+00:00"}