{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2015:VAVHLJCEDGTJI4UB7GQQY5F2SH","short_pith_number":"pith:VAVHLJCE","schema_version":"1.0","canonical_sha256":"a82a75a44419a6947281f9a10c74ba91d801f905ed21f54ad5d58cf2e00a0b23","source":{"kind":"arxiv","id":"1506.06272","version":1},"attestation_state":"computed","paper":{"title":"Aligning where to see and what to tell: image caption with region-based attention and scene factorization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.CV","authors_text":"Changshui Zhang, Fei Sha, Junqi Jin, Kun Fu, Runpeng Cui","submitted_at":"2015-06-20T17:25:38Z","abstract_excerpt":"Recent progress on automatic generation of image captions has shown that it is possible to describe the most salient information conveyed by images with accurate and meaningful sentences. In this paper, we propose an image caption system that exploits the parallel structures between images and sentences. In our model, the process of generating the next word, given the previously generated ones, is aligned with the visual perception experience where the attention shifting among the visual regions imposes a thread of visual ordering. This alignment characterizes the flow of \"abstract meaning\", e"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1506.06272","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2015-06-20T17:25:38Z","cross_cats_sorted":["cs.LG","stat.ML"],"title_canon_sha256":"ccc4134e67b6ba2f0ceb17f9fdd22d3793d57f93f0c40c22f3e8a3896444c927","abstract_canon_sha256":"008412d9b19a5e9e34e86b7762126ce9ebe8118f8558c55f7e73b60789eb94e6"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:41:45.640911Z","signature_b64":"B6/oSPmNPQatmnc2X9Nti3xAeS39C6p8I8lCm56iByYGNW7iG/JrrJdWFUZ3h7OlUzg0GTrh75q1YlYnQ6JiDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a82a75a44419a6947281f9a10c74ba91d801f905ed21f54ad5d58cf2e00a0b23","last_reissued_at":"2026-05-18T01:41:45.640415Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:41:45.640415Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Aligning where to see and what to tell: image caption with region-based attention and scene factorization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.CV","authors_text":"Changshui Zhang, Fei Sha, Junqi Jin, Kun Fu, Runpeng Cui","submitted_at":"2015-06-20T17:25:38Z","abstract_excerpt":"Recent progress on automatic generation of image captions has shown that it is possible to describe the most salient information conveyed by images with accurate and meaningful sentences. In this paper, we propose an image caption system that exploits the parallel structures between images and sentences. In our model, the process of generating the next word, given the previously generated ones, is aligned with the visual perception experience where the attention shifting among the visual regions imposes a thread of visual ordering. This alignment characterizes the flow of \"abstract meaning\", e"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1506.06272","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1506.06272","created_at":"2026-05-18T01:41:45.640509+00:00"},{"alias_kind":"arxiv_version","alias_value":"1506.06272v1","created_at":"2026-05-18T01:41:45.640509+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1506.06272","created_at":"2026-05-18T01:41:45.640509+00:00"},{"alias_kind":"pith_short_12","alias_value":"VAVHLJCEDGTJ","created_at":"2026-05-18T12:29:44.643036+00:00"},{"alias_kind":"pith_short_16","alias_value":"VAVHLJCEDGTJI4UB","created_at":"2026-05-18T12:29:44.643036+00:00"},{"alias_kind":"pith_short_8","alias_value":"VAVHLJCE","created_at":"2026-05-18T12:29:44.643036+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VAVHLJCEDGTJI4UB7GQQY5F2SH","json":"https://pith.science/pith/VAVHLJCEDGTJI4UB7GQQY5F2SH.json","graph_json":"https://pith.science/api/pith-number/VAVHLJCEDGTJI4UB7GQQY5F2SH/graph.json","events_json":"https://pith.science/api/pith-number/VAVHLJCEDGTJI4UB7GQQY5F2SH/events.json","paper":"https://pith.science/paper/VAVHLJCE"},"agent_actions":{"view_html":"https://pith.science/pith/VAVHLJCEDGTJI4UB7GQQY5F2SH","download_json":"https://pith.science/pith/VAVHLJCEDGTJI4UB7GQQY5F2SH.json","view_paper":"https://pith.science/paper/VAVHLJCE","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1506.06272&json=true","fetch_graph":"https://pith.science/api/pith-number/VAVHLJCEDGTJI4UB7GQQY5F2SH/graph.json","fetch_events":"https://pith.science/api/pith-number/VAVHLJCEDGTJI4UB7GQQY5F2SH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VAVHLJCEDGTJI4UB7GQQY5F2SH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VAVHLJCEDGTJI4UB7GQQY5F2SH/action/storage_attestation","attest_author":"https://pith.science/pith/VAVHLJCEDGTJI4UB7GQQY5F2SH/action/author_attestation","sign_citation":"https://pith.science/pith/VAVHLJCEDGTJI4UB7GQQY5F2SH/action/citation_signature","submit_replication":"https://pith.science/pith/VAVHLJCEDGTJI4UB7GQQY5F2SH/action/replication_record"}},"created_at":"2026-05-18T01:41:45.640509+00:00","updated_at":"2026-05-18T01:41:45.640509+00:00"}