{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2014:W67U2AU7JAHQMQT3OB42GMRX4P","short_pith_number":"pith:W67U2AU7","schema_version":"1.0","canonical_sha256":"b7bf4d029f480f06427b7079a33237e3dc3168ef7a62099b725f2fb28dc55910","source":{"kind":"arxiv","id":"1405.4053","version":2},"attestation_state":"computed","paper":{"title":"Distributed Representations of Sentences and Documents","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Quoc V. Le, Tomas Mikolov","submitted_at":"2014-05-16T07:12:16Z","abstract_excerpt":"Many machine learning algorithms require the input to be represented as a fixed-length feature vector. When it comes to texts, one of the most common fixed-length features is bag-of-words. Despite their popularity, bag-of-words features have two major weaknesses: they lose the ordering of the words and they also ignore semantics of the words. For example, \"powerful,\" \"strong\" and \"Paris\" are equally distant. In this paper, we propose Paragraph Vector, an unsupervised algorithm that learns fixed-length feature representations from variable-length pieces of texts, such as sentences, paragraphs, "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1405.4053","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2014-05-16T07:12:16Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"491551892fe1faeb275ddfd272ff395ac1365efbcfa0dbb76fd066fbac7d2758","abstract_canon_sha256":"83a3c50553150e6b09254efe7e20168fc4b33039bd04e547904094bbdc2cd198"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:51:14.577113Z","signature_b64":"miS6syFu+4fn/Gd21UEYzr4JaxGMxw5CI9gG56Ubw3uFkKGnwwidT9ePOET9nFZAGjxSrOCotib/IQQHL69sDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b7bf4d029f480f06427b7079a33237e3dc3168ef7a62099b725f2fb28dc55910","last_reissued_at":"2026-05-18T02:51:14.576643Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:51:14.576643Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Distributed Representations of Sentences and Documents","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Quoc V. Le, Tomas Mikolov","submitted_at":"2014-05-16T07:12:16Z","abstract_excerpt":"Many machine learning algorithms require the input to be represented as a fixed-length feature vector. When it comes to texts, one of the most common fixed-length features is bag-of-words. Despite their popularity, bag-of-words features have two major weaknesses: they lose the ordering of the words and they also ignore semantics of the words. For example, \"powerful,\" \"strong\" and \"Paris\" are equally distant. In this paper, we propose Paragraph Vector, an unsupervised algorithm that learns fixed-length feature representations from variable-length pieces of texts, such as sentences, paragraphs, "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1405.4053","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1405.4053","created_at":"2026-05-18T02:51:14.576712+00:00"},{"alias_kind":"arxiv_version","alias_value":"1405.4053v2","created_at":"2026-05-18T02:51:14.576712+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1405.4053","created_at":"2026-05-18T02:51:14.576712+00:00"},{"alias_kind":"pith_short_12","alias_value":"W67U2AU7JAHQ","created_at":"2026-05-18T12:28:54.890064+00:00"},{"alias_kind":"pith_short_16","alias_value":"W67U2AU7JAHQMQT3","created_at":"2026-05-18T12:28:54.890064+00:00"},{"alias_kind":"pith_short_8","alias_value":"W67U2AU7","created_at":"2026-05-18T12:28:54.890064+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"1907.06600","citing_title":"Medical Concept Representation Learning from Claims Data and Application to Health Plan Payment Risk Adjustment","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"1907.08167","citing_title":"OCC: A Smart Reply System for Efficient In-App Communications","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"1909.01066","citing_title":"Language Models as Knowledge Bases?","ref_index":261,"is_internal_anchor":true},{"citing_arxiv_id":"1603.08983","citing_title":"Adaptive Computation Time for Recurrent Neural Networks","ref_index":20,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/W67U2AU7JAHQMQT3OB42GMRX4P","json":"https://pith.science/pith/W67U2AU7JAHQMQT3OB42GMRX4P.json","graph_json":"https://pith.science/api/pith-number/W67U2AU7JAHQMQT3OB42GMRX4P/graph.json","events_json":"https://pith.science/api/pith-number/W67U2AU7JAHQMQT3OB42GMRX4P/events.json","paper":"https://pith.science/paper/W67U2AU7"},"agent_actions":{"view_html":"https://pith.science/pith/W67U2AU7JAHQMQT3OB42GMRX4P","download_json":"https://pith.science/pith/W67U2AU7JAHQMQT3OB42GMRX4P.json","view_paper":"https://pith.science/paper/W67U2AU7","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1405.4053&json=true","fetch_graph":"https://pith.science/api/pith-number/W67U2AU7JAHQMQT3OB42GMRX4P/graph.json","fetch_events":"https://pith.science/api/pith-number/W67U2AU7JAHQMQT3OB42GMRX4P/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/W67U2AU7JAHQMQT3OB42GMRX4P/action/timestamp_anchor","attest_storage":"https://pith.science/pith/W67U2AU7JAHQMQT3OB42GMRX4P/action/storage_attestation","attest_author":"https://pith.science/pith/W67U2AU7JAHQMQT3OB42GMRX4P/action/author_attestation","sign_citation":"https://pith.science/pith/W67U2AU7JAHQMQT3OB42GMRX4P/action/citation_signature","submit_replication":"https://pith.science/pith/W67U2AU7JAHQMQT3OB42GMRX4P/action/replication_record"}},"created_at":"2026-05-18T02:51:14.576712+00:00","updated_at":"2026-05-18T02:51:14.576712+00:00"}