{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:QSMX2Y57YN4QLS3ESHTMTNHUL5","short_pith_number":"pith:QSMX2Y57","schema_version":"1.0","canonical_sha256":"84997d63bfc37905cb6491e6c9b4f45f48a40719c79f9d5206b51acb6d8b7b8b","source":{"kind":"arxiv","id":"1906.09890","version":2},"attestation_state":"computed","paper":{"title":"Self Multi-Head Attention for Speaker Recognition","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.SD","authors_text":"Javier Hernando, Miquel India, Pooyan Safari","submitted_at":"2019-06-24T12:44:09Z","abstract_excerpt":"Most state-of-the-art Deep Learning (DL) approaches for speaker recognition work on a short utterance level. Given the speech signal, these algorithms extract a sequence of speaker embeddings from short segments and those are averaged to obtain an utterance level speaker representation. In this work we propose the use of an attention mechanism to obtain a discriminative speaker embedding given non fixed length speech utterances. Our system is based on a Convolutional Neural Network (CNN) that encodes short-term speaker features from the spectrogram and a self multi-head attention model that ma"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1906.09890","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SD","submitted_at":"2019-06-24T12:44:09Z","cross_cats_sorted":["cs.LG","stat.ML"],"title_canon_sha256":"f403bed1c2735889cda01a30c5fd58b5f7c916c978a7f2bd8f710a593c8e1016","abstract_canon_sha256":"a411a5d583bfed4bc91e42134491d83a3cbd1c6ff340ee15f50ad352579d0910"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:41:41.712807Z","signature_b64":"taBuqKxoPpkByBI1urpZu5M28rTn1xDLn/moo7yrBcgSz/Ah+mo/CVc3TLSOgb0u+ZsiNQ3feTrNlOEzsi4YDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"84997d63bfc37905cb6491e6c9b4f45f48a40719c79f9d5206b51acb6d8b7b8b","last_reissued_at":"2026-05-17T23:41:41.712346Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:41:41.712346Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Self Multi-Head Attention for Speaker Recognition","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.SD","authors_text":"Javier Hernando, Miquel India, Pooyan Safari","submitted_at":"2019-06-24T12:44:09Z","abstract_excerpt":"Most state-of-the-art Deep Learning (DL) approaches for speaker recognition work on a short utterance level. Given the speech signal, these algorithms extract a sequence of speaker embeddings from short segments and those are averaged to obtain an utterance level speaker representation. In this work we propose the use of an attention mechanism to obtain a discriminative speaker embedding given non fixed length speech utterances. Our system is based on a Convolutional Neural Network (CNN) that encodes short-term speaker features from the spectrogram and a self multi-head attention model that ma"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1906.09890","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1906.09890","created_at":"2026-05-17T23:41:41.712426+00:00"},{"alias_kind":"arxiv_version","alias_value":"1906.09890v2","created_at":"2026-05-17T23:41:41.712426+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1906.09890","created_at":"2026-05-17T23:41:41.712426+00:00"},{"alias_kind":"pith_short_12","alias_value":"QSMX2Y57YN4Q","created_at":"2026-05-18T12:33:27.125529+00:00"},{"alias_kind":"pith_short_16","alias_value":"QSMX2Y57YN4QLS3E","created_at":"2026-05-18T12:33:27.125529+00:00"},{"alias_kind":"pith_short_8","alias_value":"QSMX2Y57","created_at":"2026-05-18T12:33:27.125529+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"1906.09890","citing_title":"Self Multi-Head Attention for Speaker Recognition","ref_index":1,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/QSMX2Y57YN4QLS3ESHTMTNHUL5","json":"https://pith.science/pith/QSMX2Y57YN4QLS3ESHTMTNHUL5.json","graph_json":"https://pith.science/api/pith-number/QSMX2Y57YN4QLS3ESHTMTNHUL5/graph.json","events_json":"https://pith.science/api/pith-number/QSMX2Y57YN4QLS3ESHTMTNHUL5/events.json","paper":"https://pith.science/paper/QSMX2Y57"},"agent_actions":{"view_html":"https://pith.science/pith/QSMX2Y57YN4QLS3ESHTMTNHUL5","download_json":"https://pith.science/pith/QSMX2Y57YN4QLS3ESHTMTNHUL5.json","view_paper":"https://pith.science/paper/QSMX2Y57","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1906.09890&json=true","fetch_graph":"https://pith.science/api/pith-number/QSMX2Y57YN4QLS3ESHTMTNHUL5/graph.json","fetch_events":"https://pith.science/api/pith-number/QSMX2Y57YN4QLS3ESHTMTNHUL5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/QSMX2Y57YN4QLS3ESHTMTNHUL5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/QSMX2Y57YN4QLS3ESHTMTNHUL5/action/storage_attestation","attest_author":"https://pith.science/pith/QSMX2Y57YN4QLS3ESHTMTNHUL5/action/author_attestation","sign_citation":"https://pith.science/pith/QSMX2Y57YN4QLS3ESHTMTNHUL5/action/citation_signature","submit_replication":"https://pith.science/pith/QSMX2Y57YN4QLS3ESHTMTNHUL5/action/replication_record"}},"created_at":"2026-05-17T23:41:41.712426+00:00","updated_at":"2026-05-17T23:41:41.712426+00:00"}