{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:TGUESTQNQDO7XREYQJTEADR6PW","short_pith_number":"pith:TGUESTQN","canonical_record":{"source":{"id":"2507.16632","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-07-22T14:23:55Z","cross_cats_sorted":["cs.SD","eess.AS"],"title_canon_sha256":"b646849e8a7c531836a0955a51e33d7a166c3109cb62ee09014c8258dc259d25","abstract_canon_sha256":"d058c8b802bb2451222fdb49d5ad5886e9f7b768f3dbf4d6cc331027cd01b905"},"schema_version":"1.0"},"canonical_sha256":"99a8494e0d80ddfbc4988266400e3e7da65ffb3402ae02d35f2cf6aeb0238abc","source":{"kind":"arxiv","id":"2507.16632","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2507.16632","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2507.16632v3","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2507.16632","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"TGUESTQNQDO7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"TGUESTQNQDO7XREY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"TGUESTQN","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:TGUESTQNQDO7XREYQJTEADR6PW","target":"record","payload":{"canonical_record":{"source":{"id":"2507.16632","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-07-22T14:23:55Z","cross_cats_sorted":["cs.SD","eess.AS"],"title_canon_sha256":"b646849e8a7c531836a0955a51e33d7a166c3109cb62ee09014c8258dc259d25","abstract_canon_sha256":"d058c8b802bb2451222fdb49d5ad5886e9f7b768f3dbf4d6cc331027cd01b905"},"schema_version":"1.0"},"canonical_sha256":"99a8494e0d80ddfbc4988266400e3e7da65ffb3402ae02d35f2cf6aeb0238abc","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:48.906866Z","signature_b64":"OyKn5opcSa/7veOMlgwTDsTnyHccdo+D/9dxgqWi3cByrBVK/ygIN3Pu1zNRXoNjKc9I9jVIHJahtbVXBQRdBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"99a8494e0d80ddfbc4988266400e3e7da65ffb3402ae02d35f2cf6aeb0238abc","last_reissued_at":"2026-05-17T23:38:48.906177Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:48.906177Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2507.16632","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ks1SubdIOZ/hiIl2hxQ3BvpnO4BAI4btkN65hhSIm+WCPM7QpdwTHvnXemNErbCsbrMqT64E//3etGnFFt9EBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T18:24:58.422284Z"},"content_sha256":"f627d7b45307a5424dc295b8cbffde7639461f244f62f0f6afa7fb9559294a01","schema_version":"1.0","event_id":"sha256:f627d7b45307a5424dc295b8cbffde7639461f244f62f0f6afa7fb9559294a01"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:TGUESTQNQDO7XREYQJTEADR6PW","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Step-Audio 2 Technical Report","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Step-Audio 2 integrates latent audio encoding and discrete token generation to deliver state-of-the-art audio understanding and expressive end-to-end speech conversation.","cross_cats":["cs.SD","eess.AS"],"primary_cat":"cs.CL","authors_text":"Bingxin Li, Bin Wang, Binxing Jiao, Bo Li, Boyong Wu, Brian Li, Buyun Ma, Changhe Song, Changxin Miao, Changyi Wan, Chao Yan, Che Liu, Chengli Feng, Cheng Yi, Chen Hu, Chen Xu, Dapeng Shi, Daxin Jiang, Dingyuan Hu, Donghang Wu, Dongqing Pang, Enle Liu, Fei Tian, Feiyu Shen, Gang Yu, Guanzhe Huang, Gulin Yan, Guoqiang Hu, Haiyang Sun, Hanpeng Hu, Han Zhang, Haonan Jia, Hao Nie, Haoyang Zhang, Heung-Yeung Shum, Hongyu Zhou, Jiangjie Zhen, Jianjian Sun, Jiansheng Chen, Jiaoren Wu, Jie Wu, Jie Yang, Jingbei Li, Jing Li, Jin Yang, Junzhe Lin, Kaixiang Li, Kang An, Lei Yang, Liying Shi, Li Zhou, Longlong Gu, Ming Li, Mingliang Li, Mingrui Chen, Mingxiao Li, Nan Wu, Na Wang, Peng Liu, Qi Han, Qinyuan Tan, Shaoliang Pang, Shengjie Fan, Shuli Gao, Siqi Liu, Siyu Chen, Song Yuan, Tiancheng Cao, Wang You, Wanying Lu, Wei Ji, Wen Li, Wenqing He, Wen Sun, Wuxun Xie, Xiangyu Tony Zhang, Xiangyu Zhang, Xingyuan Li, Xuan Wen, Xuelin Zhang, Xueqi Li, Xuerui Yang, Xu Zhao, Yanbo Yu, Yang Yang, Yayue Deng, Yechang Huang, Yibo Zhu, Yifan Lu, Yilei Wang, Yi Liu, Yimin Jiang, Yong Ren, Yuanhao Ding, Yuankai Ma, Yuanwei Liang, Yuanwei Lu, Yuchu Luo, Yufan Lu, Yuhe Yin, Yumeng Zhan, Yuxiang Yang, Yuxiang Zhang, Yuxin Li, Yuxin Zhang, Yu Zhou, Zhao You, Zidong Yang, Zixin Zhang","submitted_at":"2025-07-22T14:23:55Z","abstract_excerpt":"This paper presents Step-Audio 2, an end-to-end multi-modal large language model designed for industry-strength audio understanding and speech conversation. By integrating a latent audio encoder and reasoning-centric reinforcement learning (RL), Step-Audio 2 achieves promising performance in automatic speech recognition (ASR) and audio understanding. To facilitate genuine end-to-end speech conversation, Step-Audio 2 incorporates the generation of discrete audio tokens into language modeling, significantly enhancing its responsiveness to paralinguistic information such as speaking styles and em"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Step-Audio 2 achieves state-of-the-art performance on various audio understanding and conversational benchmarks compared to other open-source and commercial solutions.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the combination of latent audio encoding, reasoning-centric RL, discrete token generation, and RAG integration produces robust, generalizable performance on real-world conversational tasks beyond the reported benchmarks.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Step-Audio 2 integrates a latent audio encoder, reasoning-centric reinforcement learning, and discrete audio token generation into language modeling to deliver state-of-the-art performance on audio understanding and conversational benchmarks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Step-Audio 2 integrates latent audio encoding and discrete token generation to deliver state-of-the-art audio understanding and expressive end-to-end speech conversation.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"7936797df1eb8c38ab969b0dd9c45726c8320e0e3805d7682e8f85b621239f53"},"source":{"id":"2507.16632","kind":"arxiv","version":3},"verdict":{"id":"48451f31-1b95-4ab4-9540-42d0a7c8faf5","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T05:55:34.925173Z","strongest_claim":"Step-Audio 2 achieves state-of-the-art performance on various audio understanding and conversational benchmarks compared to other open-source and commercial solutions.","one_line_summary":"Step-Audio 2 integrates a latent audio encoder, reasoning-centric reinforcement learning, and discrete audio token generation into language modeling to deliver state-of-the-art performance on audio understanding and conversational benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the combination of latent audio encoding, reasoning-centric RL, discrete token generation, and RAG integration produces robust, generalizable performance on real-world conversational tasks beyond the reported benchmarks.","pith_extraction_headline":"Step-Audio 2 integrates latent audio encoding and discrete token generation to deliver state-of-the-art audio understanding and expressive end-to-end speech conversation."},"references":{"count":84,"sample":[{"doi":"","year":2024,"title":"Seed-TTS: A Family of High-Quality Versatile Speech Generation Models","work_id":"6e88ee95-1133-4302-a142-cdf8f9456a8d","ref_index":1,"cited_arxiv_id":"2406.02430","is_internal_anchor":true},{"doi":"","year":2023,"title":"PaLM 2 Technical Report","work_id":"905ee9a7-ea61-4a94-bd62-2600cbe3e315","ref_index":2,"cited_arxiv_id":"2305.10403","is_internal_anchor":true},{"doi":"","year":2020,"title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","work_id":"453ebac7-2aaa-4384-b2aa-2f73ad059753","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","ref_index":4,"cited_arxiv_id":"2309.16609","is_internal_anchor":true},{"doi":"","year":2024,"title":"Seed-asr: Understanding diverse speech and contexts with llm-based speech recognition","work_id":"c5c60033-9068-454d-8df1-52efb011f98b","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":84,"snapshot_sha256":"438837758a1d1de5f02c49a6453185cf913d569d494128e5acd5ea428f171eac","internal_anchors":23},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"48451f31-1b95-4ab4-9540-42d0a7c8faf5"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"To5SIjKqZWtzc7EKdEVw62eZpRGEyCbGmlprCH6MwqBaZtPQTMJ3lXeI5aN6FuD+g7I76H4qpsW4P9mYC78uBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T18:24:58.423273Z"},"content_sha256":"12083b7a918a4d9c5902a3f553b307a68da77d604888548f931b82b3c2cc624a","schema_version":"1.0","event_id":"sha256:12083b7a918a4d9c5902a3f553b307a68da77d604888548f931b82b3c2cc624a"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/TGUESTQNQDO7XREYQJTEADR6PW/bundle.json","state_url":"https://pith.science/pith/TGUESTQNQDO7XREYQJTEADR6PW/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/TGUESTQNQDO7XREYQJTEADR6PW/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T18:24:58Z","links":{"resolver":"https://pith.science/pith/TGUESTQNQDO7XREYQJTEADR6PW","bundle":"https://pith.science/pith/TGUESTQNQDO7XREYQJTEADR6PW/bundle.json","state":"https://pith.science/pith/TGUESTQNQDO7XREYQJTEADR6PW/state.json","well_known_bundle":"https://pith.science/.well-known/pith/TGUESTQNQDO7XREYQJTEADR6PW/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:TGUESTQNQDO7XREYQJTEADR6PW","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"d058c8b802bb2451222fdb49d5ad5886e9f7b768f3dbf4d6cc331027cd01b905","cross_cats_sorted":["cs.SD","eess.AS"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-07-22T14:23:55Z","title_canon_sha256":"b646849e8a7c531836a0955a51e33d7a166c3109cb62ee09014c8258dc259d25"},"schema_version":"1.0","source":{"id":"2507.16632","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2507.16632","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2507.16632v3","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2507.16632","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"TGUESTQNQDO7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"TGUESTQNQDO7XREY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"TGUESTQN","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:12083b7a918a4d9c5902a3f553b307a68da77d604888548f931b82b3c2cc624a","target":"graph","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Step-Audio 2 achieves state-of-the-art performance on various audio understanding and conversational benchmarks compared to other open-source and commercial solutions."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the combination of latent audio encoding, reasoning-centric RL, discrete token generation, and RAG integration produces robust, generalizable performance on real-world conversational tasks beyond the reported benchmarks."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Step-Audio 2 integrates a latent audio encoder, reasoning-centric reinforcement learning, and discrete audio token generation into language modeling to deliver state-of-the-art performance on audio understanding and conversational benchmarks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Step-Audio 2 integrates latent audio encoding and discrete token generation to deliver state-of-the-art audio understanding and expressive end-to-end speech conversation."}],"snapshot_sha256":"7936797df1eb8c38ab969b0dd9c45726c8320e0e3805d7682e8f85b621239f53"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"This paper presents Step-Audio 2, an end-to-end multi-modal large language model designed for industry-strength audio understanding and speech conversation. By integrating a latent audio encoder and reasoning-centric reinforcement learning (RL), Step-Audio 2 achieves promising performance in automatic speech recognition (ASR) and audio understanding. To facilitate genuine end-to-end speech conversation, Step-Audio 2 incorporates the generation of discrete audio tokens into language modeling, significantly enhancing its responsiveness to paralinguistic information such as speaking styles and em","authors_text":"Bingxin Li, Bin Wang, Binxing Jiao, Bo Li, Boyong Wu, Brian Li, Buyun Ma, Changhe Song, Changxin Miao, Changyi Wan, Chao Yan, Che Liu, Chengli Feng, Cheng Yi, Chen Hu, Chen Xu, Dapeng Shi, Daxin Jiang, Dingyuan Hu, Donghang Wu, Dongqing Pang, Enle Liu, Fei Tian, Feiyu Shen, Gang Yu, Guanzhe Huang, Gulin Yan, Guoqiang Hu, Haiyang Sun, Hanpeng Hu, Han Zhang, Haonan Jia, Hao Nie, Haoyang Zhang, Heung-Yeung Shum, Hongyu Zhou, Jiangjie Zhen, Jianjian Sun, Jiansheng Chen, Jiaoren Wu, Jie Wu, Jie Yang, Jingbei Li, Jing Li, Jin Yang, Junzhe Lin, Kaixiang Li, Kang An, Lei Yang, Liying Shi, Li Zhou, Longlong Gu, Ming Li, Mingliang Li, Mingrui Chen, Mingxiao Li, Nan Wu, Na Wang, Peng Liu, Qi Han, Qinyuan Tan, Shaoliang Pang, Shengjie Fan, Shuli Gao, Siqi Liu, Siyu Chen, Song Yuan, Tiancheng Cao, Wang You, Wanying Lu, Wei Ji, Wen Li, Wenqing He, Wen Sun, Wuxun Xie, Xiangyu Tony Zhang, Xiangyu Zhang, Xingyuan Li, Xuan Wen, Xuelin Zhang, Xueqi Li, Xuerui Yang, Xu Zhao, Yanbo Yu, Yang Yang, Yayue Deng, Yechang Huang, Yibo Zhu, Yifan Lu, Yilei Wang, Yi Liu, Yimin Jiang, Yong Ren, Yuanhao Ding, Yuankai Ma, Yuanwei Liang, Yuanwei Lu, Yuchu Luo, Yufan Lu, Yuhe Yin, Yumeng Zhan, Yuxiang Yang, Yuxiang Zhang, Yuxin Li, Yuxin Zhang, Yu Zhou, Zhao You, Zidong Yang, Zixin Zhang","cross_cats":["cs.SD","eess.AS"],"headline":"Step-Audio 2 integrates latent audio encoding and discrete token generation to deliver state-of-the-art audio understanding and expressive end-to-end speech conversation.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-07-22T14:23:55Z","title":"Step-Audio 2 Technical Report"},"references":{"count":84,"internal_anchors":23,"resolved_work":84,"sample":[{"cited_arxiv_id":"2406.02430","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Seed-TTS: A Family of High-Quality Versatile Speech Generation Models","work_id":"6e88ee95-1133-4302-a142-cdf8f9456a8d","year":2024},{"cited_arxiv_id":"2305.10403","doi":"","is_internal_anchor":true,"ref_index":2,"title":"PaLM 2 Technical Report","work_id":"905ee9a7-ea61-4a94-bd62-2600cbe3e315","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","work_id":"453ebac7-2aaa-4384-b2aa-2f73ad059753","year":2020},{"cited_arxiv_id":"2309.16609","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Seed-asr: Understanding diverse speech and contexts with llm-based speech recognition","work_id":"c5c60033-9068-454d-8df1-52efb011f98b","year":2024}],"snapshot_sha256":"438837758a1d1de5f02c49a6453185cf913d569d494128e5acd5ea428f171eac"},"source":{"id":"2507.16632","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-16T05:55:34.925173Z","id":"48451f31-1b95-4ab4-9540-42d0a7c8faf5","model_set":{"reader":"grok-4.3"},"one_line_summary":"Step-Audio 2 integrates a latent audio encoder, reasoning-centric reinforcement learning, and discrete audio token generation into language modeling to deliver state-of-the-art performance on audio understanding and conversational benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Step-Audio 2 integrates latent audio encoding and discrete token generation to deliver state-of-the-art audio understanding and expressive end-to-end speech conversation.","strongest_claim":"Step-Audio 2 achieves state-of-the-art performance on various audio understanding and conversational benchmarks compared to other open-source and commercial solutions.","weakest_assumption":"That the combination of latent audio encoding, reasoning-centric RL, discrete token generation, and RAG integration produces robust, generalizable performance on real-world conversational tasks beyond the reported benchmarks."}},"verdict_id":"48451f31-1b95-4ab4-9540-42d0a7c8faf5"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f627d7b45307a5424dc295b8cbffde7639461f244f62f0f6afa7fb9559294a01","target":"record","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"d058c8b802bb2451222fdb49d5ad5886e9f7b768f3dbf4d6cc331027cd01b905","cross_cats_sorted":["cs.SD","eess.AS"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-07-22T14:23:55Z","title_canon_sha256":"b646849e8a7c531836a0955a51e33d7a166c3109cb62ee09014c8258dc259d25"},"schema_version":"1.0","source":{"id":"2507.16632","kind":"arxiv","version":3}},"canonical_sha256":"99a8494e0d80ddfbc4988266400e3e7da65ffb3402ae02d35f2cf6aeb0238abc","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"99a8494e0d80ddfbc4988266400e3e7da65ffb3402ae02d35f2cf6aeb0238abc","first_computed_at":"2026-05-17T23:38:48.906177Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:48.906177Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"OyKn5opcSa/7veOMlgwTDsTnyHccdo+D/9dxgqWi3cByrBVK/ygIN3Pu1zNRXoNjKc9I9jVIHJahtbVXBQRdBA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:48.906866Z","signed_message":"canonical_sha256_bytes"},"source_id":"2507.16632","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f627d7b45307a5424dc295b8cbffde7639461f244f62f0f6afa7fb9559294a01","sha256:12083b7a918a4d9c5902a3f553b307a68da77d604888548f931b82b3c2cc624a"],"state_sha256":"27902206677ae6131c27addc9bfff6823ba82e49193b2ed15ef18944baac4c93"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"qpwIHBAVez+A5tGJpEpIEKHoBOK+wIVHv20wjoPlugMAj10+BCeTtNqan67en0zxUSgl3v25m2ygFAfK0SAmDw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T18:24:58.428700Z","bundle_sha256":"5c5fafb7776f416d62ea212d2a4a7b50eb5c3bd8ae24efe981aff13bd5668021"}}