{"total":2,"items":[{"citing_arxiv_id":"2606.04418","ref_index":54,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CleanCodec: Efficient and Robust Speech Tokenization via Perceptually Guided Encoding","primary_cat":"cs.SD","submitted_at":"2026-06-03T03:56:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CleanCodec reframes audio tokenization as a selective information bottleneck to encode only perceptually important features at 12.5 tokens per second, outperforming prior codecs in efficiency, speaker similarity, and intelligibility.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00407","ref_index":75,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Privacy-preserving Prosody Representation Learning","primary_cat":"eess.AS","submitted_at":"2026-05-29T22:49:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A self-supervised prosody encoder with speaker disentanglement strategies outperforms raw prosody and HuBERT baselines on pitch reconstruction and prosodic event detection while achieving strong speaker separation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}