{"total":18,"items":[{"citing_arxiv_id":"2606.31247","ref_index":118,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FlexiSLM: A Dynamic and Controllable Frame Rate Spoken Language Model","primary_cat":"cs.SD","submitted_at":"2026-06-30T07:24:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FlexiSLM is the first spoken language model supporting dynamic and controllable frame rates on speech input and output, outperforming fixed-rate 7B models at high quality and enabling faster inference at lower rates like 6.25 Hz.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31128","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"UniSAE: Unified Speech Attribute Editing on Speaker, Emotion and Low-Level Content via Discrete Phonetic Posteriorgram Modelling","primary_cat":"cs.SD","submitted_at":"2026-06-30T04:46:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniSAE unifies speaker, emotion, and multi-granularity content editing in speech via a new discrete phonetic posteriorgram representation and diffusion-based rendering.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20101","ref_index":56,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Hybrid Diffusion Transformer for Instruction-Guided Audio Editing via Rectified Flow","primary_cat":"cs.SD","submitted_at":"2026-06-18T11:20:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Hybrid two-stage diffusion transformer architecture for instruction-guided audio editing via rectified flow that performs joint attention at low resolution then alternates joint and cross-attention at high resolution for improved performance and efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03455","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"WavTTS: Towards High-Quality Zero-Shot TTS via Direct Raw Waveform Modeling","primary_cat":"eess.AS","submitted_at":"2026-06-02T10:33:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"WavTTS is the first raw-waveform diffusion TTS model using DiT flow matching and multi-scale mel supervision that approaches SOTA latent zero-shot performance while beating prior end-to-end models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29531","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Audio Deepfake Detection with Half-Truth Localisation Using Cross-Attentive Feature Fusion","primary_cat":"cs.SD","submitted_at":"2026-05-28T07:47:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CAFNet performs joint ternary classification and temporal boundary regression for half-truth audio deepfakes via cross-attentive fusion of MFCC, LFCC, and Chroma-STFT features, reporting 92.71% accuracy and 0.075s MAE on MLADDC T2+T3.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26672","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Can We Hear from Events? Generating Speech from Event Camera","primary_cat":"cs.MM","submitted_at":"2026-05-26T08:11:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EventSpeech is a text-conditioned neural framework that uses neuromorphic event cameras and a new EVT-SPK benchmark to generate expressive speech, claiming to outperform RGB baselines by preserving fine-grained emotions without motion blur.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18749","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"WavFlow: Audio Generation in Waveform Space","primary_cat":"cs.SD","submitted_at":"2026-05-18T17:59:10+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WavFlow performs direct waveform audio generation via flow matching on 2D token grids from raw patches plus amplitude lifting, matching latent-based methods on VGGSound and AudioCaps without intermediate compression.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17085","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Taming Audio VAEs via Target-KL Regularization","primary_cat":"cs.SD","submitted_at":"2026-05-16T17:01:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper introduces target-KL regularization to train audio VAEs at specific bitrates, enabling rate-distortion curves and comparison to discrete audio codecs for improved text-to-sound generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16681","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Survey of Advancing Audio Super-Resolution and Bandwidth Extension from Discriminative to Generative Models","primary_cat":"eess.AS","submitted_at":"2026-05-15T22:34:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A structured survey of audio bandwidth extension that organizes the transition from deterministic discriminative DNNs to generative approaches including GANs, diffusion models, and flow-based methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15831","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Modeling Music as a Time-Frequency Image: A 2D Tokenizer for Music Generation","primary_cat":"cs.SD","submitted_at":"2026-05-15T10:35:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BandTok tokenizes Mel-spectrograms as independent time-frequency band tokens from a single codebook and pairs it with 2D RoPE in an autoregressive model to improve music generation over residual multi-codebook tokenizers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17986","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Latent Fourier Transform","primary_cat":"cs.SD","submitted_at":"2026-04-20T09:08:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LatentFT uses latent-space Fourier transforms and frequency masking in diffusion autoencoders to enable timescale-specific manipulation of musical structure in generative models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.01284","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation","primary_cat":"cs.MM","submitted_at":"2025-09-30T21:03:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A single generative model uses twin DiT backbones with blockwise cross-attention and scaled-RoPE timing exchange to synthesize synchronized audio-video directly.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.16632","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Step-Audio 2 Technical Report","primary_cat":"cs.CL","submitted_at":"2025-07-22T14:23:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Step-Audio 2 integrates a latent audio encoder, reasoning-centric reinforcement learning, and discrete audio token generation into language modeling to deliver state-of-the-art performance on audio understanding and conversational benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.24437","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SwitchCodec: A High-Fidelity Nerual Audio Codec With Sparse Quantization","primary_cat":"cs.SD","submitted_at":"2025-05-30T10:20:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SwitchCodec introduces Residual Experts Vector Quantization and a multi-tiered STFT discriminator to achieve PESQ 2.87 and ViSQOL 4.27 at 2.67 kbps while halving training time via post-training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.18425","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Kimi-Audio Technical Report","primary_cat":"eess.AS","submitted_at":"2025-04-25T15:31:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Kimi-Audio is an open-source audio foundation model that achieves state-of-the-art results on speech recognition, audio understanding, question answering, and conversation after pre-training on more than 13 million hours of speech, sound, and music data.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"forward step will mix mi with Gaussian noise and the backward step will remove noise to obtain clean mi with condition ad i and prompt cj, where j < i , and cj contains both mj and ad j. With this design, during inference, when the LLM generates a chunk, we employ the flow-matching model to detokenize it to obtain the mel-spectrograms. Finally, we apply a BigVGAN [38] vocoder to generate wavforms for each chunk. Look-Ahead Mechanism. With a preliminary study, we find that the generated audio in the boundaries of chunks still has an intermittent issue. Although a long range of history context has been seen during the diffusion denoising process, the future context of the boundary position cannot be seen due to the nature of block-wise causal attention, which causes the degradation of"},{"citing_arxiv_id":"2410.13720","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Movie Gen: A Cast of Media Foundation Models","primary_cat":"cs.CV","submitted_at":"2024-10-17T16:22:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A 30B-parameter transformer and related models generate high-quality videos and audio, claiming state-of-the-art results on text-to-video, video editing, personalization, and audio generation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.06885","ref_index":115,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching","primary_cat":"eess.AS","submitted_at":"2024-10-09T13:46:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"F5-TTS generates natural speech from text via flow matching on DiT with simple text padding, ConvNeXt refinement, and sway sampling, trained on 100K hours multilingual data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.02430","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Seed-TTS: A Family of High-Quality Versatile Speech Generation Models","primary_cat":"eess.AS","submitted_at":"2024-06-04T15:48:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Seed-TTS models produce speech matching human naturalness and speaker similarity, with added controllability via self-distillation and reinforcement learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}