{"total":22,"items":[{"citing_arxiv_id":"2606.31247","ref_index":216,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FlexiSLM: A Dynamic and Controllable Frame Rate Spoken Language Model","primary_cat":"cs.SD","submitted_at":"2026-06-30T07:24:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FlexiSLM is the first spoken language model supporting dynamic and controllable frame rates on speech input and output, outperforming fixed-rate 7B models at high quality and enabling faster inference at lower rates like 6.25 Hz.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30944","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Preserving Speech-to-Text LLM Capabilities in Speech-to-Speech Generation","primary_cat":"eess.AS","submitted_at":"2026-06-29T21:55:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PRIME-Speech adds low-latency speech output to frozen S2T LLMs by synchronizing a causal post-decoder with intermediate hidden states and using mixed conditioning plus turn-level KV-cache packing, preserving original S2T performance across translation, QA, and dialogue tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30145","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FacePlex: Full-Duplex Joint Speech-Facial Motion Generation for Conversational Avatars","primary_cat":"cs.AI","submitted_at":"2026-06-29T11:22:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FacePlex introduces a unified streaming model with Rolling Flow Matching and Rolling Cross-Attention to enable full-duplex joint real-time generation of speech and facial motion tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13544","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Adaptive Turn-Taking for Real-time Multi-Party Voice Agents","primary_cat":"eess.AS","submitted_at":"2026-06-11T16:27:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ModeratorLM conditions a streaming speech LLM on assigned roles for adaptive turn-taking in multi-party settings, reporting over 40% higher precision and 70% higher recall than non-role baselines on real meetings and a new synthetic dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06559","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"IRAF: Interference-Resilient Adaptive Fusion for Noise-Robust End-to-End Full-Duplex Spoken Dialogue Systems","primary_cat":"cs.SD","submitted_at":"2026-06-04T12:39:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"IRAF introduces an adaptive fusion module that uses a predicted scalar reliability gate to reduce the impact of interfering speakers on user audio representations in end-to-end full-duplex spoken dialogue systems, with reported gains on MS-MARCO and InstructS2S-200K.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05121","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Audio Interaction Model","primary_cat":"cs.SD","submitted_at":"2026-06-03T17:26:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Audio-Interaction unifies offline and online audio tasks into one streaming model via the SoundFlow framework and a new 2.6M-item streaming corpus, enabling real-time instruction following and proactive responses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27190","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning When to Think While Listening in Large Audio-Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-26T15:43:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A wait-think-answer controller for LALMs is trained via SFT followed by six-reward DAPO, raising row-weighted accuracy from 67.6% to 70.3% and cutting post-endpoint thinking length by 14% on synthetic spoken QA while remaining functional on real recorded audio.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20755","ref_index":4,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DuplexSLA: A Full-Duplex Spoken Language Model with Synchronized Speech, Language, and Action","primary_cat":"eess.AS","submitted_at":"2026-05-20T05:54:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DuplexSLA introduces a three-channel full-duplex architecture that synchronizes continuous user audio, discrete assistant audio, and rate-limited textual actions inside a single backbone for native turn-taking and in-conversation tool use.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20266","ref_index":123,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Survey of Large Audio Language Models: Generalization, Trustworthiness, and Outlook","primary_cat":"cs.SD","submitted_at":"2026-05-18T20:21:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A survey of Large Audio Language Models that establishes a taxonomy of trustworthiness vulnerabilities and proposes a Defense-in-Depth roadmap for audio intelligence.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"1T text tokens✓ ✓ ✓ LLaMA-Omni [118] Sep 2024 Llama-3.1-8B-Instruct 8B EN Contin. -✗✓ ✓ Parrot [119] Sep 2024 Llama 3.1-8B 8B EN Discrete 74,554 Hrs audio✓✗✓ OmniFlatten [120] Oct 2024 Qwen2-0.5B 0.5B EN, CN Discrete -✓ ✓ ✓ IntrinsicVoice [121] Oct 2024 Qwen2-7B-Instruct 7B - Discrete 20K Hrs audio✗✓ ✓ DiVA [122] Oct 2024 Llama 3 8B EN Contin. -✗✓ ✓ Freeze-Omni [123] Nov 2024 Qwen2-7B-Instruct 7B EN, CN Contin. -✓ ✓ ✓ GLM-4-Voice [124] Dec 2024 GLM-4-9B 9B EN, CN Discrete 1T tokens✗✓ ✓ KE-Omni [125] Dec 2024 LLaMA-3.1-8B-Instruct 8B EN, CN Contin. -✗✓ ✓ MERaLiON-Audio [126] Dec 2024 SEA-LION V3 10B Multi. Contin. -✗✓ ✓ Year 2025 MinMo [60] Jan 2025 Qwen2.5-7B-Instruct 7B Multi. Contin. -✓ ✓ ✓ FireRedASR [12]"},{"citing_arxiv_id":"2605.06765","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VITA-QinYu: Expressive Spoken Language Model for Role-Playing and Singing","primary_cat":"cs.CL","submitted_at":"2026-05-07T17:59:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VITA-QinYu is the first expressive end-to-end spoken language model supporting role-playing and singing alongside conversation, trained on 15.8K hours of data and outperforming prior models on expressiveness and conversational benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21406","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Full-Duplex Interaction in Spoken Dialogue Systems: A Comprehensive Study from the ICASSP 2026 HumDial Challenge","primary_cat":"eess.AS","submitted_at":"2026-04-23T08:21:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A new HumDial-FDBench benchmark and real human-recorded dual-channel dataset are released to assess full-duplex dialogue systems on interruptions and conversational flow.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.01897","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FastTurn: Unifying Acoustic and Streaming Semantic Cues for Low-Latency and Robust Turn Detection","primary_cat":"cs.SD","submitted_at":"2026-04-02T11:00:37+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"though Easy Turn generates accurate outputs, it still faces la- tency issues due to the need to first output ASR results, and it has limited capacity to model complex acoustic information. Moreover, existing open-source dialogue corpora generally lack fine-grained turn-taking annotations, which limits the ability to reliably model and evaluate turn detection. Although some di- alogue datasets [15, 16] provide partial turn annotations, these datasets still fall short of meeting the demands of modern di- alogue systems-especially in real-world scenarios involving multiple participants, background noise, and natural speech in- teractions. Furthermore, many turn detection datasets and full- duplex interaction benchmarks [17, 18] are not derived from"},{"citing_arxiv_id":"2603.17837","ref_index":35,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Silent Thought: Modeling Internal Cognition in Full-Duplex Spoken Dialogue Models via Latent Reasoning","primary_cat":"eess.AS","submitted_at":"2026-03-18T15:30:29+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.22220","ref_index":74,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"StableToken: A Noise-Robust Semantic Speech Tokenizer for Resilient SpeechLLMs","primary_cat":"cs.CL","submitted_at":"2025-09-26T11:32:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StableToken introduces a multi-branch architecture with bit-wise voting to create noise-robust semantic speech tokens, achieving lower Unit Edit Distance and better SpeechLLM robustness than prior single-path tokenizers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.14804","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Towards Building Speech Large Language Models for Multitask Understanding in Low-Resource Languages","primary_cat":"cs.SD","submitted_at":"2025-09-18T09:59:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces XLSR-Thai encoder, U-Align alignment, and Thai-SUP data pipeline to enable multitask speech understanding SLLMs for Thai.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.10016","ref_index":20,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Training-Free Multimodal Large Language Model Orchestration","primary_cat":"cs.CL","submitted_at":"2025-08-06T16:17:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM Orchestration integrates modality experts via an LLM controller, cross-modal memory, and interaction layer to enable multimodal input-output without gradient-based training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.16632","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Step-Audio 2 Technical Report","primary_cat":"cs.CL","submitted_at":"2025-07-22T14:23:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Step-Audio 2 integrates a latent audio encoder, reasoning-centric reinforcement learning, and discrete audio token generation into language modeling to deliver state-of-the-art performance on audio understanding and conversational benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.14654","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Words: Multimodal LLM Knows When to Speak","primary_cat":"cs.CV","submitted_at":"2025-05-20T17:42:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MM-When2Speak reformulates conversational timing as dense response-type prediction and achieves up to 3x better performance by integrating video, audio, and text cues on top of an LLM backbone using a new dyadic conversation dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.18425","ref_index":71,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Kimi-Audio Technical Report","primary_cat":"eess.AS","submitted_at":"2025-04-25T15:31:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Kimi-Audio is an open-source audio foundation model that achieves state-of-the-art results on speech recognition, audio understanding, question answering, and conversation after pre-training on more than 13 million hours of speech, sound, and music data.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"end-to-end speech interaction. Moshi [14], GLM-4-V oice [84], and Mini-Omni [72] adopt inter- leaved or parallel decoding to support simultaneous generation of text and audio tokens, facilitating low-latency dialogue systems. OmniFlatten [86] introduces a progressive training pipeline to adapt a frozen LLM for full-duplex conversation. LLaMA-Omni [18] and Freeze-Omni [71] further refine duplex speech interaction through streaming decoders or multi-task alignment strategies. However, these systems often rely heavily on speech-only datasets and compromise language modeling quality or generality due to limited pre-training. Toward Universal Audio-Language Foundation Models A small number of recent works aim to unify understanding and generation within a single multimodal model."},{"citing_arxiv_id":"2502.11946","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction","primary_cat":"cs.CL","submitted_at":"2025-02-17T15:58:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Step-Audio introduces a 130B-parameter unified speech-text model with open-sourced components for understanding, generation, affordable voice cloning, and dynamic control, claiming SOTA human evaluation results on a new benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.01957","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction","primary_cat":"cs.CV","submitted_at":"2025-01-03T18:59:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"VITA-1.5 integrates vision and speech into a single LLM through multi-stage training, delivering competitive benchmark results on image, video, and speech tasks with near real-time response speed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.02612","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GLM-4-Voice: Towards Intelligent and Human-Like End-to-End Spoken Chatbot","primary_cat":"cs.CL","submitted_at":"2024-12-03T17:41:24+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GLM-4-Voice builds an end-to-end spoken chatbot by deriving a 175bps single-codebook tokenizer from ASR, synthesizing interleaved speech-text data, and continuing pre-training of GLM-4-9B on up to 1 trillion tokens before fine-tuning on conversational speech.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}