{"total":16,"items":[{"citing_arxiv_id":"2605.11098","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AffectCodec: Emotion-Preserving Neural Speech Codec for Expressive Speech Modeling","primary_cat":"cs.SD","submitted_at":"2026-05-11T18:04:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AffectCodec is an emotion-guided neural speech codec that preserves emotional cues during quantization while maintaining semantic fidelity and prosodic naturalness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10199","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"How Should LLMs Listen While Speaking? A Study of User-Stream Routing in Full-Duplex Spoken Dialogue","primary_cat":"cs.CL","submitted_at":"2026-05-11T08:46:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Channel fusion gives better semantic grounding and QA performance in full-duplex LLM dialogue but is vulnerable to context corruption during interruptions, while cross-attention routing is more robust at the cost of weaker integration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06765","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VITA-QinYu: Expressive Spoken Language Model for Role-Playing and Singing","primary_cat":"cs.CL","submitted_at":"2026-05-07T17:59:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VITA-QinYu is the first expressive end-to-end spoken language model supporting role-playing and singing alongside conversation, trained on 15.8K hours of data and outperforming prior models on expressiveness and conversational benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05927","ref_index":12,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Minimizing Modality Gap from the Input Side: Your Speech LLM Can Be a Prosody-Aware Text LLM","primary_cat":"cs.CL","submitted_at":"2026-05-07T09:32:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TextPro-SLM reduces the speech-text modality gap by feeding an LLM backbone with synchronized text tokens and prosody embeddings from WhisperPro, achieving lowest gap scores at 3B/7B scales with roughly 1,000 hours of audio.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00329","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Fast Text-to-Audio Generation with One-Step Sampling via Energy-Scoring and Auxiliary Contextual Representation Distillation","primary_cat":"cs.SD","submitted_at":"2026-05-01T01:13:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A one-step text-to-audio model using energy-distance training and contextual distillation outperforms prior fast baselines on AudioCaps and achieves up to 8.5x faster inference than the multi-step IMPACT system with competitive quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21406","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Full-Duplex Interaction in Spoken Dialogue Systems: A Comprehensive Study from the ICASSP 2026 HumDial Challenge","primary_cat":"eess.AS","submitted_at":"2026-04-23T08:21:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A new HumDial-FDBench benchmark and real human-recorded dual-channel dataset are released to assess full-duplex dialogue systems on interruptions and conversational flow.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20842","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SpeechParaling-Bench: A Comprehensive Benchmark for Paralinguistic-Aware Speech Generation","primary_cat":"cs.CL","submitted_at":"2026-04-22T17:59:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SpeechParaling-Bench is a new evaluation framework for paralinguistic-aware speech generation that reveals major limitations in current large audio-language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18489","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Aligning Language Models for Lyric-to-Melody Generation with Rule-Based Musical Constraints","primary_cat":"cs.SD","submitted_at":"2026-04-20T16:40:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Rule-generated preference data aligned via sequential DPO and KTO reduces musical constraint violations and improves coherence in lyric-to-melody generation over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14604","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Hijacking Large Audio-Language Models via Context-Agnostic and Imperceptible Auditory Prompt Injection","primary_cat":"cs.CR","submitted_at":"2026-04-16T04:22:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AudioHijack generates imperceptible adversarial audio via gradient estimation, attention supervision, and reverberation blending to hijack 13 LALMs with 79-96% success on unseen contexts and real commercial agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13804","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Character Beyond Speech: Leveraging Role-Playing Evaluation in Audio Large Language Models via Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-15T12:39:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RoleJudge is a multidimensional evaluation framework for speech-character alignment in audio LLMs, backed by the RoleChat dataset and multi-stage RL training with standard alignment to reduce reward issues.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11594","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HumDial-EIBench: A Human-Recorded Multi-Turn Emotional Intelligence Benchmark for Audio Language Models","primary_cat":"eess.AS","submitted_at":"2026-04-13T15:06:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HumDial-EIBench is a new benchmark using real human dialogues to evaluate audio language models on emotional intelligence tasks including multi-turn tracking, causal reasoning, empathy generation, and acoustic-semantic conflict resolution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09222","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GRM: Utility-Aware Jailbreak Attacks on Audio LLMs via Gradient-Ratio Masking","primary_cat":"cs.SD","submitted_at":"2026-04-10T11:27:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GRM ranks Mel bands by attack contribution versus utility sensitivity, perturbs a subset, and learns a universal perturbation to reach 88.46% average jailbreak success rate with improved attack-utility trade-off on four audio LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08003","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rethinking Entropy Allocation in LLM-based ASR: Understanding the Dynamics between Speech Encoders and LLMs","primary_cat":"eess.AS","submitted_at":"2026-04-09T09:07:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A multi-stage training method for LLM-based ASR uses new entropy allocation metrics to achieve competitive benchmark performance with 2.3B parameters while mitigating hallucinations via better encoder-LLM decoupling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.01897","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FastTurn: Unifying Acoustic and Streaming Semantic Cues for Low-Latency and Robust Turn Detection","primary_cat":"cs.SD","submitted_at":"2026-04-02T11:00:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FastTurn unifies acoustic features and streaming CTC decoding for low-latency, robust turn detection in full-duplex dialogue systems and releases a realistic human-dialogue test set.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.18425","ref_index":84,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Kimi-Audio Technical Report","primary_cat":"eess.AS","submitted_at":"2025-04-25T15:31:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Kimi-Audio is an open-source audio foundation model that achieves state-of-the-art results on speech recognition, audio understanding, question answering, and conversation after pre-training on more than 13 million hours of speech, sound, and music data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.01743","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Phi-4-Mini Technical Report: Compact yet Powerful Multimodal Language Models via Mixture-of-LoRAs","primary_cat":"cs.CL","submitted_at":"2025-03-03T17:05:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Phi-4-Mini achieves strong math and coding performance with only 3.8B parameters via high-quality synthetic data, while Phi-4-Multimodal uses Mixture-of-LoRAs to integrate modalities and top speech recognition leaderboards.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}