{"total":51,"items":[{"citing_arxiv_id":"2607.00247","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Adaptive Perturbation Selection for Contrastive Audio Decoding","primary_cat":"cs.SD","submitted_at":"2026-06-30T22:55:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Adaptive selection among a library of audio perturbations in contrastive decoding produces task-dependent accuracy gains, including +4.3% on an existence task via a hidden-state selector.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24082","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Comparative Reasoning: Making an Audio Language Model Better at Comparing Emotions","primary_cat":"eess.AS","submitted_at":"2026-06-23T02:55:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A reasoning-guided ordinal SER framework conditions LALMs on paired speech, trains on semantic and GeMAPS-derived reasoning traces, and applies direct preference optimization to improve comparative emotion prediction with only 5% of conventional training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23243","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unlocking In-Context Learning in Audio-Language Models from Decentralized Medical Audio","primary_cat":"cs.LG","submitted_at":"2026-06-22T12:28:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FSC uses unsupervised clustering for pseudo-label episodes and a three-stage federated pipeline to achieve 71.6% accuracy in 2-way 2-shot in-context diagnosis of respiratory and cardiac audio conditions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22868","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MSU-Bench: Towards Speaker-Centric Understanding in Conversational Multi-Speaker Scenarios","primary_cat":"eess.AS","submitted_at":"2026-06-22T05:24:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MSU-Bench is a new two-tier benchmark covering speaker grounding to dialogue reasoning in multi-speaker conversations, with Gemini-assisted annotation and human verification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22276","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning from Audio-Dependency Errors: Data Curation Strategies Based on Model Confusion Patterns in Audio Question Answering","primary_cat":"eess.AS","submitted_at":"2026-06-20T23:57:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Using model confusion patterns from counterfactual audio conditions to curate training data improves audio question answering accuracy from 65.90% to 67.27% on the development set.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17417","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Closer Look at Failure Modes in Temporal Understanding of Large Audio-Language Models","primary_cat":"cs.SD","submitted_at":"2026-06-16T01:57:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces a benchmark for mechanistic analysis of temporal failures in LALMs and shows attention scaling at bottleneck layers improves accuracy from 55.9% to 59.1%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.15088","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When the Same Musical Knowledge Forgets Differently: A Clean Probe of Pathway-Dependent Forgetting","primary_cat":"cs.SD","submitted_at":"2026-06-13T03:42:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Acquisition route affects forgetting rates in multimodal models, with text-pathway knowledge forgetting faster than audio-pathway knowledge in music understanding tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11400","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Steering Where to Listen: Instruction-Based Activation Steering Redirects Temporal Attention in Large Audio-Language Models","primary_cat":"cs.SD","submitted_at":"2026-06-09T19:44:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Instruction-based vector steering redirects temporal attention in LALMs to acoustically relevant regions, recovering queried sound event locations with 60.87-68.72% overlap accuracy without training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10147","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Senses to Decisions: The Information Flow of Auditory and Visual Perception in Multimodal LLMs","primary_cat":"cs.AI","submitted_at":"2026-06-08T20:26:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AVLLMs route audio-visual information sequentially in video tasks and via parallel streams for interleaved items, allowing early token discard with little performance loss across models and scales.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09366","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Is Text All You Need? Text as a Universal Information Bottleneck for Speech LLMs","primary_cat":"cs.CL","submitted_at":"2026-06-08T11:38:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"C-Gate represents speech frames as convex combinations of LLM token embeddings to enforce manifold compatibility, delivering up to 48.7% relative WER reduction on LibriSpeech while preserving emotion recognition accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08194","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GlobeAudio: A Multilingual Multicultural Benchmark for Naturalistic Evaluation of Large Audio-Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-06T14:24:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GlobeAudio is a new multilingual multicultural benchmark for naturalistic evaluation of large audio-language models, showing performance gaps especially for open-source models and low-resource languages.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07264","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VISA: A Visual Information Strengthened Audio-Reasoning System for the Interspeech 2026 ARC Agent Track","primary_cat":"eess.AS","submitted_at":"2026-06-05T13:39:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"VISA ranks 2nd in the Interspeech 2026 ARC Agent Track by adding multi-modal feature extraction, consistency-checked model voting, and rubric-aligned routing to large audio language models, reaching 66.23% Rubrics score and 77.40% accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18273","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Continuous Audio Thinking for Large Audio Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-05T11:38:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoAT adds a continuous latent thinking space to LALMs via expert distillation to retain acoustic information, yielding gains on audio reasoning, understanding, music, emotion, and transcription benchmarks across three models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05121","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Audio Interaction Model","primary_cat":"cs.SD","submitted_at":"2026-06-03T17:26:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Audio-Interaction unifies offline and online audio tasks into one streaming model via the SoundFlow framework and a new 2.6M-item streaming corpus, enabling real-time instruction following and proactive responses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03283","ref_index":25,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SpeakerCard-1M: An Evidence-Grounded Corpus for In-the-Wild Speaker Verification","primary_cat":"eess.AS","submitted_at":"2026-06-02T07:49:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SpeakerCard-1M supplies 56.7k evidence-grounded speaker cards, 1.78M captions, and new cross-modal protocols showing audio LMs lag a dual-encoder baseline on attribute-conditioned verification while joint training barely hurts standard EER.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28480","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Audio-Mind: An Auditable Agentic Framework for Audio Understanding","primary_cat":"eess.AS","submitted_at":"2026-05-27T13:39:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Audio-Mind introduces a conditional, auditable agentic framework for audio understanding that preserves frontend judgment and acquires bounded external evidence only when needed, reporting 80.4% on MMAR and 82.8% on MSU-Bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28063","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unified Synthesis of Compositional Speech and Sound from Free-Form Text Prompts","primary_cat":"cs.SD","submitted_at":"2026-05-27T07:15:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PlanAudio introduces a unified autoregressive LLM framework with semantic latent chain-of-thought for generating composite speech and sound audio from free-form text, plus a new benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27741","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Escape the Language Prior: Mitigating Late-Stage Modality Collapse in Audio Reasoning via Modality-Aware Policy Optimization","primary_cat":"cs.CL","submitted_at":"2026-05-26T22:34:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MAPO is a dual-branch RL framework using modality relevance masks from cross-modal differential entropy and auxiliary attention losses to reduce late-stage modality collapse in audio reasoning models and improve benchmark results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27190","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning When to Think While Listening in Large Audio-Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-26T15:43:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A wait-think-answer controller for LALMs is trained via SFT followed by six-reward DAPO, raising row-weighted accuracy from 67.6% to 70.3% and cutting post-endpoint thinking length by 14% on synthetic spoken QA while remaining functional on real recorded audio.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20519","ref_index":50,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Codec-Robust Attacks on Audio LLMs","primary_cat":"cs.SD","submitted_at":"2026-05-19T21:39:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CodecAttack perturbs audio in codec latent space with multi-bitrate EoT to achieve 85.5% average ASR on Opus-compressed Audio LLMs versus under 26% for waveform baselines, with transfer to MP3 and AAC.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19101","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Heterogeneity-Aware Dataset Scheduling for Efficient Audio Large Language Model Training","primary_cat":"cs.SD","submitted_at":"2026-05-18T20:41:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GST uses gradient-based affinity metrics to form dataset groups and applies progressive scheduling, achieving 30-40% faster convergence than uniform mixture training on 14 AudioQA datasets while matching or exceeding performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20266","ref_index":132,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Survey of Large Audio Language Models: Generalization, Trustworthiness, and Outlook","primary_cat":"cs.SD","submitted_at":"2026-05-18T20:21:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A survey of Large Audio Language Models that establishes a taxonomy of trustworthiness vulnerabilities and proposes a Defense-in-Depth roadmap for audio intelligence.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Feb 2025 Baichuan-Audio-Base 7B EN, CN Discrete 887K Hrs audio + 100B tokens✗✓ ✓ Audio Flamingo 2 [129] Mar 2025 Qwen2.5-3B 3B EN Contin. 8M+ audio-caption pairs✗✓ ✓ Kimi-Audio [130] Apr 2025 Qwen2.5-7B 7B EN, CN Hybrid 13M+ Hrs audio✗✓ ✓ VITA-Audio [131] May 2025 Qwen2.5-7B-Instruct 7B EN, CN Discrete 200K Hrs audio✗✓ ✓ Step-Audio 2 [19] Jul 2025 - - Multi. Contin. 680B tokens and 8M Hrs audio✗✓ ✓ Audio Flamingo 3 [132] Jul 2025 Qwen2.5-7B 7B EN Contin. -✗✓ ✓ DeSTA2.5-Audio [133] Jul 2025 Llama3.1-8B-Instruct 8B EN Contin. 7K Hrs audio✗✓ ✓ FireRedChat [134] Sep 2025 Qwen2.5 - EN, CN - -✓ ✓ ✓ Falcon3-Audio [135] Sep 2025 Falcon3-Instruct 1/3/7B EN Contin. -✗✓ ✓ Step-Audio-R1 [127] Nov 2025 Qwen2.5-32B 32B EN, CN Contin. 1.356T tokens✗✓ ✓ Step-Audio-EditX [136]"},{"citing_arxiv_id":"2605.17370","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CBT-Audio: Evaluating Audio Language Models for Patient-Side Distress Intensity Estimation in CBT Session Recordings","primary_cat":"cs.AI","submitted_at":"2026-05-17T10:27:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CBT-Audio dataset shows that adding audio input improves distress intensity estimation over transcripts alone for 8 of 10 audio language models, with clearest gains when verbal content and vocal delivery diverge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17225","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Can Large Audio Language Models Ignore Multilingual Distractors? An Evaluation of Their Selective Auditory Attention Capabilities","primary_cat":"eess.AS","submitted_at":"2026-05-17T02:13:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces the MUSA benchmark and evaluates LALMs showing that strong single-speaker performance fails to ensure robust selective attention under multilingual interference, with errors from source confusion and unresolved attribution after separation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14231","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AudioMosaic: Contrastive Masked Audio Representation Learning","primary_cat":"cs.LG","submitted_at":"2026-05-14T00:56:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AudioMosaic learns general-purpose audio representations through contrastive pre-training with structured spectrogram masking, reaching state-of-the-art results on standard benchmarks and improving audio-language tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13931","ref_index":8,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FSD50K-Solo: Automated Curation of Single-Source Sound Events","primary_cat":"eess.AS","submitted_at":"2026-05-13T16:04:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A curation pipeline combining diffusion-based synthetic mixtures with a discriminative classifier produces and releases FSD50K-Solo, a single-source subset of FSD50K that matches human expert labels on a test set.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12036","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Towards Fine-Grained Multi-Dimensional Speech Understanding: Data Pipeline, Benchmark, and Model","primary_cat":"eess.AS","submitted_at":"2026-05-12T12:19:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A data pipeline, 14-dimension benchmark, and decoupled fine-tuning model are presented to advance fine-grained multi-dimensional speech understanding in LLMs.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"taxonomy:Speaker Demographics,Acoustic-Prosodic Features, Affective and Semantic Reasoning,Acoustic Scene Analysis, Linguistic-Paralinguistic Integration. This hierarchical design en- ables a holistic assessment of how well a model comprehends fine- grained, multi-dimensional real-world speech. From the modeling perspective, current speech understanding LLMs (e.g., Qwen3-Omni [7], Audio Flamingo 3 [8]) face several critical limitations. They often exhibit restricted or entangled model- ing of speech attributes, frequently yielding coarse or single-label out- puts. Furthermore, they suffer from text-conditioned hallucinations, overly depending on linguistic priors while neglecting actual acoustic evidence. To overcome these limitations, we introduceFM-Speech, a"},{"citing_arxiv_id":"2606.11219","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Afrispeech Semantics: Evaluating Audio Semantic Reasoning in Spoken Language Models Across Domains and Accents","primary_cat":"cs.CL","submitted_at":"2026-05-11T20:27:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Audio language models are benchmarked on five semantic and paralinguistic reasoning tasks to reveal limitations in handling spoken audio evidence, accent variation, and domain shifts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04505","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"JASTIN: Aligning LLMs for Zero-Shot Audio and Speech Evaluation via Natural Language Instructions","primary_cat":"eess.AS","submitted_at":"2026-05-06T05:18:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"JASTIN is an instruction-driven audio evaluation system that achieves state-of-the-art correlation with human ratings on speech, sound, music, and out-of-domain tasks without task-specific retraining.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"1) Non-LLM metrics:We utilize AES model [ 13] with its CE, CU, PC, and PQ metrics, UTMOS [ 11], and NISQA [ 10] as our baselines. 2) General-purpose LLMs:We choose several MLLM models as baselines, including Gemini series ( Gemini-3- Pro, Gemini-2.5-Pro, and Gemini-2.5-Flash) [ 15], Qwen series (Qwen3-omni [28], Qwen2-audio [ 52]), and Nvidia's Audio Flamingo3 [53]. 7https://github.com/vivian556123/Jastin JOURNAL OF LATEX CLASS FILES, VOL. 18, NO. 9, SEPTEMBER 2020 6 TABLE I: Comparison between ourJASTINand baseline models on Speech-only Datasets. Model QualiSpeech SpeechEval Noise Dist. Cont. Listen. Nat. Ovrl. Ovrl. Int. Dist. Dyn. Emo. Art. Subj. Pearson Correlation (PCC↑) QualiSpeech∗ [21] 0.6860.518 0."},{"citing_arxiv_id":"2605.03352","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Can Multimodal Large Language Models Understand Pathologic Movements? A Pilot Study on Seizure Semiology","primary_cat":"cs.CV","submitted_at":"2026-05-05T04:14:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MLLMs achieve zero-shot recognition of seizure semiological features better than fine-tuned vision models on most tested features, with signal enhancement and faithful explanations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00371","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GaMMA: Towards Joint Global-Temporal Music Understanding in Large Multimodal Models","primary_cat":"cs.SD","submitted_at":"2026-05-01T03:21:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GaMMA unifies global and temporal music understanding in a single LMM via MoE audio encoders and progressive training, achieving new state-of-the-art accuracies on music benchmarks including 79.1% on MuchoMusic.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23717","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HeadRouter: Dynamic Head-Weight Routing for Task-Adaptive Audio Token Pruning in Large Audio Language Models","primary_cat":"cs.SD","submitted_at":"2026-04-26T14:00:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HeadRouter prunes audio tokens more effectively by dynamically routing based on per-head importance for semantic versus acoustic tasks, exceeding baseline performance at 70% token retention on Qwen2.5-Omni models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23323","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Robust Audio-Text Retrieval via Cross-Modal Attention and Hybrid Loss","primary_cat":"cs.CL","submitted_at":"2026-04-25T14:17:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A cross-modal attention refinement module plus hybrid loss improves robustness of audio-text retrieval on noisy and long-form audio.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22245","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Listening with Time: Precise Temporal Awareness for Long-Form Audio Understanding","primary_cat":"eess.AS","submitted_at":"2026-04-24T05:40:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LAT-Audio introduces a global-to-local reasoning approach with TWA-CoT that outperforms prior models on temporal tasks for audio up to 30 minutes.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"FTAR EN 2min S, D TAG, DAC✓ LAT-Chronicle EN, ZH 30min S, D, M TAG, DAC, TAC✓ Benchmark BLAB EN 120min S TAG✓ FTAR-test EN 2min S, D TAG, DAC✓ LAT-Bench EN, ZH 30min S, D, M TAG, DAC, TAC✓ 2.2 Long-form Audio Understanding Methods Existing LALMs typically encode audio inputs into embedding se- quences via audio encoders, which are then processed by LLMs [8]. Listening with Time: Precise Temporal Awareness for Long-Form Audio Understanding Arxiv, Preprint, 2026 The high audio frame rate results in extremely long input sequences, especially in long-form scenarios. To handle such inputs, exist- ing approaches mainly adopt two strategies. The first extends the context length of LLMs for direct long-context modeling like"},{"citing_arxiv_id":"2604.19300","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HalluAudio: A Comprehensive Benchmark for Hallucination Detection in Large Audio-Language Models","primary_cat":"cs.SD","submitted_at":"2026-04-21T10:05:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"HalluAudio is the first large-scale benchmark spanning speech, environmental sound, and music that uses human-verified QA pairs, adversarial prompts, and mixed-audio tests to measure hallucinations in large audio-language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18360","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Omni-Embed-Audio: Leveraging Multimodal LLMs for Robust Audio-Text Retrieval","primary_cat":"cs.SD","submitted_at":"2026-04-20T14:50:33+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18187","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Audio-DeepThinker: Progressive Reasoning-Aware Reinforcement Learning for High-Quality Chain-of-Thought Emergence in Audio Language Models","primary_cat":"cs.SD","submitted_at":"2026-04-20T12:43:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A hybrid-reward progressive RL curriculum enables high-quality chain-of-thought to emerge in audio language models without prior supervised CoT training, yielding SOTA results on MMAR, MMAU, and MMSU benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"ods such as R1-AQA [11], Omni-R1 [12], and AudioMCQ [ 13] apply GRPO [ 14] with accuracy and format rewards, demon- strating that RL can improve audio QA performance. More re- cent works have begun to incorporate reasoning-related signals. For example, Audio-Thinker [ 15] introduces adaptive rewards to guide when the model should reason, and CESAR [ 16] intro- duces a comprehensive suite to reward structured patterns and causal logic. Despite these advances, two fundamental challenges re- main: (i) how to ensure that the generated reasoning chains are genuinely grounded in the audio content, rather than be- ing formally well-formatted yet semantically decoupled from the actual acoustic evidence; and (ii) how to fundamentally im-"},{"citing_arxiv_id":"2604.16659","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Benign Fine-Tuning Breaks Safety Alignment in Audio LLMs","primary_cat":"cs.CR","submitted_at":"2026-04-17T19:28:07+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Benign fine-tuning on audio data breaks safety alignment in Audio LLMs by raising jailbreak success rates up to 87%, with the dominant risk axis depending on model architecture and embedding proximity to harmful content.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22821","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Audio2Tool: Speak, Call, Act -- A Dataset for Benchmarking Speech Tool Use","primary_cat":"cs.SD","submitted_at":"2026-04-17T16:41:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Audio2Tool is a new benchmark dataset that shows speech models perform well on simple commands but degrade sharply on compositional tasks and realistic acoustic noise.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14920","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Dual-Axis Generative Reward Model Toward Semantic and Turn-taking Robustness in Interactive Spoken Dialogue Models","primary_cat":"cs.AI","submitted_at":"2026-04-16T12:03:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A generative reward model supplies separate semantic and turn-taking scores for spoken dialogues to enable more reliable reinforcement learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13023","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SpotSound: Enhancing Large Audio-Language Models with Fine-Grained Temporal Grounding","primary_cat":"cs.SD","submitted_at":"2026-04-14T17:57:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SpotSound adds a hallucination-suppressing objective and a needle-in-haystack benchmark to audio-language models, reaching state-of-the-art temporal grounding while keeping general task performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12527","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Audio-Cogito: Towards Deep Audio Reasoning in Large Audio Language Models","primary_cat":"eess.AS","submitted_at":"2026-04-14T10:00:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Audio-Cogito is an open-source LALM using Cogito-pipe data curation and self-distillation to achieve leading open-source performance on audio reasoning benchmarks.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Especially in complex acoustic environments, **indicates the corresponding author. they remain susceptible to logical inconsistencies and the mis- interpretation of subtle acoustic cues. We attribute these limi- tations primarily to the scarcity of high-quality audio reasoning datasets. Current public audio datasets, such as AudioSet [27], AudioCaps [28], and Clotho [29], typically provide brief labels or captions that are insufficient to cultivate deep audio reason- ing. While a handful of audio reasoning datasets exist [23, 30], they predominantly focus on shallow reasoning tasks. Further- more, constructing datasets with complex reasoning traces re- lies heavily on closed-source models like Gemini 2."},{"citing_arxiv_id":"2604.09021","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Noise-Aware In-Context Learning for Hallucination Mitigation in ALLMs","primary_cat":"cs.SD","submitted_at":"2026-04-10T06:35:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NAICL reduces hallucination rates in ALLMs from 26.53% to 16.98% via noise priors in context and introduces the Clotho-1K benchmark with four hallucination types.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08003","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rethinking Entropy Allocation in LLM-based ASR: Understanding the Dynamics between Speech Encoders and LLMs","primary_cat":"eess.AS","submitted_at":"2026-04-09T09:07:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A multi-stage training method for LLM-based ASR uses new entropy allocation metrics to achieve competitive benchmark performance with 2.3B parameters while mitigating hallucinations via better encoder-LLM decoupling.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"this end, we monitor changes in the representation distribution on a fixed validation set using CKA scores, as described in Section 3.4. Given the current encoder checkpoint Ecur and a reference checkpoint Eref, we compute the CKA score between their representations and trigger an update once the score falls below a predefined thresholdτ: CKA(Ecur,E ref)< τ,(11) Given two sets of encoder representationsE (a), E(b) ∈R L×de extracted from the same evaluation set, CKA is defined as CKA(E(a), E(b)) = ⟨ ˜K (a), ˜K (b)⟩Fq ⟨ ˜K (a), ˜K (a)⟩F · ⟨ ˜K (b), ˜K (b)⟩F ,(12) where ˜K (a) and ˜K (b) are centered Gram matrices calculated as ˜K (x) =CE (x)E(x)⊤C. The centering matrix is defined as C=I L − 1 L JL, where IL is the identity matrix and JL is the all-ones matrix."},{"citing_arxiv_id":"2604.03074","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Speaker-Reasoner: Scaling Interaction Turns and Reasoning Patterns for Timestamped Speaker-Attributed ASR","primary_cat":"eess.AS","submitted_at":"2026-04-03T14:52:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Speaker-Reasoner is an end-to-end speech LLM that iteratively analyzes audio structure, predicts temporal boundaries, and jointly models speaker identity, gender, timestamps, and transcription using a speaker-aware cache for long audio.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Wang, J. He, Y . Wang, X. Shi, T. He, X. Zhu, Y . Lv, Y . Wang, D. Guo, H. Wang, L. Ma, P. Zhang, X. Zhang, H. Hao, Z. Guo, B. Yang, B. Zhang, Z. Ma, X. Wei, S. Bai, K. Chen, X. Liu, P. Wang, M. Yang, D. Liu, X. Ren, B. Zheng, R. Men, F. Zhou, B. Yu, J. Yang, L. Yu, J. Zhou, and J. Lin, \"Qwen3-omni technical report,\"CoRR, vol. abs/2509.17765, 2025. [17] Z. Peng, J. Yu, Y . Chang, Z. Wang, L. Dong, Y . Hao, Y . Tu, C. Yang, W. Wang, S. Xu, Y . Sun, H. Bao, W. Xu, Y . Zhu, Z. Wang, T. Song, Y . Xia, Z. Chi, S. Huang, L. Wang, C. Ding, S. Wang, X. Chen, and F. Wei, \"VIBEVOICE-ASR technical re- port,\"CoRR, vol. abs/2601.18184, 2026. [18] M. Huo, Y . Shao, and Y . Zhang, \"Tagspeech: End-to-end multi-"},{"citing_arxiv_id":"2602.22029","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MIDI-Informed Singing Accompaniment Generation in a Compositional Song Pipeline","primary_cat":"cs.SD","submitted_at":"2026-02-24T06:43:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MIDI-SAG generates consistent long-form singing accompaniments by feeding symbolic MIDI timing, chords, and structure labels into a compositional pipeline built from pre-trained modules.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.20898","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reducing Prompt Sensitivity in LLM-based Speech Recognition Through Learnable Projection","primary_cat":"eess.AS","submitted_at":"2026-01-28T09:50:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A learnable prompt projector added to LLM-based ASR reduces prompt sensitivity, lowers performance variability, and beats the best fixed prompts on four datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.02954","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The World is Not Mono: Enabling Spatial Understanding in Large Audio-Language Models","primary_cat":"cs.SD","submitted_at":"2026-01-06T11:54:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TWNM framework equips audio-language models with spatial scene analysis via FOA simulation and metadata-grounded training, reaching 70.8% accuracy on a new ASA benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.00626","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Silence Matters: The Impact of Irrelevant Audio on Text Reasoning in Large Audio-Language Models","primary_cat":"cs.SD","submitted_at":"2025-10-01T07:59:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Irrelevant audio including silence reduces accuracy and increases volatility in text reasoning for large audio-language models, with effects worsening at longer durations, higher amplitudes, and higher temperatures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.17765","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Qwen3-Omni Technical Report","primary_cat":"cs.CL","submitted_at":"2025-09-22T13:26:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Qwen3-Omni is a unified multimodal model that achieves open-source SOTA on 32 of 36 audio and audio-visual benchmarks and overall SOTA on 22 without degrading performance on text, image, or video relative to single-modal Qwen counterparts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}