{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:CQ2TMODXJUVTA7Q7VEIJWMHAVZ","short_pith_number":"pith:CQ2TMODX","schema_version":"1.0","canonical_sha256":"14353638774d2b307e1fa9109b30e0ae67a684cd88cfde236444760487a60c28","source":{"kind":"arxiv","id":"2606.07309","version":1},"attestation_state":"computed","paper":{"title":"Acoustic Cue Alignment in Audio Language Models for Speech Emotion Recognition","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.SD","authors_text":"Andreas Triantafyllopoulos, Bj\\\"orn W. Schuller, Iosif Tsangko","submitted_at":"2026-06-05T14:26:06Z","abstract_excerpt":"Instruction-following audio language models (ALMs) can be augmented with explicit acoustic cues, yet it remains unclear whether such cues are used in a grounded way when the raw audio is already available. We study this question in speech emotion recognition (SER) by deriving six interpretable acoustic concept tokens from the standardised eGeMAPS paralinguistic feature set. These tokens summarise energy, pitch, dynamics, brightness, formants, and voice quality, and are appended to the textual prompt while the audio input is kept unchanged. Across the widely used FAU-Aibo and IEMOCAP benchmarks"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.07309","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2026-06-05T14:26:06Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"ce32138531c1497d1fd730c37a48b50f63f080756de46fb96b40a11a767a2dab","abstract_canon_sha256":"bb53d110f9fa115e2fcfa15d780a3d435ed63cd8cb86975786c00f8b78a247f8"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-08T01:05:18.943290Z","signature_b64":"pJV78fuv0mLYODGHuIoJpoPpm4rRuB7kw8fxWRrqxMk/uB/iVdMZILJctGTF6g9HYcPAnLAn47jk4Gr614EkAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"14353638774d2b307e1fa9109b30e0ae67a684cd88cfde236444760487a60c28","last_reissued_at":"2026-06-08T01:05:18.942429Z","signature_status":"signed_v1","first_computed_at":"2026-06-08T01:05:18.942429Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Acoustic Cue Alignment in Audio Language Models for Speech Emotion Recognition","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.SD","authors_text":"Andreas Triantafyllopoulos, Bj\\\"orn W. Schuller, Iosif Tsangko","submitted_at":"2026-06-05T14:26:06Z","abstract_excerpt":"Instruction-following audio language models (ALMs) can be augmented with explicit acoustic cues, yet it remains unclear whether such cues are used in a grounded way when the raw audio is already available. We study this question in speech emotion recognition (SER) by deriving six interpretable acoustic concept tokens from the standardised eGeMAPS paralinguistic feature set. These tokens summarise energy, pitch, dynamics, brightness, formants, and voice quality, and are appended to the textual prompt while the audio input is kept unchanged. Across the widely used FAU-Aibo and IEMOCAP benchmarks"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.07309","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.07309/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.07309","created_at":"2026-06-08T01:05:18.942578+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.07309v1","created_at":"2026-06-08T01:05:18.942578+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.07309","created_at":"2026-06-08T01:05:18.942578+00:00"},{"alias_kind":"pith_short_12","alias_value":"CQ2TMODXJUVT","created_at":"2026-06-08T01:05:18.942578+00:00"},{"alias_kind":"pith_short_16","alias_value":"CQ2TMODXJUVTA7Q7","created_at":"2026-06-08T01:05:18.942578+00:00"},{"alias_kind":"pith_short_8","alias_value":"CQ2TMODX","created_at":"2026-06-08T01:05:18.942578+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/CQ2TMODXJUVTA7Q7VEIJWMHAVZ","json":"https://pith.science/pith/CQ2TMODXJUVTA7Q7VEIJWMHAVZ.json","graph_json":"https://pith.science/api/pith-number/CQ2TMODXJUVTA7Q7VEIJWMHAVZ/graph.json","events_json":"https://pith.science/api/pith-number/CQ2TMODXJUVTA7Q7VEIJWMHAVZ/events.json","paper":"https://pith.science/paper/CQ2TMODX"},"agent_actions":{"view_html":"https://pith.science/pith/CQ2TMODXJUVTA7Q7VEIJWMHAVZ","download_json":"https://pith.science/pith/CQ2TMODXJUVTA7Q7VEIJWMHAVZ.json","view_paper":"https://pith.science/paper/CQ2TMODX","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.07309&json=true","fetch_graph":"https://pith.science/api/pith-number/CQ2TMODXJUVTA7Q7VEIJWMHAVZ/graph.json","fetch_events":"https://pith.science/api/pith-number/CQ2TMODXJUVTA7Q7VEIJWMHAVZ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/CQ2TMODXJUVTA7Q7VEIJWMHAVZ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/CQ2TMODXJUVTA7Q7VEIJWMHAVZ/action/storage_attestation","attest_author":"https://pith.science/pith/CQ2TMODXJUVTA7Q7VEIJWMHAVZ/action/author_attestation","sign_citation":"https://pith.science/pith/CQ2TMODXJUVTA7Q7VEIJWMHAVZ/action/citation_signature","submit_replication":"https://pith.science/pith/CQ2TMODXJUVTA7Q7VEIJWMHAVZ/action/replication_record"}},"created_at":"2026-06-08T01:05:18.942578+00:00","updated_at":"2026-06-08T01:05:18.942578+00:00"}