{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:4FKL4HFLF5UH5IQ7CKH7VY4YO6","short_pith_number":"pith:4FKL4HFL","schema_version":"1.0","canonical_sha256":"e154be1cab2f687ea21f128ffae39877942d0f11a95a5a40b8feaef87d7959af","source":{"kind":"arxiv","id":"2606.24082","version":1},"attestation_state":"computed","paper":{"title":"Comparative Reasoning: Making an Audio Language Model Better at Comparing Emotions","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.SD"],"primary_cat":"eess.AS","authors_text":"Abinay Reddy Naini, Carlos Busso, Chao-Han Huck Yang, Jaeyeon Kim, Shinji Watanabe","submitted_at":"2026-06-23T02:55:36Z","abstract_excerpt":"Large audio-language models (LALMs) can reason about audio, yet it remains unclear whether they can perform comparative judgments between two speech signals along emotional, environmental, linguistic, prosodic, and interpersonal dimensions. We study this question in the context of speech emotion recognition (SER), where the model determines which utterance exhibits higher arousal, valence, or dominance. We introduce a reasoning-guided ordinal SER framework that conditions an LALM on paired speech inputs. The model is trained using reasoning traces generated from both semantic audio description"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.24082","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"eess.AS","submitted_at":"2026-06-23T02:55:36Z","cross_cats_sorted":["cs.SD"],"title_canon_sha256":"0bc83c2e9d69dff9163f4985f6c2dc322193ae9a35c36e30a1e75db5ef243dd3","abstract_canon_sha256":"c86269c4669e5c013413241d6217fac6c0b118e83220a2a0beec2f248c9954e5"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-24T01:14:39.839508Z","signature_b64":"65gEc9vvfrparLE3RE25QRvgPIjao9bBhYpr7UCMVSP5i+dJiN14q3mDU3r+8TlkAc+/+Sq5nH3JnJAlkydsAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e154be1cab2f687ea21f128ffae39877942d0f11a95a5a40b8feaef87d7959af","last_reissued_at":"2026-06-24T01:14:39.839134Z","signature_status":"signed_v1","first_computed_at":"2026-06-24T01:14:39.839134Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Comparative Reasoning: Making an Audio Language Model Better at Comparing Emotions","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.SD"],"primary_cat":"eess.AS","authors_text":"Abinay Reddy Naini, Carlos Busso, Chao-Han Huck Yang, Jaeyeon Kim, Shinji Watanabe","submitted_at":"2026-06-23T02:55:36Z","abstract_excerpt":"Large audio-language models (LALMs) can reason about audio, yet it remains unclear whether they can perform comparative judgments between two speech signals along emotional, environmental, linguistic, prosodic, and interpersonal dimensions. We study this question in the context of speech emotion recognition (SER), where the model determines which utterance exhibits higher arousal, valence, or dominance. We introduce a reasoning-guided ordinal SER framework that conditions an LALM on paired speech inputs. The model is trained using reasoning traces generated from both semantic audio description"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.24082","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.24082/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.24082","created_at":"2026-06-24T01:14:39.839192+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.24082v1","created_at":"2026-06-24T01:14:39.839192+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.24082","created_at":"2026-06-24T01:14:39.839192+00:00"},{"alias_kind":"pith_short_12","alias_value":"4FKL4HFLF5UH","created_at":"2026-06-24T01:14:39.839192+00:00"},{"alias_kind":"pith_short_16","alias_value":"4FKL4HFLF5UH5IQ7","created_at":"2026-06-24T01:14:39.839192+00:00"},{"alias_kind":"pith_short_8","alias_value":"4FKL4HFL","created_at":"2026-06-24T01:14:39.839192+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2606.24082","citing_title":"Comparative Reasoning: Making an Audio Language Model Better at Comparing Emotions","ref_index":1,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4FKL4HFLF5UH5IQ7CKH7VY4YO6","json":"https://pith.science/pith/4FKL4HFLF5UH5IQ7CKH7VY4YO6.json","graph_json":"https://pith.science/api/pith-number/4FKL4HFLF5UH5IQ7CKH7VY4YO6/graph.json","events_json":"https://pith.science/api/pith-number/4FKL4HFLF5UH5IQ7CKH7VY4YO6/events.json","paper":"https://pith.science/paper/4FKL4HFL"},"agent_actions":{"view_html":"https://pith.science/pith/4FKL4HFLF5UH5IQ7CKH7VY4YO6","download_json":"https://pith.science/pith/4FKL4HFLF5UH5IQ7CKH7VY4YO6.json","view_paper":"https://pith.science/paper/4FKL4HFL","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.24082&json=true","fetch_graph":"https://pith.science/api/pith-number/4FKL4HFLF5UH5IQ7CKH7VY4YO6/graph.json","fetch_events":"https://pith.science/api/pith-number/4FKL4HFLF5UH5IQ7CKH7VY4YO6/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4FKL4HFLF5UH5IQ7CKH7VY4YO6/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4FKL4HFLF5UH5IQ7CKH7VY4YO6/action/storage_attestation","attest_author":"https://pith.science/pith/4FKL4HFLF5UH5IQ7CKH7VY4YO6/action/author_attestation","sign_citation":"https://pith.science/pith/4FKL4HFLF5UH5IQ7CKH7VY4YO6/action/citation_signature","submit_replication":"https://pith.science/pith/4FKL4HFLF5UH5IQ7CKH7VY4YO6/action/replication_record"}},"created_at":"2026-06-24T01:14:39.839192+00:00","updated_at":"2026-06-24T01:14:39.839192+00:00"}