{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:WZEEDQYE4C6LG6QS6EXQM2YE3J","short_pith_number":"pith:WZEEDQYE","schema_version":"1.0","canonical_sha256":"b64841c304e0bcb37a12f12f066b04da7507a10fb1f3936c0a29d7d474a365f8","source":{"kind":"arxiv","id":"2509.20641","version":2},"attestation_state":"computed","paper":{"title":"Investigating Modality Contribution in Audio LLMs for Music","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.SD"],"primary_cat":"cs.LG","authors_text":"Giovana Morais, Magdalena Fuentes","submitted_at":"2025-09-25T00:56:35Z","abstract_excerpt":"Audio Large Language Models (Audio LLMs) enable human-like conversation about music, yet it is unclear if they are truly listening to the audio or just using textual reasoning, as recent benchmarks suggest. This paper investigates this issue by quantifying the contribution of each modality to a model's output. We adapt the MM-SHAP framework, a performance-agnostic score based on Shapley values that quantifies the relative contribution of each modality to a model's prediction. We evaluate two models on the MuChoMusic benchmark and find that the model with higher accuracy relies more on text to "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2509.20641","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-09-25T00:56:35Z","cross_cats_sorted":["cs.SD"],"title_canon_sha256":"cc8e22324138315582279f69485da4abfd80d00081ed380ea7d4f148c29bf17a","abstract_canon_sha256":"77bf7d765be5c8a63fff978b7baef8e07541ee5854d5b1189e23b7fa153e4959"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:01:35.526859Z","signature_b64":"LoUzrrFaMXrWwX2ZdETFf8zlY7At4QKSbF0V/3mD9HRbL8JBKbkaLXzWwEjUZX9B0GmGXv6YqauOUwqo2uOdCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b64841c304e0bcb37a12f12f066b04da7507a10fb1f3936c0a29d7d474a365f8","last_reissued_at":"2026-05-20T00:01:35.526024Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:01:35.526024Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Investigating Modality Contribution in Audio LLMs for Music","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.SD"],"primary_cat":"cs.LG","authors_text":"Giovana Morais, Magdalena Fuentes","submitted_at":"2025-09-25T00:56:35Z","abstract_excerpt":"Audio Large Language Models (Audio LLMs) enable human-like conversation about music, yet it is unclear if they are truly listening to the audio or just using textual reasoning, as recent benchmarks suggest. This paper investigates this issue by quantifying the contribution of each modality to a model's output. We adapt the MM-SHAP framework, a performance-agnostic score based on Shapley values that quantifies the relative contribution of each modality to a model's prediction. We evaluate two models on the MuChoMusic benchmark and find that the model with higher accuracy relies more on text to "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2509.20641","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2509.20641/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2509.20641","created_at":"2026-05-20T00:01:35.526169+00:00"},{"alias_kind":"arxiv_version","alias_value":"2509.20641v2","created_at":"2026-05-20T00:01:35.526169+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.20641","created_at":"2026-05-20T00:01:35.526169+00:00"},{"alias_kind":"pith_short_12","alias_value":"WZEEDQYE4C6L","created_at":"2026-05-20T00:01:35.526169+00:00"},{"alias_kind":"pith_short_16","alias_value":"WZEEDQYE4C6LG6QS","created_at":"2026-05-20T00:01:35.526169+00:00"},{"alias_kind":"pith_short_8","alias_value":"WZEEDQYE","created_at":"2026-05-20T00:01:35.526169+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2509.20641","citing_title":"Investigating Modality Contribution in Audio LLMs for Music","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20266","citing_title":"A Survey of Large Audio Language Models: Generalization, Trustworthiness, and Outlook","ref_index":142,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WZEEDQYE4C6LG6QS6EXQM2YE3J","json":"https://pith.science/pith/WZEEDQYE4C6LG6QS6EXQM2YE3J.json","graph_json":"https://pith.science/api/pith-number/WZEEDQYE4C6LG6QS6EXQM2YE3J/graph.json","events_json":"https://pith.science/api/pith-number/WZEEDQYE4C6LG6QS6EXQM2YE3J/events.json","paper":"https://pith.science/paper/WZEEDQYE"},"agent_actions":{"view_html":"https://pith.science/pith/WZEEDQYE4C6LG6QS6EXQM2YE3J","download_json":"https://pith.science/pith/WZEEDQYE4C6LG6QS6EXQM2YE3J.json","view_paper":"https://pith.science/paper/WZEEDQYE","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2509.20641&json=true","fetch_graph":"https://pith.science/api/pith-number/WZEEDQYE4C6LG6QS6EXQM2YE3J/graph.json","fetch_events":"https://pith.science/api/pith-number/WZEEDQYE4C6LG6QS6EXQM2YE3J/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WZEEDQYE4C6LG6QS6EXQM2YE3J/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WZEEDQYE4C6LG6QS6EXQM2YE3J/action/storage_attestation","attest_author":"https://pith.science/pith/WZEEDQYE4C6LG6QS6EXQM2YE3J/action/author_attestation","sign_citation":"https://pith.science/pith/WZEEDQYE4C6LG6QS6EXQM2YE3J/action/citation_signature","submit_replication":"https://pith.science/pith/WZEEDQYE4C6LG6QS6EXQM2YE3J/action/replication_record"}},"created_at":"2026-05-20T00:01:35.526169+00:00","updated_at":"2026-05-20T00:01:35.526169+00:00"}