{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:SUMCLBSWVGNSWMC2LSX6MR36RB","short_pith_number":"pith:SUMCLBSW","schema_version":"1.0","canonical_sha256":"9518258656a99b2b305a5cafe6477e886d9d3b23391138fa7346240370a9e392","source":{"kind":"arxiv","id":"2606.25391","version":1},"attestation_state":"computed","paper":{"title":"From Sounds to Scenes: A Benchmark for Evaluating Context-Aware Auditory Scene Understanding in Large Audio Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.MM"],"primary_cat":"cs.SD","authors_text":"Amir M. Rahmani, Henry Peng Zou, Hoang H Nguyen, Honghui Xu, Kazi Shaharair Sharif, Pengfei Zhang, Pinxin Liu, Wenjun Huang, Yutong Song","submitted_at":"2026-06-24T04:42:57Z","abstract_excerpt":"Recent Large Audio Language Models (LALMs) have achieved remarkable progress in audio perceptual tasks across individual acoustic layers, including speech, sound, and music. However, existing benchmarks predominantly evaluate these layers in isolation, overlooking the complex contextual relationships that arise when multiple acoustic sources co-occur in real-world auditory scenes. Real-world auditory interpretation requires Context-Aware Auditory Scene Understanding (CASU): the ability to comprehend the holistic scene by integrating sound layers. To evaluate this capability, we introduce the C"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.25391","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SD","submitted_at":"2026-06-24T04:42:57Z","cross_cats_sorted":["cs.AI","cs.MM"],"title_canon_sha256":"7f3968acd718c472d10d43b2ba94be847c14fc5fdd5234849b08a803a04285b4","abstract_canon_sha256":"c73921d72bb9af4117c86414dfaa5ed4ef19992dbf2e30662067f71288145649"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-25T01:18:04.099053Z","signature_b64":"87ly0LOvMBw6VIU+TWItGDvMesm5rYIiS+0zzRnK7eSlj5nreHeVX3diWP7/PqyzcM++H8+9APYV0U6P4OerBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9518258656a99b2b305a5cafe6477e886d9d3b23391138fa7346240370a9e392","last_reissued_at":"2026-06-25T01:18:04.098592Z","signature_status":"signed_v1","first_computed_at":"2026-06-25T01:18:04.098592Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"From Sounds to Scenes: A Benchmark for Evaluating Context-Aware Auditory Scene Understanding in Large Audio Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.MM"],"primary_cat":"cs.SD","authors_text":"Amir M. Rahmani, Henry Peng Zou, Hoang H Nguyen, Honghui Xu, Kazi Shaharair Sharif, Pengfei Zhang, Pinxin Liu, Wenjun Huang, Yutong Song","submitted_at":"2026-06-24T04:42:57Z","abstract_excerpt":"Recent Large Audio Language Models (LALMs) have achieved remarkable progress in audio perceptual tasks across individual acoustic layers, including speech, sound, and music. However, existing benchmarks predominantly evaluate these layers in isolation, overlooking the complex contextual relationships that arise when multiple acoustic sources co-occur in real-world auditory scenes. Real-world auditory interpretation requires Context-Aware Auditory Scene Understanding (CASU): the ability to comprehend the holistic scene by integrating sound layers. To evaluate this capability, we introduce the C"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.25391","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.25391/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.25391","created_at":"2026-06-25T01:18:04.098665+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.25391v1","created_at":"2026-06-25T01:18:04.098665+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.25391","created_at":"2026-06-25T01:18:04.098665+00:00"},{"alias_kind":"pith_short_12","alias_value":"SUMCLBSWVGNS","created_at":"2026-06-25T01:18:04.098665+00:00"},{"alias_kind":"pith_short_16","alias_value":"SUMCLBSWVGNSWMC2","created_at":"2026-06-25T01:18:04.098665+00:00"},{"alias_kind":"pith_short_8","alias_value":"SUMCLBSW","created_at":"2026-06-25T01:18:04.098665+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/SUMCLBSWVGNSWMC2LSX6MR36RB","json":"https://pith.science/pith/SUMCLBSWVGNSWMC2LSX6MR36RB.json","graph_json":"https://pith.science/api/pith-number/SUMCLBSWVGNSWMC2LSX6MR36RB/graph.json","events_json":"https://pith.science/api/pith-number/SUMCLBSWVGNSWMC2LSX6MR36RB/events.json","paper":"https://pith.science/paper/SUMCLBSW"},"agent_actions":{"view_html":"https://pith.science/pith/SUMCLBSWVGNSWMC2LSX6MR36RB","download_json":"https://pith.science/pith/SUMCLBSWVGNSWMC2LSX6MR36RB.json","view_paper":"https://pith.science/paper/SUMCLBSW","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.25391&json=true","fetch_graph":"https://pith.science/api/pith-number/SUMCLBSWVGNSWMC2LSX6MR36RB/graph.json","fetch_events":"https://pith.science/api/pith-number/SUMCLBSWVGNSWMC2LSX6MR36RB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/SUMCLBSWVGNSWMC2LSX6MR36RB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/SUMCLBSWVGNSWMC2LSX6MR36RB/action/storage_attestation","attest_author":"https://pith.science/pith/SUMCLBSWVGNSWMC2LSX6MR36RB/action/author_attestation","sign_citation":"https://pith.science/pith/SUMCLBSWVGNSWMC2LSX6MR36RB/action/citation_signature","submit_replication":"https://pith.science/pith/SUMCLBSWVGNSWMC2LSX6MR36RB/action/replication_record"}},"created_at":"2026-06-25T01:18:04.098665+00:00","updated_at":"2026-06-25T01:18:04.098665+00:00"}