{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:VRWVXDFRSMKWQCWMRQLUZWTSU4","short_pith_number":"pith:VRWVXDFR","schema_version":"1.0","canonical_sha256":"ac6d5b8cb19315680acc8c174cda72a70a58b2c02f72580d9e59778269211cc0","source":{"kind":"arxiv","id":"2606.20532","version":1},"attestation_state":"computed","paper":{"title":"How Do Instructions Shape Speech? Cross-Attention Attribution for Style-Captioned Text-to-Speech","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Akshat Mandloi, Apoorv Singh, Hamees Sayed, Nityanand Mathur, Sameer Khurana, Sudarshan Kamath, Wasim Madha","submitted_at":"2026-06-18T17:47:32Z","abstract_excerpt":"Style-captioned text-to-speech systems use natural language to control voice characteristics, but how individual words influence acoustic output remains unclear. Understanding this is critical for diagnosing failure modes and improving controllability in expressive TTS. We propose cross-attention attribution for speech diffusion models, adapting the DAAM framework to the speech domain for the first time, and apply it to CapSpeech-TTS. Our method extracts per-token heatmaps across 25 layers and 24 ODE steps. We analyze 3,600 (style caption, text transcript) combinations comprising 120 style cap"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.20532","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-18T17:47:32Z","cross_cats_sorted":[],"title_canon_sha256":"ff864c6a3e68704029106e20da9dc65c3323cd33553379551c14c41388e87627","abstract_canon_sha256":"27b500af2e635c6ac43da7e89ea97b3d23a6973eb692d9ac2d90ca331922d6de"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:13:14.673281Z","signature_b64":"q53vlGoSh277jDaBhF0e9ckGEOcTTf0aO/2OLESNIPJr3J1zVQG0hk2ajfSb6ydBIUxFBWNQ9KzdlUuwcL8DAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ac6d5b8cb19315680acc8c174cda72a70a58b2c02f72580d9e59778269211cc0","last_reissued_at":"2026-06-19T16:13:14.672916Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:13:14.672916Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"How Do Instructions Shape Speech? Cross-Attention Attribution for Style-Captioned Text-to-Speech","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Akshat Mandloi, Apoorv Singh, Hamees Sayed, Nityanand Mathur, Sameer Khurana, Sudarshan Kamath, Wasim Madha","submitted_at":"2026-06-18T17:47:32Z","abstract_excerpt":"Style-captioned text-to-speech systems use natural language to control voice characteristics, but how individual words influence acoustic output remains unclear. Understanding this is critical for diagnosing failure modes and improving controllability in expressive TTS. We propose cross-attention attribution for speech diffusion models, adapting the DAAM framework to the speech domain for the first time, and apply it to CapSpeech-TTS. Our method extracts per-token heatmaps across 25 layers and 24 ODE steps. We analyze 3,600 (style caption, text transcript) combinations comprising 120 style cap"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.20532","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.20532/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.20532","created_at":"2026-06-19T16:13:14.672980+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.20532v1","created_at":"2026-06-19T16:13:14.672980+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.20532","created_at":"2026-06-19T16:13:14.672980+00:00"},{"alias_kind":"pith_short_12","alias_value":"VRWVXDFRSMKW","created_at":"2026-06-19T16:13:14.672980+00:00"},{"alias_kind":"pith_short_16","alias_value":"VRWVXDFRSMKWQCWM","created_at":"2026-06-19T16:13:14.672980+00:00"},{"alias_kind":"pith_short_8","alias_value":"VRWVXDFR","created_at":"2026-06-19T16:13:14.672980+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VRWVXDFRSMKWQCWMRQLUZWTSU4","json":"https://pith.science/pith/VRWVXDFRSMKWQCWMRQLUZWTSU4.json","graph_json":"https://pith.science/api/pith-number/VRWVXDFRSMKWQCWMRQLUZWTSU4/graph.json","events_json":"https://pith.science/api/pith-number/VRWVXDFRSMKWQCWMRQLUZWTSU4/events.json","paper":"https://pith.science/paper/VRWVXDFR"},"agent_actions":{"view_html":"https://pith.science/pith/VRWVXDFRSMKWQCWMRQLUZWTSU4","download_json":"https://pith.science/pith/VRWVXDFRSMKWQCWMRQLUZWTSU4.json","view_paper":"https://pith.science/paper/VRWVXDFR","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.20532&json=true","fetch_graph":"https://pith.science/api/pith-number/VRWVXDFRSMKWQCWMRQLUZWTSU4/graph.json","fetch_events":"https://pith.science/api/pith-number/VRWVXDFRSMKWQCWMRQLUZWTSU4/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VRWVXDFRSMKWQCWMRQLUZWTSU4/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VRWVXDFRSMKWQCWMRQLUZWTSU4/action/storage_attestation","attest_author":"https://pith.science/pith/VRWVXDFRSMKWQCWMRQLUZWTSU4/action/author_attestation","sign_citation":"https://pith.science/pith/VRWVXDFRSMKWQCWMRQLUZWTSU4/action/citation_signature","submit_replication":"https://pith.science/pith/VRWVXDFRSMKWQCWMRQLUZWTSU4/action/replication_record"}},"created_at":"2026-06-19T16:13:14.672980+00:00","updated_at":"2026-06-19T16:13:14.672980+00:00"}