{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:3KCLQBMYJ6AMFGY4J2GJ676ZJD","short_pith_number":"pith:3KCLQBMY","schema_version":"1.0","canonical_sha256":"da84b805984f80c29b1c4e8c9f7fd948fee2f905392e586c9d29c1d5903bcfb4","source":{"kind":"arxiv","id":"2606.07929","version":1},"attestation_state":"computed","paper":{"title":"Stress-testing medical large language models reveals latent safety pathology beyond benchmark accuracy","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Linghua Yu, Xiaojun Wu, Yuan Shen","submitted_at":"2026-06-06T01:39:14Z","abstract_excerpt":"Large language models (LLMs) are entering clinical practice based on benchmark accuracy that may fail to detect safety-relevant failure modes. Here we present AI-MASLD, a stress-audit framework that adapts the logic of metabolic stress testing from hepatology to the evaluation of clinical LLMs. Using 240 clinical cases across six narrative perturbation probes, we subjected seven models to double-stress testing and quantified performance through three indices: metabolic index (MI), perturbation flip rate (PFR), and counterfactual fairness index (CFI). Under clean baseline conditions, all models"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.07929","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-06-06T01:39:14Z","cross_cats_sorted":[],"title_canon_sha256":"fb6b477441842e01ee1cdc191abfa7f1073329757ae20414c0fdfc2c992a5144","abstract_canon_sha256":"c80d69c98e980fe13798e6a04120911c4f97802557b6b7780b26295608c57de2"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T01:04:55.697137Z","signature_b64":"S5i5gbBfDQ0lbItqIg60jhF1geRtJzqLcU89BYePMuB7EHPbQ7ExSByVNSbol0q0CPj+C1ddz2nATQMf59rNCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"da84b805984f80c29b1c4e8c9f7fd948fee2f905392e586c9d29c1d5903bcfb4","last_reissued_at":"2026-06-09T01:04:55.696739Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T01:04:55.696739Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Stress-testing medical large language models reveals latent safety pathology beyond benchmark accuracy","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Linghua Yu, Xiaojun Wu, Yuan Shen","submitted_at":"2026-06-06T01:39:14Z","abstract_excerpt":"Large language models (LLMs) are entering clinical practice based on benchmark accuracy that may fail to detect safety-relevant failure modes. Here we present AI-MASLD, a stress-audit framework that adapts the logic of metabolic stress testing from hepatology to the evaluation of clinical LLMs. Using 240 clinical cases across six narrative perturbation probes, we subjected seven models to double-stress testing and quantified performance through three indices: metabolic index (MI), perturbation flip rate (PFR), and counterfactual fairness index (CFI). Under clean baseline conditions, all models"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.07929","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.07929/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.07929","created_at":"2026-06-09T01:04:55.696804+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.07929v1","created_at":"2026-06-09T01:04:55.696804+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.07929","created_at":"2026-06-09T01:04:55.696804+00:00"},{"alias_kind":"pith_short_12","alias_value":"3KCLQBMYJ6AM","created_at":"2026-06-09T01:04:55.696804+00:00"},{"alias_kind":"pith_short_16","alias_value":"3KCLQBMYJ6AMFGY4","created_at":"2026-06-09T01:04:55.696804+00:00"},{"alias_kind":"pith_short_8","alias_value":"3KCLQBMY","created_at":"2026-06-09T01:04:55.696804+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/3KCLQBMYJ6AMFGY4J2GJ676ZJD","json":"https://pith.science/pith/3KCLQBMYJ6AMFGY4J2GJ676ZJD.json","graph_json":"https://pith.science/api/pith-number/3KCLQBMYJ6AMFGY4J2GJ676ZJD/graph.json","events_json":"https://pith.science/api/pith-number/3KCLQBMYJ6AMFGY4J2GJ676ZJD/events.json","paper":"https://pith.science/paper/3KCLQBMY"},"agent_actions":{"view_html":"https://pith.science/pith/3KCLQBMYJ6AMFGY4J2GJ676ZJD","download_json":"https://pith.science/pith/3KCLQBMYJ6AMFGY4J2GJ676ZJD.json","view_paper":"https://pith.science/paper/3KCLQBMY","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.07929&json=true","fetch_graph":"https://pith.science/api/pith-number/3KCLQBMYJ6AMFGY4J2GJ676ZJD/graph.json","fetch_events":"https://pith.science/api/pith-number/3KCLQBMYJ6AMFGY4J2GJ676ZJD/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/3KCLQBMYJ6AMFGY4J2GJ676ZJD/action/timestamp_anchor","attest_storage":"https://pith.science/pith/3KCLQBMYJ6AMFGY4J2GJ676ZJD/action/storage_attestation","attest_author":"https://pith.science/pith/3KCLQBMYJ6AMFGY4J2GJ676ZJD/action/author_attestation","sign_citation":"https://pith.science/pith/3KCLQBMYJ6AMFGY4J2GJ676ZJD/action/citation_signature","submit_replication":"https://pith.science/pith/3KCLQBMYJ6AMFGY4J2GJ676ZJD/action/replication_record"}},"created_at":"2026-06-09T01:04:55.696804+00:00","updated_at":"2026-06-09T01:04:55.696804+00:00"}