{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:IKJF3F3TMNEDXIPMTCLLTXM5AP","short_pith_number":"pith:IKJF3F3T","schema_version":"1.0","canonical_sha256":"42925d977363483ba1ec9896b9dd9d03c96fae180fd3414ba0e2adf6c0db36ce","source":{"kind":"arxiv","id":"2602.12966","version":2},"attestation_state":"computed","paper":{"title":"ProbeLLM: Automating Principled Diagnosis of LLM Failures","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.SE"],"primary_cat":"cs.CL","authors_text":"Kehan Guo, Pin-Yu Chen, Stefan Feuerriegel, Xiangliang Zhang, Xiangqi Wang, Yuchen Ma, Yue Huang, Yuexing Hao, Yu Jiang, Yujun Zhou, Zhengzhe Jiang","submitted_at":"2026-02-13T14:33:13Z","abstract_excerpt":"Understanding how and why large language models (LLMs) fail is becoming a central challenge as models rapidly evolve and static evaluations fall behind. While automated probing has been enabled by dynamic test generation, existing approaches often discover isolated failure cases, lack principled control over exploration, and provide limited insight into the underlying structure of model weaknesses. We propose ProbeLLM, a benchmark-agnostic automated probing framework that elevates weakness discovery from individual failures to structured failure modes. ProbeLLM formulates probing as a hierarch"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2602.12966","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-02-13T14:33:13Z","cross_cats_sorted":["cs.SE"],"title_canon_sha256":"dbb5ff517cf1ef7003913926fcdc5d35eab1a4d981532e6942f2be104be2d5cc","abstract_canon_sha256":"8ff53c4b2a37504f08ccbfc8ede681c394ab7d59b3cfe089fe991ad63202a55d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-10T01:09:56.027441Z","signature_b64":"P2FsRl70g0YOVh/v3ggZwQ+tBiTgcJBNeCG6sTCsthD+ZRzsQQr75iFWdlDmGIVkEo/N9CEEeFu4vfa4uGBJBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"42925d977363483ba1ec9896b9dd9d03c96fae180fd3414ba0e2adf6c0db36ce","last_reissued_at":"2026-06-10T01:09:56.026259Z","signature_status":"signed_v1","first_computed_at":"2026-06-10T01:09:56.026259Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"ProbeLLM: Automating Principled Diagnosis of LLM Failures","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.SE"],"primary_cat":"cs.CL","authors_text":"Kehan Guo, Pin-Yu Chen, Stefan Feuerriegel, Xiangliang Zhang, Xiangqi Wang, Yuchen Ma, Yue Huang, Yuexing Hao, Yu Jiang, Yujun Zhou, Zhengzhe Jiang","submitted_at":"2026-02-13T14:33:13Z","abstract_excerpt":"Understanding how and why large language models (LLMs) fail is becoming a central challenge as models rapidly evolve and static evaluations fall behind. While automated probing has been enabled by dynamic test generation, existing approaches often discover isolated failure cases, lack principled control over exploration, and provide limited insight into the underlying structure of model weaknesses. We propose ProbeLLM, a benchmark-agnostic automated probing framework that elevates weakness discovery from individual failures to structured failure modes. ProbeLLM formulates probing as a hierarch"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.12966","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.12966/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2602.12966","created_at":"2026-06-10T01:09:56.026442+00:00"},{"alias_kind":"arxiv_version","alias_value":"2602.12966v2","created_at":"2026-06-10T01:09:56.026442+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.12966","created_at":"2026-06-10T01:09:56.026442+00:00"},{"alias_kind":"pith_short_12","alias_value":"IKJF3F3TMNED","created_at":"2026-06-10T01:09:56.026442+00:00"},{"alias_kind":"pith_short_16","alias_value":"IKJF3F3TMNEDXIPM","created_at":"2026-06-10T01:09:56.026442+00:00"},{"alias_kind":"pith_short_8","alias_value":"IKJF3F3T","created_at":"2026-06-10T01:09:56.026442+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2605.05678","citing_title":"Chain of Risk: Safety Failures in Large Reasoning Models and Mitigation via Adaptive Multi-Principle Steering","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07655","citing_title":"Guardian-as-an-Advisor: Advancing Next-Generation Guardian Models for Trustworthy LLMs","ref_index":32,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/IKJF3F3TMNEDXIPMTCLLTXM5AP","json":"https://pith.science/pith/IKJF3F3TMNEDXIPMTCLLTXM5AP.json","graph_json":"https://pith.science/api/pith-number/IKJF3F3TMNEDXIPMTCLLTXM5AP/graph.json","events_json":"https://pith.science/api/pith-number/IKJF3F3TMNEDXIPMTCLLTXM5AP/events.json","paper":"https://pith.science/paper/IKJF3F3T"},"agent_actions":{"view_html":"https://pith.science/pith/IKJF3F3TMNEDXIPMTCLLTXM5AP","download_json":"https://pith.science/pith/IKJF3F3TMNEDXIPMTCLLTXM5AP.json","view_paper":"https://pith.science/paper/IKJF3F3T","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2602.12966&json=true","fetch_graph":"https://pith.science/api/pith-number/IKJF3F3TMNEDXIPMTCLLTXM5AP/graph.json","fetch_events":"https://pith.science/api/pith-number/IKJF3F3TMNEDXIPMTCLLTXM5AP/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/IKJF3F3TMNEDXIPMTCLLTXM5AP/action/timestamp_anchor","attest_storage":"https://pith.science/pith/IKJF3F3TMNEDXIPMTCLLTXM5AP/action/storage_attestation","attest_author":"https://pith.science/pith/IKJF3F3TMNEDXIPMTCLLTXM5AP/action/author_attestation","sign_citation":"https://pith.science/pith/IKJF3F3TMNEDXIPMTCLLTXM5AP/action/citation_signature","submit_replication":"https://pith.science/pith/IKJF3F3TMNEDXIPMTCLLTXM5AP/action/replication_record"}},"created_at":"2026-06-10T01:09:56.026442+00:00","updated_at":"2026-06-10T01:09:56.026442+00:00"}