{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:7QRGQQQJ3TSQ3V2JK3CI72UM4K","short_pith_number":"pith:7QRGQQQJ","canonical_record":{"source":{"id":"2602.02320","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-02-02T16:49:19Z","cross_cats_sorted":["cs.AI","q-bio.BM"],"title_canon_sha256":"50607597811c6c08878a9e94dbb41951ef704c5a6aded7042ce0e79d12aba4f2","abstract_canon_sha256":"205f61779229925a96580ce3e3b266faa995dc72d5dd1a3453d755511b3b74ae"},"schema_version":"1.0"},"canonical_sha256":"fc22684209dce50dd74956c48fea8ce2af451c5c6476a80c3c7d183b66474880","source":{"kind":"arxiv","id":"2602.02320","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.02320","created_at":"2026-06-30T02:18:06Z"},{"alias_kind":"arxiv_version","alias_value":"2602.02320v4","created_at":"2026-06-30T02:18:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.02320","created_at":"2026-06-30T02:18:06Z"},{"alias_kind":"pith_short_12","alias_value":"7QRGQQQJ3TSQ","created_at":"2026-06-30T02:18:06Z"},{"alias_kind":"pith_short_16","alias_value":"7QRGQQQJ3TSQ3V2J","created_at":"2026-06-30T02:18:06Z"},{"alias_kind":"pith_short_8","alias_value":"7QRGQQQJ","created_at":"2026-06-30T02:18:06Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:7QRGQQQJ3TSQ3V2JK3CI72UM4K","target":"record","payload":{"canonical_record":{"source":{"id":"2602.02320","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-02-02T16:49:19Z","cross_cats_sorted":["cs.AI","q-bio.BM"],"title_canon_sha256":"50607597811c6c08878a9e94dbb41951ef704c5a6aded7042ce0e79d12aba4f2","abstract_canon_sha256":"205f61779229925a96580ce3e3b266faa995dc72d5dd1a3453d755511b3b74ae"},"schema_version":"1.0"},"canonical_sha256":"fc22684209dce50dd74956c48fea8ce2af451c5c6476a80c3c7d183b66474880","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T02:18:06.651957Z","signature_b64":"zOt3j9OajnmMM3l73Vt3EzQaXNayXfx0gtHVSCukGMXb+kH4SpUcRAma3KXtNpQEew36RPtCcn3UEPqbcEejBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fc22684209dce50dd74956c48fea8ce2af451c5c6476a80c3c7d183b66474880","last_reissued_at":"2026-06-30T02:18:06.651175Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T02:18:06.651175Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.02320","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-30T02:18:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"JxElez6J/H/DlwVhiIoQtPo32lXpVJag6r8eWdPx45CwOTIRP9izYhN10A3vsUHG8A4xUHm3ETR4bExYb7KnBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-02T17:20:18.613829Z"},"content_sha256":"66f6e6ce0c42b60e9ac5a2a25df372fe4494dd1217c5807e49c48c1171281fcf","schema_version":"1.0","event_id":"sha256:66f6e6ce0c42b60e9ac5a2a25df372fe4494dd1217c5807e49c48c1171281fcf"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:7QRGQQQJ3TSQ3V2JK3CI72UM4K","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"A Large-Scale Dataset for Molecular Structure-Language Description via a Rule-Regularized Method","license":"http://creativecommons.org/licenses/by/4.0/","headline":"An automated framework parses IUPAC names into structural metadata to guide LLMs in creating a 163000-pair molecule-description dataset at 98.6 percent precision.","cross_cats":["cs.AI","q-bio.BM"],"primary_cat":"cs.CL","authors_text":"Feiyang Cai, Feng Luo, Gang Li, Guijuan He, Jingjing Wang, Joshua Luo, Ling Liu, Srikanth Pilla, Tianyu Zhu, Yi Hu","submitted_at":"2026-02-02T16:49:19Z","abstract_excerpt":"Molecular function is largely determined by structure. Accurately aligning molecular structure with natural language is therefore essential for enabling large language models (LLMs) to reason about downstream chemical tasks. However, the substantial cost of human annotation makes it infeasible to construct large-scale, high-quality datasets of structure-grounded descriptions. In this work, we propose a fully automated annotation framework for generating precise molecular descriptions that preserve complete structural details at scale. Our approach builds upon and extends a rule-based chemical "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Using this framework, we curate a large-scale dataset of approximately 163k molecule--description pairs. A rigorous validation protocol combining LLM-based and expert human evaluation on a subset of 2,000 molecules demonstrates a high description precision of 98.6%.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The extended rule-based parser correctly extracts complete structural details from every IUPAC name into XML metadata, and the subsequent LLM generations faithfully reflect those details without introducing structural errors or hallucinations.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"An automated rule-based parser plus LLM pipeline creates a 163k-pair molecular structure-language dataset validated at 98.6% precision on a 2,000-sample subset.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"An automated framework parses IUPAC names into structural metadata to guide LLMs in creating a 163000-pair molecule-description dataset at 98.6 percent precision.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"5fddc3440994d75d18e998523ed18f9cc0aa2a765396dfa087964381c3fc1471"},"source":{"id":"2602.02320","kind":"arxiv","version":4},"verdict":{"id":"16e99a32-6b18-406b-80e4-852cf7b33486","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T08:08:05.116185Z","strongest_claim":"Using this framework, we curate a large-scale dataset of approximately 163k molecule--description pairs. A rigorous validation protocol combining LLM-based and expert human evaluation on a subset of 2,000 molecules demonstrates a high description precision of 98.6%.","one_line_summary":"An automated rule-based parser plus LLM pipeline creates a 163k-pair molecular structure-language dataset validated at 98.6% precision on a 2,000-sample subset.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The extended rule-based parser correctly extracts complete structural details from every IUPAC name into XML metadata, and the subsequent LLM generations faithfully reflect those details without introducing structural errors or hallucinations.","pith_extraction_headline":"An automated framework parses IUPAC names into structural metadata to guide LLMs in creating a 163000-pair molecule-description dataset at 98.6 percent precision."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.02320/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"16e99a32-6b18-406b-80e4-852cf7b33486"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-30T02:18:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"KFORTl+fdZM451VGvtAieeXBDhQFyo2dJj5XJWVhTFzeqMiXUGfL+OqHFvMoKAyoGy2YDJ5eGQ+r4sXw3vnRDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-02T17:20:18.614299Z"},"content_sha256":"a5c72f4442049ffa1f5823f6356faf3aa0da5514622764e69eeaa615489defa3","schema_version":"1.0","event_id":"sha256:a5c72f4442049ffa1f5823f6356faf3aa0da5514622764e69eeaa615489defa3"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/7QRGQQQJ3TSQ3V2JK3CI72UM4K/bundle.json","state_url":"https://pith.science/pith/7QRGQQQJ3TSQ3V2JK3CI72UM4K/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/7QRGQQQJ3TSQ3V2JK3CI72UM4K/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-07-02T17:20:18Z","links":{"resolver":"https://pith.science/pith/7QRGQQQJ3TSQ3V2JK3CI72UM4K","bundle":"https://pith.science/pith/7QRGQQQJ3TSQ3V2JK3CI72UM4K/bundle.json","state":"https://pith.science/pith/7QRGQQQJ3TSQ3V2JK3CI72UM4K/state.json","well_known_bundle":"https://pith.science/.well-known/pith/7QRGQQQJ3TSQ3V2JK3CI72UM4K/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:7QRGQQQJ3TSQ3V2JK3CI72UM4K","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"205f61779229925a96580ce3e3b266faa995dc72d5dd1a3453d755511b3b74ae","cross_cats_sorted":["cs.AI","q-bio.BM"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-02-02T16:49:19Z","title_canon_sha256":"50607597811c6c08878a9e94dbb41951ef704c5a6aded7042ce0e79d12aba4f2"},"schema_version":"1.0","source":{"id":"2602.02320","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.02320","created_at":"2026-06-30T02:18:06Z"},{"alias_kind":"arxiv_version","alias_value":"2602.02320v4","created_at":"2026-06-30T02:18:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.02320","created_at":"2026-06-30T02:18:06Z"},{"alias_kind":"pith_short_12","alias_value":"7QRGQQQJ3TSQ","created_at":"2026-06-30T02:18:06Z"},{"alias_kind":"pith_short_16","alias_value":"7QRGQQQJ3TSQ3V2J","created_at":"2026-06-30T02:18:06Z"},{"alias_kind":"pith_short_8","alias_value":"7QRGQQQJ","created_at":"2026-06-30T02:18:06Z"}],"graph_snapshots":[{"event_id":"sha256:a5c72f4442049ffa1f5823f6356faf3aa0da5514622764e69eeaa615489defa3","target":"graph","created_at":"2026-06-30T02:18:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Using this framework, we curate a large-scale dataset of approximately 163k molecule--description pairs. A rigorous validation protocol combining LLM-based and expert human evaluation on a subset of 2,000 molecules demonstrates a high description precision of 98.6%."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The extended rule-based parser correctly extracts complete structural details from every IUPAC name into XML metadata, and the subsequent LLM generations faithfully reflect those details without introducing structural errors or hallucinations."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"An automated rule-based parser plus LLM pipeline creates a 163k-pair molecular structure-language dataset validated at 98.6% precision on a 2,000-sample subset."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"An automated framework parses IUPAC names into structural metadata to guide LLMs in creating a 163000-pair molecule-description dataset at 98.6 percent precision."}],"snapshot_sha256":"5fddc3440994d75d18e998523ed18f9cc0aa2a765396dfa087964381c3fc1471"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2602.02320/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Molecular function is largely determined by structure. Accurately aligning molecular structure with natural language is therefore essential for enabling large language models (LLMs) to reason about downstream chemical tasks. However, the substantial cost of human annotation makes it infeasible to construct large-scale, high-quality datasets of structure-grounded descriptions. In this work, we propose a fully automated annotation framework for generating precise molecular descriptions that preserve complete structural details at scale. Our approach builds upon and extends a rule-based chemical ","authors_text":"Feiyang Cai, Feng Luo, Gang Li, Guijuan He, Jingjing Wang, Joshua Luo, Ling Liu, Srikanth Pilla, Tianyu Zhu, Yi Hu","cross_cats":["cs.AI","q-bio.BM"],"headline":"An automated framework parses IUPAC names into structural metadata to guide LLMs in creating a 163000-pair molecule-description dataset at 98.6 percent precision.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-02-02T16:49:19Z","title":"A Large-Scale Dataset for Molecular Structure-Language Description via a Rule-Regularized Method"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.02320","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-16T08:08:05.116185Z","id":"16e99a32-6b18-406b-80e4-852cf7b33486","model_set":{"reader":"grok-4.3"},"one_line_summary":"An automated rule-based parser plus LLM pipeline creates a 163k-pair molecular structure-language dataset validated at 98.6% precision on a 2,000-sample subset.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"An automated framework parses IUPAC names into structural metadata to guide LLMs in creating a 163000-pair molecule-description dataset at 98.6 percent precision.","strongest_claim":"Using this framework, we curate a large-scale dataset of approximately 163k molecule--description pairs. A rigorous validation protocol combining LLM-based and expert human evaluation on a subset of 2,000 molecules demonstrates a high description precision of 98.6%.","weakest_assumption":"The extended rule-based parser correctly extracts complete structural details from every IUPAC name into XML metadata, and the subsequent LLM generations faithfully reflect those details without introducing structural errors or hallucinations."}},"verdict_id":"16e99a32-6b18-406b-80e4-852cf7b33486"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:66f6e6ce0c42b60e9ac5a2a25df372fe4494dd1217c5807e49c48c1171281fcf","target":"record","created_at":"2026-06-30T02:18:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"205f61779229925a96580ce3e3b266faa995dc72d5dd1a3453d755511b3b74ae","cross_cats_sorted":["cs.AI","q-bio.BM"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-02-02T16:49:19Z","title_canon_sha256":"50607597811c6c08878a9e94dbb41951ef704c5a6aded7042ce0e79d12aba4f2"},"schema_version":"1.0","source":{"id":"2602.02320","kind":"arxiv","version":4}},"canonical_sha256":"fc22684209dce50dd74956c48fea8ce2af451c5c6476a80c3c7d183b66474880","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"fc22684209dce50dd74956c48fea8ce2af451c5c6476a80c3c7d183b66474880","first_computed_at":"2026-06-30T02:18:06.651175Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-30T02:18:06.651175Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"zOt3j9OajnmMM3l73Vt3EzQaXNayXfx0gtHVSCukGMXb+kH4SpUcRAma3KXtNpQEew36RPtCcn3UEPqbcEejBw==","signature_status":"signed_v1","signed_at":"2026-06-30T02:18:06.651957Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.02320","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:66f6e6ce0c42b60e9ac5a2a25df372fe4494dd1217c5807e49c48c1171281fcf","sha256:a5c72f4442049ffa1f5823f6356faf3aa0da5514622764e69eeaa615489defa3"],"state_sha256":"6fe1c7a3e853bc08f4d306cd434cefd891bfb2e1c862f7991843c59755474f60"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"RWFhYRoCIdvmPQT+0RQJYneoYMV6dcbJUVm2hwJTDAbHirWMgnVfNF+VpMMUiBvhavyhkAj65ggY+EpfLWTqAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-07-02T17:20:18.616512Z","bundle_sha256":"ae441387e386c27dabee71900cf1185ef6c5d97f7ec7db94288536d4fa9f35ea"}}