{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:A6KF5IVYSAPCLM546R3RS2ZRCM","short_pith_number":"pith:A6KF5IVY","schema_version":"1.0","canonical_sha256":"07945ea2b8901e25b3bcf477196b31131a03ff18b6cf6a6046f64afcf1871783","source":{"kind":"arxiv","id":"2510.06048","version":4},"attestation_state":"computed","paper":{"title":"BLISS: A Lightweight Bilevel Influence Scoring Method for Data Selection in Language Model Pretraining","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Huixia Wang, Jie Hao, Jie Xu, Mingrui Liu, Rui Yu, Wei Zhang","submitted_at":"2025-10-07T15:42:33Z","abstract_excerpt":"Effective data selection is essential for pretraining large language models (LLMs), enhancing efficiency and improving generalization to downstream tasks. However, existing approaches often require leveraging external pretrained models, making it difficult to disentangle the effects of data selection from those of the external pretrained models. In addition, they often overlook the long-term impact of selected data if the model is trained to convergence, primarily due to the prohibitive cost of full-scale LLM pretraining. In this paper, we introduce BLISS (\\textbf{B}ileve\\textbf{L} \\textbf{I}n"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2510.06048","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-10-07T15:42:33Z","cross_cats_sorted":[],"title_canon_sha256":"9408559d7bd3eac75c4bef511f997ef1996f1eb69c1c25fbd5b4fe07c32e90ba","abstract_canon_sha256":"d966dbe7bb0ddaeca0f3a6f75f0f119e43cc5854dbad0bd692e844cf57bc24b4"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T02:04:09.788108Z","signature_b64":"YBXBh99GXCaJi4+IMu3G2iNBjKhJDEjf2E1SkJNz7x05Vm16ipkKIAvYx4rSIwWqsm5scQfSwcdfxh5N8oSLBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"07945ea2b8901e25b3bcf477196b31131a03ff18b6cf6a6046f64afcf1871783","last_reissued_at":"2026-06-02T02:04:09.787646Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T02:04:09.787646Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"BLISS: A Lightweight Bilevel Influence Scoring Method for Data Selection in Language Model Pretraining","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Huixia Wang, Jie Hao, Jie Xu, Mingrui Liu, Rui Yu, Wei Zhang","submitted_at":"2025-10-07T15:42:33Z","abstract_excerpt":"Effective data selection is essential for pretraining large language models (LLMs), enhancing efficiency and improving generalization to downstream tasks. However, existing approaches often require leveraging external pretrained models, making it difficult to disentangle the effects of data selection from those of the external pretrained models. In addition, they often overlook the long-term impact of selected data if the model is trained to convergence, primarily due to the prohibitive cost of full-scale LLM pretraining. In this paper, we introduce BLISS (\\textbf{B}ileve\\textbf{L} \\textbf{I}n"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.06048","kind":"arxiv","version":4},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2510.06048/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2510.06048","created_at":"2026-06-02T02:04:09.787716+00:00"},{"alias_kind":"arxiv_version","alias_value":"2510.06048v4","created_at":"2026-06-02T02:04:09.787716+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.06048","created_at":"2026-06-02T02:04:09.787716+00:00"},{"alias_kind":"pith_short_12","alias_value":"A6KF5IVYSAPC","created_at":"2026-06-02T02:04:09.787716+00:00"},{"alias_kind":"pith_short_16","alias_value":"A6KF5IVYSAPCLM54","created_at":"2026-06-02T02:04:09.787716+00:00"},{"alias_kind":"pith_short_8","alias_value":"A6KF5IVY","created_at":"2026-06-02T02:04:09.787716+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.09404","citing_title":"Let the Target Select for Itself: Data Selection via Target-Aligned Paths","ref_index":15,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/A6KF5IVYSAPCLM546R3RS2ZRCM","json":"https://pith.science/pith/A6KF5IVYSAPCLM546R3RS2ZRCM.json","graph_json":"https://pith.science/api/pith-number/A6KF5IVYSAPCLM546R3RS2ZRCM/graph.json","events_json":"https://pith.science/api/pith-number/A6KF5IVYSAPCLM546R3RS2ZRCM/events.json","paper":"https://pith.science/paper/A6KF5IVY"},"agent_actions":{"view_html":"https://pith.science/pith/A6KF5IVYSAPCLM546R3RS2ZRCM","download_json":"https://pith.science/pith/A6KF5IVYSAPCLM546R3RS2ZRCM.json","view_paper":"https://pith.science/paper/A6KF5IVY","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2510.06048&json=true","fetch_graph":"https://pith.science/api/pith-number/A6KF5IVYSAPCLM546R3RS2ZRCM/graph.json","fetch_events":"https://pith.science/api/pith-number/A6KF5IVYSAPCLM546R3RS2ZRCM/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/A6KF5IVYSAPCLM546R3RS2ZRCM/action/timestamp_anchor","attest_storage":"https://pith.science/pith/A6KF5IVYSAPCLM546R3RS2ZRCM/action/storage_attestation","attest_author":"https://pith.science/pith/A6KF5IVYSAPCLM546R3RS2ZRCM/action/author_attestation","sign_citation":"https://pith.science/pith/A6KF5IVYSAPCLM546R3RS2ZRCM/action/citation_signature","submit_replication":"https://pith.science/pith/A6KF5IVYSAPCLM546R3RS2ZRCM/action/replication_record"}},"created_at":"2026-06-02T02:04:09.787716+00:00","updated_at":"2026-06-02T02:04:09.787716+00:00"}