{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:L2VHQACIZP7GT2TXWP7FF5ABRZ","short_pith_number":"pith:L2VHQACI","canonical_record":{"source":{"id":"2406.20094","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2024-06-28T17:59:01Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"7ff707acefed87c3ecc5c9283dcc8d2a1c6be1b76f5e79bd4f5d5d552177ff6d","abstract_canon_sha256":"2c7f02e414d1271b02b929d305adaf70bc07f6b89eec240f428b7c3baac3ed37"},"schema_version":"1.0"},"canonical_sha256":"5eaa780048cbfe69ea77b3fe52f4018e4ccfb186efec30f7795f934cd756d5b8","source":{"kind":"arxiv","id":"2406.20094","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2406.20094","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2406.20094v3","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.20094","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"L2VHQACIZP7G","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"L2VHQACIZP7GT2TX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"L2VHQACI","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:L2VHQACIZP7GT2TXWP7FF5ABRZ","target":"record","payload":{"canonical_record":{"source":{"id":"2406.20094","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2024-06-28T17:59:01Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"7ff707acefed87c3ecc5c9283dcc8d2a1c6be1b76f5e79bd4f5d5d552177ff6d","abstract_canon_sha256":"2c7f02e414d1271b02b929d305adaf70bc07f6b89eec240f428b7c3baac3ed37"},"schema_version":"1.0"},"canonical_sha256":"5eaa780048cbfe69ea77b3fe52f4018e4ccfb186efec30f7795f934cd756d5b8","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.677274Z","signature_b64":"MEuukhOYJ3Aq0DO3U5V0J/9p8dqAQxRhHf1Col0XgsV8V+wWr2lZuSVZBqcf2nXI8AigAGJVpua4H0P7laS8BQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5eaa780048cbfe69ea77b3fe52f4018e4ccfb186efec30f7795f934cd756d5b8","last_reissued_at":"2026-05-17T23:38:49.676680Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.676680Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2406.20094","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"vIdj+QfWIzK6li/4/BrNk/xrWR6LiRflsP3Dnymjigt4Sq2Pv/ND4KC+LrARDiiWmm7tCWhsFO1ym60smSpBCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-31T02:12:56.248623Z"},"content_sha256":"9eba6ac6dfaa43efbf1f87e2f4fb5bdba02c5097a509b546fff62a7bdacbe00f","schema_version":"1.0","event_id":"sha256:9eba6ac6dfaa43efbf1f87e2f4fb5bdba02c5097a509b546fff62a7bdacbe00f"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:L2VHQACIZP7GT2TXWP7FF5ABRZ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Scaling Synthetic Data Creation with 1,000,000,000 Personas","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"A hub of one billion web-curated personas lets an LLM generate diverse synthetic data across math, instructions, knowledge texts, NPCs, and tools at scale.","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Dian Yu, Dong Yu, Haitao Mi, Tao Ge, Xiaoyang Wang, Xin Chan","submitted_at":"2024-06-28T17:59:01Z","abstract_excerpt":"We propose a novel persona-driven data synthesis methodology that leverages various perspectives within a large language model (LLM) to create diverse synthetic data. To fully exploit this methodology at scale, we introduce Persona Hub -- a collection of 1 billion diverse personas automatically curated from web data. These 1 billion personas (~13% of the world's total population), acting as distributed carriers of world knowledge, can tap into almost every perspective encapsulated within the LLM, thereby facilitating the creation of diverse synthetic data at scale for various scenarios. By sho"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"These 1 billion personas (~13% of the world's total population), acting as distributed carriers of world knowledge, can tap into almost every perspective encapsulated within the LLM, thereby facilitating the creation of diverse synthetic data at scale for various scenarios.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That automatically curated web personas are sufficiently diverse, unbiased, and faithfully simulable by the LLM without introducing repetition or hallucinated perspectives that degrade data quality.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"A curated set of one billion personas enables scalable, diverse synthetic data generation for LLM training across reasoning, instructions, knowledge, NPCs, and tools.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A hub of one billion web-curated personas lets an LLM generate diverse synthetic data across math, instructions, knowledge texts, NPCs, and tools at scale.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"6d4d700fdf7c88394be83ca0ef9bf75ad16019307c0c60de7394cd420b3b8ade"},"source":{"id":"2406.20094","kind":"arxiv","version":3},"verdict":{"id":"4abaef22-c6d5-45a3-b733-2fe6de6760a5","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T00:00:27.724368Z","strongest_claim":"These 1 billion personas (~13% of the world's total population), acting as distributed carriers of world knowledge, can tap into almost every perspective encapsulated within the LLM, thereby facilitating the creation of diverse synthetic data at scale for various scenarios.","one_line_summary":"A curated set of one billion personas enables scalable, diverse synthetic data generation for LLM training across reasoning, instructions, knowledge, NPCs, and tools.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That automatically curated web personas are sufficiently diverse, unbiased, and faithfully simulable by the LLM without introducing repetition or hallucinated perspectives that degrade data quality.","pith_extraction_headline":"A hub of one billion web-curated personas lets an LLM generate diverse synthetic data across math, instructions, knowledge texts, NPCs, and tools at scale."},"references":{"count":29,"sample":[{"doi":"","year":null,"title":"Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone","work_id":"feef9556-a016-493c-abd2-0c97a23a7ebf","ref_index":1,"cited_arxiv_id":"2404.14219","is_internal_anchor":true},{"doi":"","year":null,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","ref_index":2,"cited_arxiv_id":"2303.08774","is_internal_anchor":true},{"doi":"","year":null,"title":"Coig-cqia: Quality is all you need for chinese instruction fine-tuning","work_id":"4d58f189-c89b-4190-b567-6fc2fba5e0b7","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"arXiv preprint arXiv:2401.02524 , year=","work_id":"7c3b201d-cc93-4fcb-bcb6-2583f29edbbf","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"DeepSeek LLM: Scaling Open-Source Language Models with Longtermism","work_id":"01b10587-025b-499d-8ba3-7a538d24c2d6","ref_index":5,"cited_arxiv_id":"2401.02954","is_internal_anchor":true}],"resolved_work":29,"snapshot_sha256":"f5adcbed9c271f68d69e260d12c3db9a3e38a5d9b74a51eb3b9dc8255144b34d","internal_anchors":12},"formal_canon":{"evidence_count":3,"snapshot_sha256":"3a102509548c9d31ff3da7862f6a216bcbc51b936ee73aca69f74f4950069157"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"4abaef22-c6d5-45a3-b733-2fe6de6760a5"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"KrGHE8MSx1Hm/MhwjU7UfwyYzXRI39AfwHfGO18OvQveisNU/iKScLsgUtRwI9m895lihXjaEBSxXRWEms+BCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-31T02:12:56.249289Z"},"content_sha256":"692479df18bd46946c3c0632fa429be86325fdac76c55d193e27c1dcb8d6749d","schema_version":"1.0","event_id":"sha256:692479df18bd46946c3c0632fa429be86325fdac76c55d193e27c1dcb8d6749d"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/L2VHQACIZP7GT2TXWP7FF5ABRZ/bundle.json","state_url":"https://pith.science/pith/L2VHQACIZP7GT2TXWP7FF5ABRZ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/L2VHQACIZP7GT2TXWP7FF5ABRZ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-31T02:12:56Z","links":{"resolver":"https://pith.science/pith/L2VHQACIZP7GT2TXWP7FF5ABRZ","bundle":"https://pith.science/pith/L2VHQACIZP7GT2TXWP7FF5ABRZ/bundle.json","state":"https://pith.science/pith/L2VHQACIZP7GT2TXWP7FF5ABRZ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/L2VHQACIZP7GT2TXWP7FF5ABRZ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:L2VHQACIZP7GT2TXWP7FF5ABRZ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"2c7f02e414d1271b02b929d305adaf70bc07f6b89eec240f428b7c3baac3ed37","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2024-06-28T17:59:01Z","title_canon_sha256":"7ff707acefed87c3ecc5c9283dcc8d2a1c6be1b76f5e79bd4f5d5d552177ff6d"},"schema_version":"1.0","source":{"id":"2406.20094","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2406.20094","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2406.20094v3","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.20094","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"L2VHQACIZP7G","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"L2VHQACIZP7GT2TX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"L2VHQACI","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:692479df18bd46946c3c0632fa429be86325fdac76c55d193e27c1dcb8d6749d","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"These 1 billion personas (~13% of the world's total population), acting as distributed carriers of world knowledge, can tap into almost every perspective encapsulated within the LLM, thereby facilitating the creation of diverse synthetic data at scale for various scenarios."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That automatically curated web personas are sufficiently diverse, unbiased, and faithfully simulable by the LLM without introducing repetition or hallucinated perspectives that degrade data quality."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"A curated set of one billion personas enables scalable, diverse synthetic data generation for LLM training across reasoning, instructions, knowledge, NPCs, and tools."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A hub of one billion web-curated personas lets an LLM generate diverse synthetic data across math, instructions, knowledge texts, NPCs, and tools at scale."}],"snapshot_sha256":"6d4d700fdf7c88394be83ca0ef9bf75ad16019307c0c60de7394cd420b3b8ade"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"3a102509548c9d31ff3da7862f6a216bcbc51b936ee73aca69f74f4950069157"},"paper":{"abstract_excerpt":"We propose a novel persona-driven data synthesis methodology that leverages various perspectives within a large language model (LLM) to create diverse synthetic data. To fully exploit this methodology at scale, we introduce Persona Hub -- a collection of 1 billion diverse personas automatically curated from web data. These 1 billion personas (~13% of the world's total population), acting as distributed carriers of world knowledge, can tap into almost every perspective encapsulated within the LLM, thereby facilitating the creation of diverse synthetic data at scale for various scenarios. By sho","authors_text":"Dian Yu, Dong Yu, Haitao Mi, Tao Ge, Xiaoyang Wang, Xin Chan","cross_cats":["cs.LG"],"headline":"A hub of one billion web-curated personas lets an LLM generate diverse synthetic data across math, instructions, knowledge texts, NPCs, and tools at scale.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2024-06-28T17:59:01Z","title":"Scaling Synthetic Data Creation with 1,000,000,000 Personas"},"references":{"count":29,"internal_anchors":12,"resolved_work":29,"sample":[{"cited_arxiv_id":"2404.14219","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone","work_id":"feef9556-a016-493c-abd2-0c97a23a7ebf","year":null},{"cited_arxiv_id":"2303.08774","doi":"","is_internal_anchor":true,"ref_index":2,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Coig-cqia: Quality is all you need for chinese instruction fine-tuning","work_id":"4d58f189-c89b-4190-b567-6fc2fba5e0b7","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"arXiv preprint arXiv:2401.02524 , year=","work_id":"7c3b201d-cc93-4fcb-bcb6-2583f29edbbf","year":null},{"cited_arxiv_id":"2401.02954","doi":"","is_internal_anchor":true,"ref_index":5,"title":"DeepSeek LLM: Scaling Open-Source Language Models with Longtermism","work_id":"01b10587-025b-499d-8ba3-7a538d24c2d6","year":null}],"snapshot_sha256":"f5adcbed9c271f68d69e260d12c3db9a3e38a5d9b74a51eb3b9dc8255144b34d"},"source":{"id":"2406.20094","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-16T00:00:27.724368Z","id":"4abaef22-c6d5-45a3-b733-2fe6de6760a5","model_set":{"reader":"grok-4.3"},"one_line_summary":"A curated set of one billion personas enables scalable, diverse synthetic data generation for LLM training across reasoning, instructions, knowledge, NPCs, and tools.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A hub of one billion web-curated personas lets an LLM generate diverse synthetic data across math, instructions, knowledge texts, NPCs, and tools at scale.","strongest_claim":"These 1 billion personas (~13% of the world's total population), acting as distributed carriers of world knowledge, can tap into almost every perspective encapsulated within the LLM, thereby facilitating the creation of diverse synthetic data at scale for various scenarios.","weakest_assumption":"That automatically curated web personas are sufficiently diverse, unbiased, and faithfully simulable by the LLM without introducing repetition or hallucinated perspectives that degrade data quality."}},"verdict_id":"4abaef22-c6d5-45a3-b733-2fe6de6760a5"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:9eba6ac6dfaa43efbf1f87e2f4fb5bdba02c5097a509b546fff62a7bdacbe00f","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"2c7f02e414d1271b02b929d305adaf70bc07f6b89eec240f428b7c3baac3ed37","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2024-06-28T17:59:01Z","title_canon_sha256":"7ff707acefed87c3ecc5c9283dcc8d2a1c6be1b76f5e79bd4f5d5d552177ff6d"},"schema_version":"1.0","source":{"id":"2406.20094","kind":"arxiv","version":3}},"canonical_sha256":"5eaa780048cbfe69ea77b3fe52f4018e4ccfb186efec30f7795f934cd756d5b8","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"5eaa780048cbfe69ea77b3fe52f4018e4ccfb186efec30f7795f934cd756d5b8","first_computed_at":"2026-05-17T23:38:49.676680Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.676680Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"MEuukhOYJ3Aq0DO3U5V0J/9p8dqAQxRhHf1Col0XgsV8V+wWr2lZuSVZBqcf2nXI8AigAGJVpua4H0P7laS8BQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.677274Z","signed_message":"canonical_sha256_bytes"},"source_id":"2406.20094","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:9eba6ac6dfaa43efbf1f87e2f4fb5bdba02c5097a509b546fff62a7bdacbe00f","sha256:692479df18bd46946c3c0632fa429be86325fdac76c55d193e27c1dcb8d6749d"],"state_sha256":"d473d28e9e9d27d3ad047dfc59732b669755ed8a6506d105c11961055310385f"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"NaEL/CFuFpn4rVemCrUolaqXGQIs2aWSVTaESX1lZiDmW0H36LeCHwqQhjrdDiFJeAibqt3r2szKXkbc4hsSBA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-31T02:12:56.253774Z","bundle_sha256":"f31b97b9d35ac71784715ebe6f7cdf3076049afc3d642094ed9266c749a042e6"}}