{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:ARFJIWA23OFXTWDFMIPFJWLOT7","short_pith_number":"pith:ARFJIWA2","schema_version":"1.0","canonical_sha256":"044a94581adb8b79d865621e54d96e9fcc4cb23f05fdd87398eb97a644aa7cea","source":{"kind":"arxiv","id":"2604.04141","version":3},"attestation_state":"computed","paper":{"title":"On Data Thinning for Model Validation in Small Area Estimation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Data thinning splits area-level survey estimates into independent training and test components to validate small area estimation models without external data.","cross_cats":["math.ST","stat.AP","stat.TH"],"primary_cat":"stat.ME","authors_text":"Paul A. Parker, Sho Kawano, Zehang Richard Li","submitted_at":"2026-04-05T14:59:47Z","abstract_excerpt":"Small area estimation produces estimates of population parameters for geographic and demographic subgroups with limited sample sizes. Such estimates are critical for policy decisions, yet principled validation of these models remains a challenge. Unlike conventional predictive settings, validation data are rarely available. Data thinning splits a single observation into independent training and test components. It enables out-of-sample validation using only the area-level summary statistics routinely available, requiring only their Gaussianity and known sampling variances. However, the propert"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":true},"canonical_record":{"source":{"id":"2604.04141","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"stat.ME","submitted_at":"2026-04-05T14:59:47Z","cross_cats_sorted":["math.ST","stat.AP","stat.TH"],"title_canon_sha256":"07ac83e56ec769d8da82906a8f89c84b5eb0d8c8a97b2517dac9aad67e2b482b","abstract_canon_sha256":"e0cf238c3811ba5e9644e562ce1a376862f06c0f0119b98ec23c29c276132b9f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:12:05.756217Z","signature_b64":"cmDoalBsa3RMrwuBGXtKWd+3RbZpaZEy+e7EQeDKWUxJBUW8sDpFpGdafr4XcxhoFAh5/nd9UBY1oj0A5F5pAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"044a94581adb8b79d865621e54d96e9fcc4cb23f05fdd87398eb97a644aa7cea","last_reissued_at":"2026-06-19T16:12:05.755637Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:12:05.755637Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"On Data Thinning for Model Validation in Small Area Estimation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Data thinning splits area-level survey estimates into independent training and test components to validate small area estimation models without external data.","cross_cats":["math.ST","stat.AP","stat.TH"],"primary_cat":"stat.ME","authors_text":"Paul A. Parker, Sho Kawano, Zehang Richard Li","submitted_at":"2026-04-05T14:59:47Z","abstract_excerpt":"Small area estimation produces estimates of population parameters for geographic and demographic subgroups with limited sample sizes. Such estimates are critical for policy decisions, yet principled validation of these models remains a challenge. Unlike conventional predictive settings, validation data are rarely available. Data thinning splits a single observation into independent training and test components. It enables out-of-sample validation using only the area-level summary statistics routinely available, requiring only their Gaussianity and known sampling variances. However, the propert"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We show that data thinning with these settings provides consistent and stable performance across heterogeneous sampling designs in design-based simulations using American Community Survey microdata.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The thinned training and test components remain independent and that performance metrics on the thinned training component can be meaningfully related to full-data metrics despite targeting a different quantity, with the gap varying by model complexity.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Data thinning splits area-level observations to enable out-of-sample validation of Fay-Herriot models, with recommendations for thinning parameters that balance bias and variance for stable model comparison.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Data thinning splits area-level survey estimates into independent training and test components to validate small area estimation models without external data.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"17d883fc14ddbb95f75479bb1cd04fffeb54f01c2a24c9ba39211d8323258875"},"source":{"id":"2604.04141","kind":"arxiv","version":3},"verdict":{"id":"9cfe01b5-3944-4de9-95f6-ed54674f5574","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-13T16:47:06.465253Z","strongest_claim":"We show that data thinning with these settings provides consistent and stable performance across heterogeneous sampling designs in design-based simulations using American Community Survey microdata.","one_line_summary":"Data thinning splits area-level observations to enable out-of-sample validation of Fay-Herriot models, with recommendations for thinning parameters that balance bias and variance for stable model comparison.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The thinned training and test components remain independent and that performance metrics on the thinned training component can be meaningfully related to full-data metrics despite targeting a different quantity, with the gap varying by model complexity.","pith_extraction_headline":"Data thinning splits area-level survey estimates into independent training and test components to validate small area estimation models without external data."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.04141/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"21ff5247e9dc0be076b44d4b4960157018249b6a0ddb0c02f732f12ac1a97e00"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2604.04141","created_at":"2026-06-19T16:12:05.755709+00:00"},{"alias_kind":"arxiv_version","alias_value":"2604.04141v3","created_at":"2026-06-19T16:12:05.755709+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.04141","created_at":"2026-06-19T16:12:05.755709+00:00"},{"alias_kind":"pith_short_12","alias_value":"ARFJIWA23OFX","created_at":"2026-06-19T16:12:05.755709+00:00"},{"alias_kind":"pith_short_16","alias_value":"ARFJIWA23OFXTWDF","created_at":"2026-06-19T16:12:05.755709+00:00"},{"alias_kind":"pith_short_8","alias_value":"ARFJIWA2","created_at":"2026-06-19T16:12:05.755709+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2604.23464","citing_title":"Design-Based Cross-Validation for Comparing Small Area Estimators","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23464","citing_title":"Design-Based Cross-Validation for Comparing Small Area Estimators","ref_index":24,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ARFJIWA23OFXTWDFMIPFJWLOT7","json":"https://pith.science/pith/ARFJIWA23OFXTWDFMIPFJWLOT7.json","graph_json":"https://pith.science/api/pith-number/ARFJIWA23OFXTWDFMIPFJWLOT7/graph.json","events_json":"https://pith.science/api/pith-number/ARFJIWA23OFXTWDFMIPFJWLOT7/events.json","paper":"https://pith.science/paper/ARFJIWA2"},"agent_actions":{"view_html":"https://pith.science/pith/ARFJIWA23OFXTWDFMIPFJWLOT7","download_json":"https://pith.science/pith/ARFJIWA23OFXTWDFMIPFJWLOT7.json","view_paper":"https://pith.science/paper/ARFJIWA2","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2604.04141&json=true","fetch_graph":"https://pith.science/api/pith-number/ARFJIWA23OFXTWDFMIPFJWLOT7/graph.json","fetch_events":"https://pith.science/api/pith-number/ARFJIWA23OFXTWDFMIPFJWLOT7/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ARFJIWA23OFXTWDFMIPFJWLOT7/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ARFJIWA23OFXTWDFMIPFJWLOT7/action/storage_attestation","attest_author":"https://pith.science/pith/ARFJIWA23OFXTWDFMIPFJWLOT7/action/author_attestation","sign_citation":"https://pith.science/pith/ARFJIWA23OFXTWDFMIPFJWLOT7/action/citation_signature","submit_replication":"https://pith.science/pith/ARFJIWA23OFXTWDFMIPFJWLOT7/action/replication_record"}},"created_at":"2026-06-19T16:12:05.755709+00:00","updated_at":"2026-06-19T16:12:05.755709+00:00"}