{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2019:565QSKTY4XGIKLYWDFF5SXVFD7","short_pith_number":"pith:565QSKTY","canonical_record":{"source":{"id":"1905.01756","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-05-05T21:51:27Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"c78c21caba6337587b6dbc5aeb3a6cb367f189bb5781b2e988d66506f4143738","abstract_canon_sha256":"82ded8f708789958715296b402592172d85d3d73d473bbc2bb256e4e93a6592a"},"schema_version":"1.0"},"canonical_sha256":"efbb092a78e5cc852f16194bd95ea51fd5fb784f0c4a794f3a6ef2bc253d760a","source":{"kind":"arxiv","id":"1905.01756","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1905.01756","created_at":"2026-05-17T23:40:32Z"},{"alias_kind":"arxiv_version","alias_value":"1905.01756v2","created_at":"2026-05-17T23:40:32Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1905.01756","created_at":"2026-05-17T23:40:32Z"},{"alias_kind":"pith_short_12","alias_value":"565QSKTY4XGI","created_at":"2026-05-18T12:33:10Z"},{"alias_kind":"pith_short_16","alias_value":"565QSKTY4XGIKLYW","created_at":"2026-05-18T12:33:10Z"},{"alias_kind":"pith_short_8","alias_value":"565QSKTY","created_at":"2026-05-18T12:33:10Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2019:565QSKTY4XGIKLYWDFF5SXVFD7","target":"record","payload":{"canonical_record":{"source":{"id":"1905.01756","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-05-05T21:51:27Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"c78c21caba6337587b6dbc5aeb3a6cb367f189bb5781b2e988d66506f4143738","abstract_canon_sha256":"82ded8f708789958715296b402592172d85d3d73d473bbc2bb256e4e93a6592a"},"schema_version":"1.0"},"canonical_sha256":"efbb092a78e5cc852f16194bd95ea51fd5fb784f0c4a794f3a6ef2bc253d760a","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:40:32.158480Z","signature_b64":"FZ0aZFEZkdVubmPydB6uCksg9KJ5oB+qxTDzazgzkK29sLlHu+XNAE0RsSbSGm+A1+mUK/NrofMV6+8obcRjAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"efbb092a78e5cc852f16194bd95ea51fd5fb784f0c4a794f3a6ef2bc253d760a","last_reissued_at":"2026-05-17T23:40:32.157737Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:40:32.157737Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1905.01756","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:40:32Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"VwViYEffngtEVj6RYcLDTn1iStoWbt/MOOS0BCxP8NWwTrQ57/XMv6jZefdRuWBcsjclr82dzWR6Zj+oe74aDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T19:20:27.998271Z"},"content_sha256":"12167eb9b477b847ee348bcc130bfcd92bfa79650cbb53e96c8f806db629af46","schema_version":"1.0","event_id":"sha256:12167eb9b477b847ee348bcc130bfcd92bfa79650cbb53e96c8f806db629af46"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2019:565QSKTY4XGIKLYWDFF5SXVFD7","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"P3O: Policy-on Policy-off Policy Optimization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.LG","authors_text":"Alexander J. Smola, Pratik Chaudhari, Rasool Fakoor","submitted_at":"2019-05-05T21:51:27Z","abstract_excerpt":"On-policy reinforcement learning (RL) algorithms have high sample complexity while off-policy algorithms are difficult to tune. Merging the two holds the promise to develop efficient algorithms that generalize across diverse environments. It is however challenging in practice to find suitable hyper-parameters that govern this trade off. This paper develops a simple algorithm named P3O that interleaves off-policy updates with on-policy updates. P3O uses the effective sample size between the behavior policy and the target policy to control how far they can be from each other and does not introdu"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1905.01756","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:40:32Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cD9wUqEY54xlvSEJfl14hNI44jFEGpc25nR8bYp/iV5ZECNOOT42Rb2OmU293v2q64sPBRZCJTUmFDGASm48DQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T19:20:27.998639Z"},"content_sha256":"a85afbf6adcb337d28e4818a0a5c31612f64adfd9ff91d6ce5396ae2937cbea5","schema_version":"1.0","event_id":"sha256:a85afbf6adcb337d28e4818a0a5c31612f64adfd9ff91d6ce5396ae2937cbea5"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/565QSKTY4XGIKLYWDFF5SXVFD7/bundle.json","state_url":"https://pith.science/pith/565QSKTY4XGIKLYWDFF5SXVFD7/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/565QSKTY4XGIKLYWDFF5SXVFD7/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T19:20:27Z","links":{"resolver":"https://pith.science/pith/565QSKTY4XGIKLYWDFF5SXVFD7","bundle":"https://pith.science/pith/565QSKTY4XGIKLYWDFF5SXVFD7/bundle.json","state":"https://pith.science/pith/565QSKTY4XGIKLYWDFF5SXVFD7/state.json","well_known_bundle":"https://pith.science/.well-known/pith/565QSKTY4XGIKLYWDFF5SXVFD7/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2019:565QSKTY4XGIKLYWDFF5SXVFD7","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"82ded8f708789958715296b402592172d85d3d73d473bbc2bb256e4e93a6592a","cross_cats_sorted":["stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-05-05T21:51:27Z","title_canon_sha256":"c78c21caba6337587b6dbc5aeb3a6cb367f189bb5781b2e988d66506f4143738"},"schema_version":"1.0","source":{"id":"1905.01756","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1905.01756","created_at":"2026-05-17T23:40:32Z"},{"alias_kind":"arxiv_version","alias_value":"1905.01756v2","created_at":"2026-05-17T23:40:32Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1905.01756","created_at":"2026-05-17T23:40:32Z"},{"alias_kind":"pith_short_12","alias_value":"565QSKTY4XGI","created_at":"2026-05-18T12:33:10Z"},{"alias_kind":"pith_short_16","alias_value":"565QSKTY4XGIKLYW","created_at":"2026-05-18T12:33:10Z"},{"alias_kind":"pith_short_8","alias_value":"565QSKTY","created_at":"2026-05-18T12:33:10Z"}],"graph_snapshots":[{"event_id":"sha256:a85afbf6adcb337d28e4818a0a5c31612f64adfd9ff91d6ce5396ae2937cbea5","target":"graph","created_at":"2026-05-17T23:40:32Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"On-policy reinforcement learning (RL) algorithms have high sample complexity while off-policy algorithms are difficult to tune. Merging the two holds the promise to develop efficient algorithms that generalize across diverse environments. It is however challenging in practice to find suitable hyper-parameters that govern this trade off. This paper develops a simple algorithm named P3O that interleaves off-policy updates with on-policy updates. P3O uses the effective sample size between the behavior policy and the target policy to control how far they can be from each other and does not introdu","authors_text":"Alexander J. Smola, Pratik Chaudhari, Rasool Fakoor","cross_cats":["stat.ML"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-05-05T21:51:27Z","title":"P3O: Policy-on Policy-off Policy Optimization"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1905.01756","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:12167eb9b477b847ee348bcc130bfcd92bfa79650cbb53e96c8f806db629af46","target":"record","created_at":"2026-05-17T23:40:32Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"82ded8f708789958715296b402592172d85d3d73d473bbc2bb256e4e93a6592a","cross_cats_sorted":["stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-05-05T21:51:27Z","title_canon_sha256":"c78c21caba6337587b6dbc5aeb3a6cb367f189bb5781b2e988d66506f4143738"},"schema_version":"1.0","source":{"id":"1905.01756","kind":"arxiv","version":2}},"canonical_sha256":"efbb092a78e5cc852f16194bd95ea51fd5fb784f0c4a794f3a6ef2bc253d760a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"efbb092a78e5cc852f16194bd95ea51fd5fb784f0c4a794f3a6ef2bc253d760a","first_computed_at":"2026-05-17T23:40:32.157737Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:40:32.157737Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"FZ0aZFEZkdVubmPydB6uCksg9KJ5oB+qxTDzazgzkK29sLlHu+XNAE0RsSbSGm+A1+mUK/NrofMV6+8obcRjAw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:40:32.158480Z","signed_message":"canonical_sha256_bytes"},"source_id":"1905.01756","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:12167eb9b477b847ee348bcc130bfcd92bfa79650cbb53e96c8f806db629af46","sha256:a85afbf6adcb337d28e4818a0a5c31612f64adfd9ff91d6ce5396ae2937cbea5"],"state_sha256":"6f1012b0a5c12d294d6644c161efd72061be8feacb0bbb58c671a8b288b40508"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"FgKu77iASbqxoT4+c8uIaqwvnBapfVXT1CI+KKTAVCZ35SO6o93/74tk4AJvnSWNLlHVzqWW5N5Oy9FYTnpVDg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T19:20:28.001222Z","bundle_sha256":"d74fdccf57d70f7dd927e7c512a045c92259abad0622032cd0ffcfc177f020e8"}}