{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2019:EJSE4ERADDCWYELUJ5CUSBYCJA","short_pith_number":"pith:EJSE4ERA","canonical_record":{"source":{"id":"1907.08823","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-07-20T15:21:11Z","cross_cats_sorted":["cs.AI","cs.SY","eess.SY","stat.ML"],"title_canon_sha256":"1afcab87eff071af58e3463e4863694778853d58f3bcfb60cfbc06739c68a06e","abstract_canon_sha256":"33baad9d4f628e46109d7a944b9e6079df16387749accb4813067d63f0cccff6"},"schema_version":"1.0"},"canonical_sha256":"22644e122018c56c11744f45490702482835282dcf731e01f34455b0b43b8eb4","source":{"kind":"arxiv","id":"1907.08823","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1907.08823","created_at":"2026-05-17T23:40:05Z"},{"alias_kind":"arxiv_version","alias_value":"1907.08823v1","created_at":"2026-05-17T23:40:05Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1907.08823","created_at":"2026-05-17T23:40:05Z"},{"alias_kind":"pith_short_12","alias_value":"EJSE4ERADDCW","created_at":"2026-05-18T12:33:15Z"},{"alias_kind":"pith_short_16","alias_value":"EJSE4ERADDCWYELU","created_at":"2026-05-18T12:33:15Z"},{"alias_kind":"pith_short_8","alias_value":"EJSE4ERA","created_at":"2026-05-18T12:33:15Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2019:EJSE4ERADDCWYELUJ5CUSBYCJA","target":"record","payload":{"canonical_record":{"source":{"id":"1907.08823","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-07-20T15:21:11Z","cross_cats_sorted":["cs.AI","cs.SY","eess.SY","stat.ML"],"title_canon_sha256":"1afcab87eff071af58e3463e4863694778853d58f3bcfb60cfbc06739c68a06e","abstract_canon_sha256":"33baad9d4f628e46109d7a944b9e6079df16387749accb4813067d63f0cccff6"},"schema_version":"1.0"},"canonical_sha256":"22644e122018c56c11744f45490702482835282dcf731e01f34455b0b43b8eb4","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:40:05.247355Z","signature_b64":"VrKHANbrpsF81he05IYGNMLXK53/VMbvBXksmAOB77ekcI4PKiQggpQpEP3oMxu21qsFvBZGDwiFVLyigLH2Cw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"22644e122018c56c11744f45490702482835282dcf731e01f34455b0b43b8eb4","last_reissued_at":"2026-05-17T23:40:05.246869Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:40:05.246869Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1907.08823","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:40:05Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cQFxWGfZCJGpZvWFijX0XwuhTt1mA+eQGcVKkwUTaO0uHt4l1LIqc8WqYWAZZu7BnSn675r2l7oJx4kWekVEDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T17:28:12.097402Z"},"content_sha256":"da18e9dfca67697ba850dbc4bf86b86c86d7acce5abfe749daeb259c1ebc673b","schema_version":"1.0","event_id":"sha256:da18e9dfca67697ba850dbc4bf86b86c86d7acce5abfe749daeb259c1ebc673b"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2019:EJSE4ERADDCWYELUJ5CUSBYCJA","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Potential-Based Advice for Stochastic Policy Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.SY","eess.SY","stat.ML"],"primary_cat":"cs.LG","authors_text":"Andrew Clark, Baicen Xiao, Bhaskar Ramasubramanian, Hannaneh Hajishirzi, Linda Bushnell, Radha Poovendran","submitted_at":"2019-07-20T15:21:11Z","abstract_excerpt":"This paper augments the reward received by a reinforcement learning agent with potential functions in order to help the agent learn (possibly stochastic) optimal policies. We show that a potential-based reward shaping scheme is able to preserve optimality of stochastic policies, and demonstrate that the ability of an agent to learn an optimal policy is not affected when this scheme is augmented to soft Q-learning. We propose a method to impart potential based advice schemes to policy gradient algorithms. An algorithm that considers an advantage actor-critic architecture augmented with this sch"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1907.08823","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:40:05Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"OFYvXhVHGkudab7FCdIcq6Sz86laUdyw9wU2o/K+V5dK5y4lkqDIvxHq6da0wKavTJY8yq9PHaPXB4MTa4rYDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T17:28:12.097837Z"},"content_sha256":"b807d0c3218828384b8a1d39f1db05aed974586d548f39fe95022063f4f2b8d9","schema_version":"1.0","event_id":"sha256:b807d0c3218828384b8a1d39f1db05aed974586d548f39fe95022063f4f2b8d9"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/EJSE4ERADDCWYELUJ5CUSBYCJA/bundle.json","state_url":"https://pith.science/pith/EJSE4ERADDCWYELUJ5CUSBYCJA/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/EJSE4ERADDCWYELUJ5CUSBYCJA/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T17:28:12Z","links":{"resolver":"https://pith.science/pith/EJSE4ERADDCWYELUJ5CUSBYCJA","bundle":"https://pith.science/pith/EJSE4ERADDCWYELUJ5CUSBYCJA/bundle.json","state":"https://pith.science/pith/EJSE4ERADDCWYELUJ5CUSBYCJA/state.json","well_known_bundle":"https://pith.science/.well-known/pith/EJSE4ERADDCWYELUJ5CUSBYCJA/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2019:EJSE4ERADDCWYELUJ5CUSBYCJA","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"33baad9d4f628e46109d7a944b9e6079df16387749accb4813067d63f0cccff6","cross_cats_sorted":["cs.AI","cs.SY","eess.SY","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-07-20T15:21:11Z","title_canon_sha256":"1afcab87eff071af58e3463e4863694778853d58f3bcfb60cfbc06739c68a06e"},"schema_version":"1.0","source":{"id":"1907.08823","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1907.08823","created_at":"2026-05-17T23:40:05Z"},{"alias_kind":"arxiv_version","alias_value":"1907.08823v1","created_at":"2026-05-17T23:40:05Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1907.08823","created_at":"2026-05-17T23:40:05Z"},{"alias_kind":"pith_short_12","alias_value":"EJSE4ERADDCW","created_at":"2026-05-18T12:33:15Z"},{"alias_kind":"pith_short_16","alias_value":"EJSE4ERADDCWYELU","created_at":"2026-05-18T12:33:15Z"},{"alias_kind":"pith_short_8","alias_value":"EJSE4ERA","created_at":"2026-05-18T12:33:15Z"}],"graph_snapshots":[{"event_id":"sha256:b807d0c3218828384b8a1d39f1db05aed974586d548f39fe95022063f4f2b8d9","target":"graph","created_at":"2026-05-17T23:40:05Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"This paper augments the reward received by a reinforcement learning agent with potential functions in order to help the agent learn (possibly stochastic) optimal policies. We show that a potential-based reward shaping scheme is able to preserve optimality of stochastic policies, and demonstrate that the ability of an agent to learn an optimal policy is not affected when this scheme is augmented to soft Q-learning. We propose a method to impart potential based advice schemes to policy gradient algorithms. An algorithm that considers an advantage actor-critic architecture augmented with this sch","authors_text":"Andrew Clark, Baicen Xiao, Bhaskar Ramasubramanian, Hannaneh Hajishirzi, Linda Bushnell, Radha Poovendran","cross_cats":["cs.AI","cs.SY","eess.SY","stat.ML"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-07-20T15:21:11Z","title":"Potential-Based Advice for Stochastic Policy Learning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1907.08823","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:da18e9dfca67697ba850dbc4bf86b86c86d7acce5abfe749daeb259c1ebc673b","target":"record","created_at":"2026-05-17T23:40:05Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"33baad9d4f628e46109d7a944b9e6079df16387749accb4813067d63f0cccff6","cross_cats_sorted":["cs.AI","cs.SY","eess.SY","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-07-20T15:21:11Z","title_canon_sha256":"1afcab87eff071af58e3463e4863694778853d58f3bcfb60cfbc06739c68a06e"},"schema_version":"1.0","source":{"id":"1907.08823","kind":"arxiv","version":1}},"canonical_sha256":"22644e122018c56c11744f45490702482835282dcf731e01f34455b0b43b8eb4","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"22644e122018c56c11744f45490702482835282dcf731e01f34455b0b43b8eb4","first_computed_at":"2026-05-17T23:40:05.246869Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:40:05.246869Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"VrKHANbrpsF81he05IYGNMLXK53/VMbvBXksmAOB77ekcI4PKiQggpQpEP3oMxu21qsFvBZGDwiFVLyigLH2Cw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:40:05.247355Z","signed_message":"canonical_sha256_bytes"},"source_id":"1907.08823","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:da18e9dfca67697ba850dbc4bf86b86c86d7acce5abfe749daeb259c1ebc673b","sha256:b807d0c3218828384b8a1d39f1db05aed974586d548f39fe95022063f4f2b8d9"],"state_sha256":"1cb5366632f87a51942ae8620832f49f96de948b077f23d03305f84ceda6f35c"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"nvfr5pc6AMEiGG+H2EsJD14zBlCjPivLRfVMw8Y8NNWCXWsAh8ZfTJqZsNGVLtb3mvlen6+SF3uvJzSWycOUAA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T17:28:12.101650Z","bundle_sha256":"90485a2859d1fab9da717b9b999d3b2f103e530bfa8aea6a903c0ca40c71a28d"}}