{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:MNRT44NCPOYYU67PKIMKX5PQGQ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"86b1395a9b4c17ba6d4e6287b494cf40d09a6b0572d0ee56343a68fda07cab40","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-29T21:16:25Z","title_canon_sha256":"0e4ac08b0fbebd2f338b6cf07b7047d66b87477d16dfdb5f1a47976812959775"},"schema_version":"1.0","source":{"id":"2606.00367","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.00367","created_at":"2026-06-02T01:03:52Z"},{"alias_kind":"arxiv_version","alias_value":"2606.00367v1","created_at":"2026-06-02T01:03:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.00367","created_at":"2026-06-02T01:03:52Z"},{"alias_kind":"pith_short_12","alias_value":"MNRT44NCPOYY","created_at":"2026-06-02T01:03:52Z"},{"alias_kind":"pith_short_16","alias_value":"MNRT44NCPOYYU67P","created_at":"2026-06-02T01:03:52Z"},{"alias_kind":"pith_short_8","alias_value":"MNRT44NC","created_at":"2026-06-02T01:03:52Z"}],"graph_snapshots":[{"event_id":"sha256:32c08d41571de33bb3aa8cd0444b0b8026f390680e4b121a2b233a69449bd7da","target":"graph","created_at":"2026-06-02T01:03:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.00367/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Reinforcement learning problems typically define the goal as maximizing the expected value of a scalar reward function. But, pairwise preferences are often easier to specify than scalar rewards, and they express certain goals that scalar rewards cannot. Methods for reinforcement learning with pairwise preferences have thus received growing interest. Unfortunately, these methods are inefficient in problems with long time horizons, and they lack guarantees on the performance of Markov policies relative to history-dependent policies, which bridge the theory and practice of reinforcement learning.","authors_text":"Benjamin Van Roy, Doina Precup, Jonathan Cola\\c{c}o Carr, Prakash Panangaden","cross_cats":["cs.AI"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-29T21:16:25Z","title":"Reinforcement Learning with Pairwise Preferences in Long-Term Decision Problems"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.00367","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f0bc8a4413b368212996b62bc89cc28fd33e176bc12bc9e526c875bc42cc8c82","target":"record","created_at":"2026-06-02T01:03:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"86b1395a9b4c17ba6d4e6287b494cf40d09a6b0572d0ee56343a68fda07cab40","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-29T21:16:25Z","title_canon_sha256":"0e4ac08b0fbebd2f338b6cf07b7047d66b87477d16dfdb5f1a47976812959775"},"schema_version":"1.0","source":{"id":"2606.00367","kind":"arxiv","version":1}},"canonical_sha256":"63633e71a27bb18a7bef5218abf5f03425e7a116667660bcb43721e804388551","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"63633e71a27bb18a7bef5218abf5f03425e7a116667660bcb43721e804388551","first_computed_at":"2026-06-02T01:03:52.799595Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-02T01:03:52.799595Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"onHf58axR5wgD8PgwmUYqyd9vHALEYUhK60ilnsN4d0XEfKzdfrHRAxqyNls4b1eEj6Py2UIwYf+llwkRSVjBg==","signature_status":"signed_v1","signed_at":"2026-06-02T01:03:52.799979Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.00367","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f0bc8a4413b368212996b62bc89cc28fd33e176bc12bc9e526c875bc42cc8c82","sha256:32c08d41571de33bb3aa8cd0444b0b8026f390680e4b121a2b233a69449bd7da"],"state_sha256":"ef993b9f0bb81f0c6e332d8dbe2fe41e633eef2d839de7ff7c91b190aeb7c045"}