{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:NN7MTSKTR63RY4XI3QIOBYO7NX","short_pith_number":"pith:NN7MTSKT","schema_version":"1.0","canonical_sha256":"6b7ec9c9538fb71c72e8dc10e0e1df6ddd2f3deca94d38b45c02b621c2515648","source":{"kind":"arxiv","id":"2510.09222","version":3},"attestation_state":"computed","paper":{"title":"FM-IRL: Flow-Matching for Reward Modeling and Policy Regularization in Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Bo An, Chubin Zhang, Ivor Tsang, Jingxuan Wu, Mingcong Lei, Xingrui Yu, Zhenglin Wan","submitted_at":"2025-10-10T10:08:10Z","abstract_excerpt":"Flow Matching (FM) has shown remarkable ability in modeling complex distributions and achieves strong performance in offline imitation learning for cloning expert behaviors. However, despite its behavioral cloning expressiveness, FM-based policies are inherently limited by their lack of environmental interaction and exploration. This leads to poor generalization in unseen scenarios beyond the expert demonstrations, underscoring the necessity of online interaction with environment. Unfortunately, optimizing FM policies via online interaction is challenging and inefficient due to instability in "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2510.09222","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-10-10T10:08:10Z","cross_cats_sorted":[],"title_canon_sha256":"3f4756b61cadb5c6e8b954a00f94ff679fdf190341cd5a2bf0de72c9c36405e6","abstract_canon_sha256":"3e8d5ac148d9905c65cc0ca3426b3e5d0621efb5ac17f6c190f6e9d381838441"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T02:04:48.330813Z","signature_b64":"ypwc9/BwIvurCyp6RC+iFxlo1qDaU4J9D4qbYt1e87l8nbJxI3ZZnUP3Dhcwf0YRV2iRKoifp2yWQzXAu4a3Cw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6b7ec9c9538fb71c72e8dc10e0e1df6ddd2f3deca94d38b45c02b621c2515648","last_reissued_at":"2026-06-02T02:04:48.330271Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T02:04:48.330271Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"FM-IRL: Flow-Matching for Reward Modeling and Policy Regularization in Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Bo An, Chubin Zhang, Ivor Tsang, Jingxuan Wu, Mingcong Lei, Xingrui Yu, Zhenglin Wan","submitted_at":"2025-10-10T10:08:10Z","abstract_excerpt":"Flow Matching (FM) has shown remarkable ability in modeling complex distributions and achieves strong performance in offline imitation learning for cloning expert behaviors. However, despite its behavioral cloning expressiveness, FM-based policies are inherently limited by their lack of environmental interaction and exploration. This leads to poor generalization in unseen scenarios beyond the expert demonstrations, underscoring the necessity of online interaction with environment. Unfortunately, optimizing FM policies via online interaction is challenging and inefficient due to instability in "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.09222","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2510.09222/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2510.09222","created_at":"2026-06-02T02:04:48.330343+00:00"},{"alias_kind":"arxiv_version","alias_value":"2510.09222v3","created_at":"2026-06-02T02:04:48.330343+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.09222","created_at":"2026-06-02T02:04:48.330343+00:00"},{"alias_kind":"pith_short_12","alias_value":"NN7MTSKTR63R","created_at":"2026-06-02T02:04:48.330343+00:00"},{"alias_kind":"pith_short_16","alias_value":"NN7MTSKTR63RY4XI","created_at":"2026-06-02T02:04:48.330343+00:00"},{"alias_kind":"pith_short_8","alias_value":"NN7MTSKT","created_at":"2026-06-02T02:04:48.330343+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/NN7MTSKTR63RY4XI3QIOBYO7NX","json":"https://pith.science/pith/NN7MTSKTR63RY4XI3QIOBYO7NX.json","graph_json":"https://pith.science/api/pith-number/NN7MTSKTR63RY4XI3QIOBYO7NX/graph.json","events_json":"https://pith.science/api/pith-number/NN7MTSKTR63RY4XI3QIOBYO7NX/events.json","paper":"https://pith.science/paper/NN7MTSKT"},"agent_actions":{"view_html":"https://pith.science/pith/NN7MTSKTR63RY4XI3QIOBYO7NX","download_json":"https://pith.science/pith/NN7MTSKTR63RY4XI3QIOBYO7NX.json","view_paper":"https://pith.science/paper/NN7MTSKT","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2510.09222&json=true","fetch_graph":"https://pith.science/api/pith-number/NN7MTSKTR63RY4XI3QIOBYO7NX/graph.json","fetch_events":"https://pith.science/api/pith-number/NN7MTSKTR63RY4XI3QIOBYO7NX/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/NN7MTSKTR63RY4XI3QIOBYO7NX/action/timestamp_anchor","attest_storage":"https://pith.science/pith/NN7MTSKTR63RY4XI3QIOBYO7NX/action/storage_attestation","attest_author":"https://pith.science/pith/NN7MTSKTR63RY4XI3QIOBYO7NX/action/author_attestation","sign_citation":"https://pith.science/pith/NN7MTSKTR63RY4XI3QIOBYO7NX/action/citation_signature","submit_replication":"https://pith.science/pith/NN7MTSKTR63RY4XI3QIOBYO7NX/action/replication_record"}},"created_at":"2026-06-02T02:04:48.330343+00:00","updated_at":"2026-06-02T02:04:48.330343+00:00"}