{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:BTZPDJO66QJ4W3LPEDJYN5XP7X","short_pith_number":"pith:BTZPDJO6","canonical_record":{"source":{"id":"2601.05242","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-08T18:59:24Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"b6bf6385df9e528004dff7db06dadda8378dad300ba2b59eae30c311d9848d4d","abstract_canon_sha256":"d646f2c556ec36788b65f958f77a2db7de5f740eceeddfc230153cfd0e2107c8"},"schema_version":"1.0"},"canonical_sha256":"0cf2f1a5def413cb6d6f20d386f6effdedcc3264e3e7081e8404e0fc3fbf4847","source":{"kind":"arxiv","id":"2601.05242","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.05242","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2601.05242v1","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.05242","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"BTZPDJO66QJ4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BTZPDJO66QJ4W3LP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BTZPDJO6","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:BTZPDJO66QJ4W3LPEDJYN5XP7X","target":"record","payload":{"canonical_record":{"source":{"id":"2601.05242","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-08T18:59:24Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"b6bf6385df9e528004dff7db06dadda8378dad300ba2b59eae30c311d9848d4d","abstract_canon_sha256":"d646f2c556ec36788b65f958f77a2db7de5f740eceeddfc230153cfd0e2107c8"},"schema_version":"1.0"},"canonical_sha256":"0cf2f1a5def413cb6d6f20d386f6effdedcc3264e3e7081e8404e0fc3fbf4847","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.386738Z","signature_b64":"BiyEUVI43PK47OX4lR80kKhdPf4TatG1Uy6H1frvTXtqfH1j38iJqMoXCKAE3Dj97xcrLGpXLJAL2QaqbCGqCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0cf2f1a5def413cb6d6f20d386f6effdedcc3264e3e7081e8404e0fc3fbf4847","last_reissued_at":"2026-05-17T23:38:53.386098Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.386098Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2601.05242","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"PB7cc6aPX9vE8yOPvTxDzb24tVbjAe4hD3xVec6YixPR3fPXaASvyOPx5iYnLBuwVGVcZOmXgs59kK6Ez47iAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T18:01:36.230733Z"},"content_sha256":"b762748c1c15d5b737705788cccdde7da31ec81f047d2ae45408d9ea8aa5f2d1","schema_version":"1.0","event_id":"sha256:b762748c1c15d5b737705788cccdde7da31ec81f047d2ae45408d9ea8aa5f2d1"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:BTZPDJO66QJ4W3LPEDJYN5XP7X","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"GDPO: Group reward-Decoupled Normalization Policy Optimization for Multi-reward RL Optimization","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Decoupling normalization of each reward in multi-reward RL prevents collapse of advantage values into identical signals.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Hongxu Yin, Jan Kautz, Kwang-Ting Cheng, Mingjie Liu, Min-Hung Chen, Pavlo Molchanov, Peter Belcak, Shih-Yang Liu, Shizhe Diao, Ximing Lu, Xin Dong, Yejin Choi, Yu-Chiang Frank Wang","submitted_at":"2026-01-08T18:59:24Z","abstract_excerpt":"As language models become increasingly capable, users expect them to provide not only accurate responses but also behaviors aligned with diverse human preferences across a variety of scenarios. To achieve this, Reinforcement learning (RL) pipelines have begun incorporating multiple rewards, each capturing a distinct preference, to guide models toward these desired behaviors. However, recent work has defaulted to apply Group Relative Policy Optimization (GRPO) under multi-reward setting without examining its suitability. In this paper, we demonstrate that directly applying GRPO to normalize dis"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"directly applying GRPO to normalize distinct rollout reward combinations causes them to collapse into identical advantage values, reducing the resolution of the training signal and resulting in suboptimal convergence and, in some cases, early training failure","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That separately normalizing each reward before aggregation will faithfully preserve relative differences across reward combinations without introducing new scaling artifacts or training instabilities.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"GDPO decouples per-reward normalization in multi-reward RL to avoid advantage collapse and improve convergence over GRPO on tool-calling, math, and coding tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Decoupling normalization of each reward in multi-reward RL prevents collapse of advantage values into identical signals.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"75af564ee4c40e203c3617c2e8d40c07bde24a143f864db01334c51452065d93"},"source":{"id":"2601.05242","kind":"arxiv","version":1},"verdict":{"id":"11d7b481-8998-4c0f-b0dd-a6d36bf90dc7","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T05:26:51.583532Z","strongest_claim":"directly applying GRPO to normalize distinct rollout reward combinations causes them to collapse into identical advantage values, reducing the resolution of the training signal and resulting in suboptimal convergence and, in some cases, early training failure","one_line_summary":"GDPO decouples per-reward normalization in multi-reward RL to avoid advantage collapse and improve convergence over GRPO on tool-calling, math, and coding tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That separately normalizing each reward before aggregation will faithfully preserve relative differences across reward combinations without introducing new scaling artifacts or training instabilities.","pith_extraction_headline":"Decoupling normalization of each reward in multi-reward RL prevents collapse of advantage values into identical signals."},"references":{"count":46,"sample":[{"doi":"","year":2025,"title":"Learn to reason efficiently with adaptive length-based reward shaping","work_id":"6baef149-2491-431c-a612-c9fea9b25a16","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Kimi k1.5: Scaling Reinforcement Learning with LLMs","work_id":"bff96ab1-bd6a-4585-be23-74fdb51969c7","ref_index":2,"cited_arxiv_id":"2501.12599","is_internal_anchor":true},{"doi":"","year":2024,"title":"Rule based rewards for language model safety.Advances in Neural Information Processing Systems, 37:108877–108901, 2024","work_id":"ffeb3f81-2b5a-49b6-a612-0abc709095b6","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Grpo-care: Consistency- aware reinforcement learning for multimodal reasoning, 2025","work_id":"a856fcde-b204-44aa-9e8c-074097e7c58a","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"DeepSeek-V3.2: Pushing the Frontier of Open Large Language Models","work_id":"07c85cc5-4086-4abc-823b-6d0f4ff784d0","ref_index":6,"cited_arxiv_id":"2512.02556","is_internal_anchor":true}],"resolved_work":46,"snapshot_sha256":"2ef5763291c9ceb1f71a4f38e1c35b92da8e4531fd7d78c77428f73e4a2bc2e1","internal_anchors":17},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"11d7b481-8998-4c0f-b0dd-a6d36bf90dc7"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"qk/Mkt51PqVIC1Ki8XPu6KFiF0AneN1uSUfmvnOXMqVGYbpJ66AA/LewF34hBzYKpOYocGvjtwuhcPdg0XKgBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T18:01:36.231303Z"},"content_sha256":"bdfab5b1d4a975df39fe27a704d1ac43b82343b5043ad0fb6ec0f09cf017016b","schema_version":"1.0","event_id":"sha256:bdfab5b1d4a975df39fe27a704d1ac43b82343b5043ad0fb6ec0f09cf017016b"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/BTZPDJO66QJ4W3LPEDJYN5XP7X/bundle.json","state_url":"https://pith.science/pith/BTZPDJO66QJ4W3LPEDJYN5XP7X/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/BTZPDJO66QJ4W3LPEDJYN5XP7X/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-01T18:01:36Z","links":{"resolver":"https://pith.science/pith/BTZPDJO66QJ4W3LPEDJYN5XP7X","bundle":"https://pith.science/pith/BTZPDJO66QJ4W3LPEDJYN5XP7X/bundle.json","state":"https://pith.science/pith/BTZPDJO66QJ4W3LPEDJYN5XP7X/state.json","well_known_bundle":"https://pith.science/.well-known/pith/BTZPDJO66QJ4W3LPEDJYN5XP7X/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:BTZPDJO66QJ4W3LPEDJYN5XP7X","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"d646f2c556ec36788b65f958f77a2db7de5f740eceeddfc230153cfd0e2107c8","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-08T18:59:24Z","title_canon_sha256":"b6bf6385df9e528004dff7db06dadda8378dad300ba2b59eae30c311d9848d4d"},"schema_version":"1.0","source":{"id":"2601.05242","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.05242","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2601.05242v1","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.05242","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"BTZPDJO66QJ4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BTZPDJO66QJ4W3LP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BTZPDJO6","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:bdfab5b1d4a975df39fe27a704d1ac43b82343b5043ad0fb6ec0f09cf017016b","target":"graph","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"directly applying GRPO to normalize distinct rollout reward combinations causes them to collapse into identical advantage values, reducing the resolution of the training signal and resulting in suboptimal convergence and, in some cases, early training failure"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That separately normalizing each reward before aggregation will faithfully preserve relative differences across reward combinations without introducing new scaling artifacts or training instabilities."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"GDPO decouples per-reward normalization in multi-reward RL to avoid advantage collapse and improve convergence over GRPO on tool-calling, math, and coding tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Decoupling normalization of each reward in multi-reward RL prevents collapse of advantage values into identical signals."}],"snapshot_sha256":"75af564ee4c40e203c3617c2e8d40c07bde24a143f864db01334c51452065d93"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"As language models become increasingly capable, users expect them to provide not only accurate responses but also behaviors aligned with diverse human preferences across a variety of scenarios. To achieve this, Reinforcement learning (RL) pipelines have begun incorporating multiple rewards, each capturing a distinct preference, to guide models toward these desired behaviors. However, recent work has defaulted to apply Group Relative Policy Optimization (GRPO) under multi-reward setting without examining its suitability. In this paper, we demonstrate that directly applying GRPO to normalize dis","authors_text":"Hongxu Yin, Jan Kautz, Kwang-Ting Cheng, Mingjie Liu, Min-Hung Chen, Pavlo Molchanov, Peter Belcak, Shih-Yang Liu, Shizhe Diao, Ximing Lu, Xin Dong, Yejin Choi, Yu-Chiang Frank Wang","cross_cats":["cs.AI","cs.LG"],"headline":"Decoupling normalization of each reward in multi-reward RL prevents collapse of advantage values into identical signals.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-08T18:59:24Z","title":"GDPO: Group reward-Decoupled Normalization Policy Optimization for Multi-reward RL Optimization"},"references":{"count":46,"internal_anchors":17,"resolved_work":46,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Learn to reason efficiently with adaptive length-based reward shaping","work_id":"6baef149-2491-431c-a612-c9fea9b25a16","year":2025},{"cited_arxiv_id":"2501.12599","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Kimi k1.5: Scaling Reinforcement Learning with LLMs","work_id":"bff96ab1-bd6a-4585-be23-74fdb51969c7","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Rule based rewards for language model safety.Advances in Neural Information Processing Systems, 37:108877–108901, 2024","work_id":"ffeb3f81-2b5a-49b6-a612-0abc709095b6","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Grpo-care: Consistency- aware reinforcement learning for multimodal reasoning, 2025","work_id":"a856fcde-b204-44aa-9e8c-074097e7c58a","year":2025},{"cited_arxiv_id":"2512.02556","doi":"","is_internal_anchor":true,"ref_index":6,"title":"DeepSeek-V3.2: Pushing the Frontier of Open Large Language Models","work_id":"07c85cc5-4086-4abc-823b-6d0f4ff784d0","year":2025}],"snapshot_sha256":"2ef5763291c9ceb1f71a4f38e1c35b92da8e4531fd7d78c77428f73e4a2bc2e1"},"source":{"id":"2601.05242","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T05:26:51.583532Z","id":"11d7b481-8998-4c0f-b0dd-a6d36bf90dc7","model_set":{"reader":"grok-4.3"},"one_line_summary":"GDPO decouples per-reward normalization in multi-reward RL to avoid advantage collapse and improve convergence over GRPO on tool-calling, math, and coding tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Decoupling normalization of each reward in multi-reward RL prevents collapse of advantage values into identical signals.","strongest_claim":"directly applying GRPO to normalize distinct rollout reward combinations causes them to collapse into identical advantage values, reducing the resolution of the training signal and resulting in suboptimal convergence and, in some cases, early training failure","weakest_assumption":"That separately normalizing each reward before aggregation will faithfully preserve relative differences across reward combinations without introducing new scaling artifacts or training instabilities."}},"verdict_id":"11d7b481-8998-4c0f-b0dd-a6d36bf90dc7"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:b762748c1c15d5b737705788cccdde7da31ec81f047d2ae45408d9ea8aa5f2d1","target":"record","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"d646f2c556ec36788b65f958f77a2db7de5f740eceeddfc230153cfd0e2107c8","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-08T18:59:24Z","title_canon_sha256":"b6bf6385df9e528004dff7db06dadda8378dad300ba2b59eae30c311d9848d4d"},"schema_version":"1.0","source":{"id":"2601.05242","kind":"arxiv","version":1}},"canonical_sha256":"0cf2f1a5def413cb6d6f20d386f6effdedcc3264e3e7081e8404e0fc3fbf4847","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0cf2f1a5def413cb6d6f20d386f6effdedcc3264e3e7081e8404e0fc3fbf4847","first_computed_at":"2026-05-17T23:38:53.386098Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:53.386098Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"BiyEUVI43PK47OX4lR80kKhdPf4TatG1Uy6H1frvTXtqfH1j38iJqMoXCKAE3Dj97xcrLGpXLJAL2QaqbCGqCg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:53.386738Z","signed_message":"canonical_sha256_bytes"},"source_id":"2601.05242","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:b762748c1c15d5b737705788cccdde7da31ec81f047d2ae45408d9ea8aa5f2d1","sha256:bdfab5b1d4a975df39fe27a704d1ac43b82343b5043ad0fb6ec0f09cf017016b"],"state_sha256":"78c05c05d7145f6878bbed53d0e5ba04d631624c1d8070bad14333c8ceaeeb23"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cYtUyyCWlzlw2TjnXbU3GN39g7dRH2zuOi5wp59Iwc7Aw5XUoJ/6uC1gmcxFr7WcDcux6PX3TnruDxgXjOQHCA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-01T18:01:36.233687Z","bundle_sha256":"e7f189b98712f0714c7731190309bb5dabd6ac08c79ff41c8a98c7db64d17510"}}