{"work":{"id":"240c67fe-d14d-4520-91c1-38a4e272ca19","openalex_id":null,"doi":null,"arxiv_id":"1707.06347","raw_key":null,"title":"Proximal Policy Optimization Algorithms","authors":null,"authors_text":"John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov","year":2017,"venue":"cs.LG","abstract":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more general, and have better sample complexity (empirically). Our experiments test PPO on a collection of benchmark tasks, including simulated robotic locomotion and Atari game playing, and we show that PPO outperforms other online policy gradient methods, and overall strikes a favorable balance between sample complexity, simplicity, and wall-time.","external_url":"https://arxiv.org/abs/1707.06347","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T19:51:10.738293+00:00","pith_arxiv_id":"1707.06347","created_at":"2026-05-08T18:44:01.332768+00:00","updated_at":"2026-06-05T21:23:00.469572+00:00","title_quality_ok":true,"display_title":"Proximal Policy Optimization Algorithms","render_title":"Proximal Policy Optimization Algorithms"},"hub":{"state":{"work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","tier":"mega_hub","tier_reason":"1,000+ Pith inbound or 100,000+ external citations","pith_inbound_count":1295,"external_cited_by_count":null,"distinct_field_count":45,"first_pith_cited_at":"2017-10-16T18:05:45+00:00","last_pith_cited_at":"2026-05-22T17:59:43+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"needed","recognition_status":"needed","updated_at":"2026-06-19T21:38:04.887199+00:00","tier_text":"mega_hub"},"tier":"mega_hub","role_counts":[{"context_role":"background","n":152},{"context_role":"method","n":113},{"context_role":"baseline","n":15},{"context_role":"dataset","n":4}],"polarity_counts":[{"context_polarity":"background","n":147},{"context_polarity":"use_method","n":109},{"context_polarity":"baseline","n":15},{"context_polarity":"unclear","n":7},{"context_polarity":"use_dataset","n":4},{"context_polarity":"support","n":2}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"Proximal Policy Optimization Algorithms","claims":[{"claim_text":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Proximal Policy Optimization Algorithms because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:53:29.701873+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"298fbbc8-0497-4319-a5d8-d4bcef3f7f3d","orcid":null,"display_name":"John Schulman"},{"id":"31b0570b-235b-4790-add5-bb4bd6cd4ecb","orcid":null,"display_name":"Filip Wolski"},{"id":"be640ed3-1548-4b69-a2ac-76fa45adb08b","orcid":null,"display_name":"Prafulla Dhariwal"},{"id":"9783c850-24f9-4444-91ee-29b7660c744b","orcid":null,"display_name":"Alec Radford"},{"id":"c2a2eca0-5637-4573-bf0c-fc8cadee54bf","orcid":null,"display_name":"Oleg Klimov"}]},"error":null,"updated_at":"2026-05-13T17:24:04.648649+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T17:43:36.017501+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":255},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":137},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":117},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":116},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":75},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":65},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":65},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":56},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":54},{"title":"High-Dimensional Continuous Control Using Generalized Advantage Estimation","work_id":"38e3ca94-96f0-4b19-a355-0754931af8be","shared_citers":52},{"title":"Understanding R1-Zero-Like Training: A Critical Perspective","work_id":"ec354f3b-9484-4a0c-94c8-92d4d0260835","shared_citers":51},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":48},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":47},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":43},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":39},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":36},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":33},{"title":"Fine-Tuning Language Models from Human Preferences","work_id":"4f54aad1-f3b6-404f-b9c7-e21ba0a33b99","shared_citers":32},{"title":"Kimi k1.5: Scaling Reinforcement Learning with LLMs","work_id":"bff96ab1-bd6a-4585-be23-74fdb51969c7","shared_citers":32},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":32},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":31},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":31},{"title":"HybridFlow: A Flexible and Efficient RLHF Framework","work_id":"7eb9c9f4-b322-4bba-8011-09ff8d6ad801","shared_citers":29},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":29}],"time_series":[{"n":1,"year":2017},{"n":1,"year":2018},{"n":4,"year":2019},{"n":1,"year":2020},{"n":2,"year":2021},{"n":1,"year":2022},{"n":9,"year":2023},{"n":6,"year":2024},{"n":18,"year":2025},{"n":591,"year":2026}]},"error":null,"updated_at":"2026-05-13T17:25:54.296292+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T17:43:35.301807+00:00"},"reader_index":{"job_type":"reader_index","status":"succeeded","result":{"note":"annotated reader requires full-text/OA fetch; shell is wired for mega hubs","status":"reader queued"},"error":null,"updated_at":"2026-05-19T18:11:51.668023+00:00"},"recognition_alignment":{"job_type":"recognition_alignment","status":"succeeded","result":{"modules":["IndisputableMonolith.Gravity.PropagationSpeed","IndisputableMonolith.Foundation.PreTemporalForcingOrder","IndisputableMonolith.Physics.LightConeCausalityFromRS","IndisputableMonolith.Cosmology.EtaBPrefactorDerivation","IndisputableMonolith.Physics.MaxwellEquationsFromRS","IndisputableMonolith.Gravity.BlackHoleEntropyFromLedger","IndisputableMonolith.Thermodynamics.FermiDirac","IndisputableMonolith.Gravity.BlackHoleHorizonStates"],"query_chars":984},"error":null,"updated_at":"2026-05-19T18:11:51.666403+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"Proximal Policy Optimization Algorithms","claims":[{"claim_text":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Proximal Policy Optimization Algorithms because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:53:29.698764+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Proximal Policy Optimization Algorithms","claims":[{"claim_text":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Proximal Policy Optimization Algorithms because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:25:52.708624+00:00"}},"summary":{"title":"Proximal Policy Optimization Algorithms","claims":[{"claim_text":"We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a \"surrogate\" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more ge","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Proximal Policy Optimization Algorithms because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":255},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":137},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":117},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":116},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":75},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":65},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":65},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":56},{"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","shared_citers":54},{"title":"High-Dimensional Continuous Control Using Generalized Advantage Estimation","work_id":"38e3ca94-96f0-4b19-a355-0754931af8be","shared_citers":52},{"title":"Understanding R1-Zero-Like Training: A Critical Perspective","work_id":"ec354f3b-9484-4a0c-94c8-92d4d0260835","shared_citers":51},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":48},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":47},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":43},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":39},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":36},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":33},{"title":"Fine-Tuning Language Models from Human Preferences","work_id":"4f54aad1-f3b6-404f-b9c7-e21ba0a33b99","shared_citers":32},{"title":"Kimi k1.5: Scaling Reinforcement Learning with LLMs","work_id":"bff96ab1-bd6a-4585-be23-74fdb51969c7","shared_citers":32},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":32},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":31},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":31},{"title":"HybridFlow: A Flexible and Efficient RLHF Framework","work_id":"7eb9c9f4-b322-4bba-8011-09ff8d6ad801","shared_citers":29},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":29}],"time_series":[{"n":1,"year":2017},{"n":1,"year":2018},{"n":4,"year":2019},{"n":1,"year":2020},{"n":2,"year":2021},{"n":1,"year":2022},{"n":9,"year":2023},{"n":6,"year":2024},{"n":18,"year":2025},{"n":591,"year":2026}]},"authors":[{"id":"9783c850-24f9-4444-91ee-29b7660c744b","orcid":null,"display_name":"Alec Radford","source":"manual","import_confidence":0.72},{"id":"31b0570b-235b-4790-add5-bb4bd6cd4ecb","orcid":null,"display_name":"Filip Wolski","source":"manual","import_confidence":0.72},{"id":"298fbbc8-0497-4319-a5d8-d4bcef3f7f3d","orcid":null,"display_name":"John Schulman","source":"manual","import_confidence":0.72},{"id":"c2a2eca0-5637-4573-bf0c-fc8cadee54bf","orcid":null,"display_name":"Oleg Klimov","source":"manual","import_confidence":0.72},{"id":"be640ed3-1548-4b69-a2ac-76fa45adb08b","orcid":null,"display_name":"Prafulla Dhariwal","source":"manual","import_confidence":0.72}]},"citers":{"total":1295,"items":[{"citing_arxiv_id":"2605.23903","ref_index":66,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Geo-Align: Video Generation Alignment via Metric Geometry Reward","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:59:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Geo-Align applies RL with a perceptual reward derived from 3D camera trajectory estimation to improve controllability and fidelity in video generation without paired training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23863","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Robotic Strawberry Harvesting with Robust Vision and Deep Reinforcement Learning based Sim-to-Real Control","primary_cat":"cs.RO","submitted_at":"2026-05-22T17:21:06+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A modified YOLO segmentation model plus sim-trained PPO control yields 84.3% overall success harvesting 281 strawberries in greenhouse trials on a real UR10e manipulator.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23762","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Direct Dynamic Retargeting for Humanoid Imitation Learning from Videos","primary_cat":"cs.RO","submitted_at":"2026-05-22T15:33:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DDR is a single-stage task-space framework using sampling-based MPC in a physics simulator to produce high-fidelity dynamically feasible references from video demos, claimed to outperform geometric and indirect retargeting baselines in tracking accuracy and to speed up RL training for agile humanoid","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23717","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Vision-Based Agile Landing on Turbulent Waters","primary_cat":"cs.RO","submitted_at":"2026-05-22T14:59:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reinforcement learning policy trained on synthetic visual features in simulation enables zero-shot real-world agile multirotor landing on turbulent maritime platforms without explicit platform-state estimation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23652","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"One Policy, Infinite NPCs: Persona-Traceable Shared RL Policies for Scalable Game Agents","primary_cat":"cs.AI","submitted_at":"2026-05-22T14:04:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"pcsp is a shared RL policy using LLM persona embeddings, low-rank projection, and PPO+InfoNCE+KL training that delivers 17x above-chance zero-shot persona identification and 22x faster inference on a 300-persona benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23565","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Understanding Goal Generalisation in Sequential Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-22T12:31:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical analysis of over 100 sequential RL training pipelines across 250+ OOD environments finds salient features drive generalization and early goals persist, with latent policy gradients simulating latent variable evolution to predict OOD behavior from training history.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23560","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SafeSABR: Risk-Calibrated Adaptive Bitrate Streaming over Starlink Networks","primary_cat":"eess.SY","submitted_at":"2026-05-22T12:27:47+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23551","ref_index":63,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Goal-Conditioned Agents that Learn Everything All at Once","primary_cat":"cs.LG","submitted_at":"2026-05-22T12:17:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LEO enables efficient all-goals learning in goal-conditioned RL by jointly predicting for all goals in one network pass, yielding >250x speedup over relabelling and better performance on Craftax.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23493","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EDGE-OPD: Internalizing Privileged Context with Evidence Guided On-Policy Distillation","primary_cat":"cs.AI","submitted_at":"2026-05-22T10:55:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EDGE-OPD adds guided rollouts and evidence masking to on-policy self-distillation, enabling successful learning of target identities where standard OPSD and RLSD fail.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23463","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StepAudio 2.5 Technical Report","primary_cat":"eess.AS","submitted_at":"2026-05-22T10:24:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StepAudio 2.5 is a unified audio-language foundation model that reaches state-of-the-art results on ASR, TTS, and realtime interaction by using task-tailored RLHF on a shared backbone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23435","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MileStone: A Multi-Objective Compiler Phase Ordering Framework for Graph-based IR-Level Optimization","primary_cat":"cs.PL","submitted_at":"2026-05-22T09:45:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MileStone models compiler phase ordering as a multi-objective optimization problem using graph representations, GNN predictions, and RL agents to find Pareto-optimal pass sequences under user constraints.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23415","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reflex: Reinforcement Learning with Reflection Symmetry Exploitation in State-Based Continuous Control","primary_cat":"cs.LG","submitted_at":"2026-05-22T09:24:44+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23398","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TPMM-DPO: Trajectory-aware Preference-guided Model Merging for Iterative Direct Preference Optimization","primary_cat":"cs.IR","submitted_at":"2026-05-22T09:11:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TPMM-DPO applies trajectory-aware learned-weight merging of prior policy models to stabilize iterative DPO against preference noise accumulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23365","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Score-Based One-step MeanFlow Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-22T08:28:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SOM is an actor-critic algorithm that constructs the target velocity field for one-step MeanFlow policies directly from the Q-function via score estimation and probability flow ODE, achieving claimed SOTA on locomotion tasks with reduced training and inference time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23285","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforcement Learning for Microcanonical Graph Ensemble with Assortativity Constraints","primary_cat":"cs.LG","submitted_at":"2026-05-22T06:57:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DMGG uses reinforcement learning to generate microcanonical graph ensembles with exact assortativity constraints via degree-preserving rewirings, claiming faster generation and better diversity than ERGM approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23261","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniSRM: A Unified Speech Reward Model for Reasoning-Based Fine-grained Assessment","primary_cat":"eess.AS","submitted_at":"2026-05-22T06:02:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"UniSRM is a unified speech reward model with new datasets that uses a two-stage reasoning pipeline to deliver interpretable, human-aligned evaluations across utterance quality to context coherence tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23067","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Training Data Teaches RL Memory Agents: An Empirical Study of Curriculum Effects in Memory-Augmented QA","primary_cat":"cs.CL","submitted_at":"2026-05-21T21:58:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Controlled study shows mixed training curricula improve aggregate F1 on memory QA benchmarks while out-of-domain data transfers targeted skills like temporal reasoning, with per-question-type effects exceeding aggregate differences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22814","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Remember to be Curious: Episodic Context and Persistent Worlds for 3D Exploration","primary_cat":"cs.LG","submitted_at":"2026-05-21T17:58:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A curiosity-based 3D exploration policy that pairs persistent online 3D reconstruction with episodic sequence modeling over RGB to outperform active-mapping baselines on HM3D and transfer zero-shot to Gibson and synthetic worlds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22773","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deep Reinforcement Learning for Flexible Job Shop Scheduling with Random Job Arrivals","primary_cat":"cs.AI","submitted_at":"2026-05-21T17:33:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A PPO-trained DRL agent selects from established dispatching rules to minimize total job completion time in FJSP with random arrivals, outperforming single rules and performing competitively with arrival-triggered MILP on heterogeneous datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22748","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Superhuman Safe and Agile Racing through Multi-Agent Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2026-05-21T17:15:54+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22731","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Post-Training is About States, Not Tokens: A State Distribution View of SFT, RL, and On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-21T17:03:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A state distribution view of post-training shows that on-policy supervision from the learner itself can outperform fixed-dataset SFT and preserve retention better than aggressive supervised updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22703","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Clipping Bottleneck: Stabilizing RLVR via Stochastic Recovery of Near-Boundary Signals","primary_cat":"cs.LG","submitted_at":"2026-05-21T16:45:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Proposes Near-boundary Stochastic Rescue (NSR) as a stochastic modification to clipping in RLVR that recovers near-boundary signals and yields gains over baselines like DAPO and GSPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22537","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"F-TIS: Harnessing Diverse Models in Collaborative GRPO","primary_cat":"cs.LG","submitted_at":"2026-05-21T14:25:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"F-TIS enables heterogeneous model collaboration in GRPO by filtering off-policy samples, matching on-policy convergence while improving out-of-distribution performance by up to 12% in some setups.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22894","ref_index":71,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SCRIPT: Scalable Diffusion Policy with Multi-stage Training for Language-driven Physics-based Humanoid Control","primary_cat":"cs.GR","submitted_at":"2026-05-21T14:17:21+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22513","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Meta-Learning for Rapid Adaptation in Reference Tracking of Uncertain Nonlinear Systems","primary_cat":"cs.AI","submitted_at":"2026-05-21T14:04:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Meta-learning framework adapting iMAML for rapid controller tuning on uncertain nonlinear systems via offline source data and limited online target adaptation, shown with neural state-space and DQN variants.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22463","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforcement learning for ion shuttling on trapped-ion quantum computers","primary_cat":"quant-ph","submitted_at":"2026-05-21T13:25:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reinforcement learning optimizes ion shuttling on trapped-ion quantum chips and reduces operations by up to 36.3% versus heuristics across multiple architectures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22446","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pre-VLA: Preemptive Runtime Verification for Reliable Vision-Language-Action and World-Model Rollouts","primary_cat":"cs.CV","submitted_at":"2026-05-21T13:13:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Pre-VLA is a multimodal runtime verifier that predicts safety confidence and advantage scores for action chunks, raising closed-loop success rates on the LIBERO benchmark from 30.79% to 37.62%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22411","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeferMem: Query-Time Evidence Distillation via Reinforcement Learning for Long-Term Memory QA","primary_cat":"cs.CL","submitted_at":"2026-05-21T12:36:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeferMem decouples memory QA into high-recall retrieval and RL-based query-conditioned evidence distillation, outperforming baselines on LoCoMo and LongMemEval-S with highest accuracy, fastest runtime, and zero API token cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22389","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unified Data Selection for LLM Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-21T12:21:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"High-Entropy Sum (HES) selects high-quality reasoning data for LLMs by summing entropy of the top highest-entropy tokens, matching full-dataset performance with top 20% in SFT and outperforming baselines in RFT and RL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22306","ref_index":100,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ACCoRD: Actor-Critic Conflict Resolution with Deep learning for O-RAN xApps","primary_cat":"cs.MA","submitted_at":"2026-05-21T10:54:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ACCoRD trains an ANN with PPO-Clip reinforcement learning to select conflict resolution actions in O-RAN, reducing negative network events versus rule-based methods in medium and high traffic simulations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22272","ref_index":71,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Imagine2Real: Towards Zero-shot Humanoid-Object Interaction via Video Generative Priors","primary_cat":"cs.RO","submitted_at":"2026-05-21T10:15:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Imagine2Real enables zero-shot humanoid-object interaction by unifying motions as 4D point trajectories, tracking only base/hands/object keypoints inside a BFM latent space, and training with progressive simple rewards for mocap deployment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22263","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tailoring Teaching to Aptitude: Direction-Adaptive Self-Distillation for LLM Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-21T10:07:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DASD improves math reasoning in LLMs by adaptively directing self-distillation based on per-token entropy to balance exploration and step accuracy, outperforming prior self-distillation and RLVR baselines on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22256","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Emergence of agriculture in an artificial society of reinforcement learning agents","primary_cat":"cs.MA","submitted_at":"2026-05-21T10:00:29+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22240","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unlocking Proactivity in Task-Oriented Dialogue","primary_cat":"cs.AI","submitted_at":"2026-05-21T09:46:25+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22211","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CLORE: Content-Level Optimization for Reasoning Efficiency","primary_cat":"cs.AI","submitted_at":"2026-05-21T09:16:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CLORE augments correct on-policy rollouts by deleting repetitive and irrelevant segments then optimizes with auxiliary DPO to improve accuracy-efficiency trade-off on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22195","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforced Graph of Thoughts: RL-Driven Adaptive Prompting for LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-21T09:00:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RGoT uses RL to adaptively generate task-specific graphs of operations for GoT-style LLM prompting from a human-provided set, with results suggesting feasibility under constraints.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22177","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Maestro: Reinforcement Learning to Orchestrate Hierarchical Model-Skill Ensembles","primary_cat":"cs.LG","submitted_at":"2026-05-21T08:47:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Maestro uses outcome-based RL to train a lightweight policy that orchestrates ensembles of frozen expert models and skills, reporting 70.1% average accuracy across ten multimodal benchmarks and outperforming GPT-5 and Gemini-2.5-Pro while generalizing to unseen components.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22156","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"One-Way Policy Optimization for Self-Evolving LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-21T08:25:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OWPO decouples optimization direction from magnitude via asymmetric reweighting (Accelerated Alignment for inferior deviations, Gain Locking for superior) plus iterative references to create a ratchet effect for continuous LLM improvement.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22104","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OPERA: An Agent for Image Restoration with End-to-End Joint Planning-Execution Optimization","primary_cat":"cs.CV","submitted_at":"2026-05-21T07:40:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OPERA jointly optimizes restoration planning via RL over tool compositions and execution via agent-guided co-training of tools, claiming consistent gains over all-in-one models and prior agent methods on multi-degradation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22082","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoRMA: Contrastive RMA for Contact-Rich Meta-Adaptation","primary_cat":"cs.RO","submitted_at":"2026-05-21T07:21:56+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21993","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ECPO: Evidence-Coupled Policy Optimization for Evidence-Certified Candidate Ranking","primary_cat":"cs.AI","submitted_at":"2026-05-21T04:42:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ECPO is a listwise policy optimization method that couples ranking utility with span-level evidence certificate validity and a deterministic verifier reward on MAVEN-ERE and RAMS datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21883","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Token-weighted Direct Preference Optimization with Attention","primary_cat":"cs.CL","submitted_at":"2026-05-21T01:43:09+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21851","ref_index":29,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OPPO: Bayesian Value Recursion for Token-Level Credit Assignment in LLM Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-21T00:55:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OPPO derives token-level advantages for LLM RL via Bayesian recursion on oracle signals, recovering prior distillation methods as a special case and showing gains on math and code benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21800","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"stable-worldmodel: A Platform for Reproducible World Modeling Research and Evaluation","primary_cat":"cs.LG","submitted_at":"2026-05-20T22:58:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper presents stable-worldmodel (swm), a platform with high-performance data layer, modern world model baselines, planning solvers, and extended environments for reproducible research and generalization evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21688","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Closed-Loop Sim-to-Real Reinforcement Learning for Deformable Microfiber Shape Control","primary_cat":"cs.RO","submitted_at":"2026-05-20T19:45:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A closed-loop sim-to-real RL policy trained in a simplified frictionless simulator achieves sub-millimeter microfiber shape control on physical hardware via visual feedback without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21654","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Value-Gradient Hypothesis of RL for LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-20T19:09:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Shows that under differentiable rollouts with additive noise, actor updates in critic-free RL for LLMs are value-gradient-like in expectation, motivating a decomposition into value signal and reward headroom for when RL is most effective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21467","ref_index":67,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DelTA: Discriminative Token Credit Assignment for Reinforcement Learning from Verifiable Rewards","primary_cat":"cs.LG","submitted_at":"2026-05-20T17:53:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DelTA estimates token coefficients to amplify discriminative directions in token-gradient vectors, reweighting the RLVR surrogate to produce more contrastive side-wise centroids and yielding 3.26 and 2.62 point gains on math benchmarks for 8B and 14B Qwen3 models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21442","ref_index":71,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"torchtune: PyTorch native post-training library","primary_cat":"cs.LG","submitted_at":"2026-05-20T17:32:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"torchtune is a modular PyTorch library for LLM post-training that delivers competitive performance and memory efficiency while supporting rapid research iteration through hackable components.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21429","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"roto 2.0: The Robot Tactile Olympiad","primary_cat":"cs.RO","submitted_at":"2026-05-20T17:22:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"roto 2.0 provides a standardized benchmark for end-to-end blind tactile RL on 16-24 DOF robots, with open-sourced baselines achieving 13 Baoding ball rotations in 10 seconds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21330","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning Robust Dexterous In-Hand Manipulation from Joint Sensors with Proprioceptive Transformer","primary_cat":"cs.RO","submitted_at":"2026-05-20T15:57:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A transformer policy distilled from a privileged RL teacher enables 3.1x faster real-world cube rotation on the ORCA hand using solely joint sensor data by extracting implicit object state from temporal joint patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21311","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeCoR: Design and Control Co-Optimization for Urban Streets Using Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-20T15:39:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeCoR co-optimizes crosswalk placement and signal control via reinforcement learning on a real 750 m urban corridor, reporting 23% faster pedestrian access to crossings and 79%/65% reductions in pedestrian/vehicle wait times versus fixed-time baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21282","ref_index":48,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Stochastic MeanFlow Policies: One-Step Generative Control with Entropic Mirror Descent","primary_cat":"cs.LG","submitted_at":"2026-05-20T15:14:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Stochastic MeanFlow Policies enable one-step generative control in off-policy mirror descent by mapping noise through a MeanFlow transform, yielding tractable entropy and improved MuJoCo performance over Gaussian and generative baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21235","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LamPO: A Lambda Style Policy Optimization for Reasoning Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-20T14:24:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LamPO introduces a pairwise decomposed advantage with confidence-aware weighting to replace scalar group advantages in group-relative policy optimization for reasoning models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21214","ref_index":155,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Behavior-Consistent Deep Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-20T14:08:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"QED bounds cross-run KL divergence in Boltzmann policies by setting temperature proportional to Q-disagreement and reduces return variance by two orders of magnitude on 18 continuous-control tasks without performance loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21195","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RankE: End-to-End Post-Training for Discrete Text-to-Image Generation with Decoder Co-Evolution","primary_cat":"cs.CV","submitted_at":"2026-05-20T13:56:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RankE co-evolves AR policy and decoder via alternating ranking optimization, improving both FID and CLIP scores on LlamaGen-XL and Janus-Pro where policy-only RL degrades FID.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21180","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Domain-Adaptable Reinforcement Learning for Code Generation with Dense Rewards","primary_cat":"cs.LG","submitted_at":"2026-05-20T13:47:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A PPO-based RL framework with execution-aware dense rewards and token-level mapping improves pass@1 by 19% on MBPP and reduces execution failures by 51% on RoboEval for LLM code generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21557","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scalable Reinforcement Learning via Adaptive Batch Scaling","primary_cat":"stat.ML","submitted_at":"2026-05-20T13:46:22+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21168","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ScenePilot: Controllable Boundary-Driven Critical Scenario Generation for Autonomous Driving","primary_cat":"cs.AI","submitted_at":"2026-05-20T13:39:02+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21125","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Advantage Collapse in Group Relative Policy Optimization: Diagnosis and Mitigation","primary_cat":"cs.LG","submitted_at":"2026-05-20T12:57:37+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21123","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Linear-DPO: Linear Direct Preference Optimization for Diffusion and Flow-Matching Generative Models","primary_cat":"cs.CV","submitted_at":"2026-05-20T12:54:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Linear-DPO replaces sigmoid utility with linear utility and adds EMA reference to improve preference alignment in diffusion and flow-matching text-to-image models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21085","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decoupling Communication from Policy: Robust MARL under Bandwidth Constraints","primary_cat":"cs.MA","submitted_at":"2026-05-20T12:21:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SLIM decouples inter-agent communication from policy execution in MARL via a dedicated pathway and a normalized bandwidth budget β, yielding robust performance under tight communication limits on standard benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21008","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Survey of Audio Reasoning in Multimodal Foundation Models","primary_cat":"eess.AS","submitted_at":"2026-05-20T10:44:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A survey that provides a unified formulation of audio reasoning and reviews advances across Audio-to-Text, Audio-to-Speech, Audio-Visual, and Agentic paradigms while discussing challenges and future directions.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Supervised Fine-Tuning (SFT) and Alignment Optimization. During SFT, the model is trained on curated instruction- response pairs-often enriched with CoT annotations-to learn the structural logic of following complex audio-related commands. While SFT provides a foundation through demon- stration, subsequent alignment via Preference Optimization or Reinforcement Learning (RL), such as PPO [42] and GRPO [43], further refines the model's behavior. These methods optimize for higher-level objectives such as factual accuracy, reasoning consistency, and faithfulness to the acoustic evi- dence, ensuring the model's outputs align with human intent and logical rigor. IV. AUDIO-TO-TEXTREASONING Audio-to-text reasoning refers to the process of perceiv-"},{"citing_arxiv_id":"2605.20996","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond the Bellman Recursion: A Pontryagin-Guided Framework for Non-Exponential Discounting","primary_cat":"cs.LG","submitted_at":"2026-05-20T10:36:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PG-DPO is a new variational framework that replaces Bellman recursion with a Pontryagin-guided adjoint-MC projection for RL under non-exponential discounting and shows gains on hyperbolic and survival benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20994","ref_index":98,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Context-Invariant Safety Alignment for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-20T10:33:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces AIR, an asymmetric regularization that anchors open-ended safety prompts to verifiable ones via stop-gradient, improving invariance and accuracy when combined with group preference optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20946","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Thinking-while-speaking: A Controlled, Interleaved Reasoning Method for Real-Time Speech Generation","primary_cat":"cs.CL","submitted_at":"2026-05-20T09:32:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"InterRS enables real-time speech generation with interleaved reasoning via a controlled data pipeline, interleaved SFT, and RL using TA-Balance and Linguistic Quality rewards, yielding 13% gains on math and logic benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20911","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"For How Long Should We Be Punching? Learning Action Duration in Fighting Games","primary_cat":"cs.AI","submitted_at":"2026-05-20T08:56:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RL agents in fighting games learn to jointly predict actions and their durations, matching fixed frame-skip performance while favoring repeatable exploitative patterns against scripted bots.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"and exploration [1] problems of RL [10,2] due to the increased number of deci- sions that might be responsible for observed performance. However, if the gap between the decision points is too large, the agent becomes slow and potentially unresponsive at crucial moments. We augment the action space inStreet Fighter II - Special Champion Edition, using the FightLadder framework [3] to include a choice as to how many frames to skip. This enables RL agents to learn to autonomously decide how many frames to skip for each action in a state-dependent manner. We trained agents using proximal policy optimization (PPO) [7], training and evaluating against a variety of built-in scripted bots. The key outcomes from our experiments are"},{"citing_arxiv_id":"2605.20865","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Step Likelihood-Ratio Correction for Reinforcement Learning with Verifiable Rewards","primary_cat":"cs.LG","submitted_at":"2026-05-20T08:01:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NFPO augments the PPO surrogate with N-step forward traces to bridge local approximations and exact policy gradients, delivering tighter policy-improvement bounds and improved results on reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20863","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PlexRL: Cluster-Level Orchestration of Serviceized LLM Execution for RLVR","primary_cat":"cs.DC","submitted_at":"2026-05-20T07:55:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PlexRL multiplexes unified LLM services across RLVR jobs at the cluster level to exploit anti-correlated idle times and reduce GPU-hour costs by up to 37.58% with minimal per-job overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20834","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Conditional Equivalence of DPO and RLHF: Implicit Assumption, Failure Modes, and Provable Alignment","primary_cat":"cs.AI","submitted_at":"2026-05-20T07:26:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DPO-RLHF equivalence holds only conditionally on the optimal policy preferring human-preferred responses; otherwise DPO optimizes relative advantage and can prefer worse outputs, addressed by introducing CPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20778","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deep Reinforcement Learning Discovers a Novel Control Algorithm for Mitigating Flow-Induced Vibrations in Underactuated Tandem Cylinders","primary_cat":"physics.flu-dyn","submitted_at":"2026-05-20T06:18:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Deep reinforcement learning discovers high-frequency bang-bang and low-frequency lock-on rotary controls that suppress vibrations in fully and underactuated tandem cylinders by 70-95%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20740","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Distribution-Aware Reward: Reinforcement Learning over Predictive Distributions for LLM Regression","primary_cat":"cs.LG","submitted_at":"2026-05-20T05:43:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Distribution-Aware Reward optimizes LLM regression by treating rollouts as empirical predictive distributions and rewarding marginal improvements in CRPS quality rather than point accuracy alone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20722","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AGPO: Adaptive Group Policy Optimization with Dual Statistical Feedback","primary_cat":"cs.LG","submitted_at":"2026-05-20T05:20:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AGPO adaptively sets trust-region size and exploration temperature from group reward dispersion, entropy, and KL drift, yielding higher scores than PPO and GRPO on nine math benchmarks under fixed token budget.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20696","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Distributed Direct Preference Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-20T04:49:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"First convergence analysis of DPO under federated and decentralized training, characterizing rates via client drift, communication frequency, preference heterogeneity, and graph spectral connectivity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20682","ref_index":72,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"IndusAgent: Reinforcing Open-Vocabulary Industrial Anomaly Detection with Agentic Tools","primary_cat":"cs.CV","submitted_at":"2026-05-20T03:52:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"IndusAgent achieves state-of-the-art zero-shot performance on industrial anomaly benchmarks by using a custom Indus-CoT dataset, dynamic tool orchestration, and gated RL to optimize anomaly classification, localization, and reasoning.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"consistency, equipping it with a robust and well-calibrated policy initialization for the subsequent reinforcement learning phase. 2.4 Agentic Reinforcement Learning Group Relative Policy Optimization (GRPO).To optimize the agent's decision-making process without the prohibitive memory costs associated with traditional actor-critic architectures [71], we utilize Group Relative Policy Optimization (GRPO) [ 72]. Instead of relying on a separate value network, GRPO evaluates policy updates through a groupwise relative comparison mechanism. Specifically, for a given query image q and its corresponding ground truth a sampled from the dataset D, the system samples a batch of G distinct reasoning trajectories {o1, o2, . . . , oG} using the reference policyπ θold."},{"citing_arxiv_id":"2605.20644","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Design for Manufacturing: A Manufacturability Knowledge-Integrated Reinforcement Learning Framework for Free-Form Pipe Routing in Aeroengines","primary_cat":"cs.LG","submitted_at":"2026-05-20T03:07:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"FPRO applies Frenet-frame RL with curvature-torsion manufacturability constraints and PPO optimization to produce collision-free, fabricable pipe paths for aeroengines, outperforming Cartesian and baseline RL methods in experiments and real fabrication.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20619","ref_index":82,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SURF: Steering the Scalarization Weight to Uniformly Traverse the Pareto Front","primary_cat":"cs.LG","submitted_at":"2026-05-20T02:09:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SURF derives weight sampling rules from the arc-length CDF of the scalarization path to uniformly traverse the Pareto front in multi-objective optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20577","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mahjax: A GPU-Accelerated Mahjong Simulator for Reinforcement Learning in JAX","primary_cat":"cs.AI","submitted_at":"2026-05-20T00:33:28+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Mahjax provides a GPU-accelerated JAX implementation of Riichi Mahjong achieving up to 2 million steps per second and enabling effective tabula rasa reinforcement learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20555","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Complementing reinforcement learning with SFT through logit averaging in the post training of LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-19T23:15:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Logit averaging inside GRPO yields higher or comparable benchmark accuracy to KL-regularized GRPO without using KL terms or a critic.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20423","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OSCToM: RL-Guided Adversarial Generation for High-Order Theory of Mind","primary_cat":"cs.AI","submitted_at":"2026-05-19T19:19:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OSCToM uses RL-guided generation with an extended DSL and surrogate models to create nested belief conflict tasks, raising FANToM accuracy from 0.2% to 76% while being 6x more efficient.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20408","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Spectral Souping: A Unified Framework for Online Preference Alignment","primary_cat":"cs.LG","submitted_at":"2026-05-19T19:04:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Spectral Souping learns offline specialized policies for fine-grained preferences and merges them online using a discovered universal spectral representation for efficient LLM alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20373","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SUGAR: A Scalable Human-Video-Driven Generalizable Humanoid Loco-Manipulation Learning Framework","primary_cat":"cs.RO","submitted_at":"2026-05-19T18:24:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SUGAR turns diverse human videos into deployable humanoid loco-manipulation policies via automated prior extraction, physics refinement, and hierarchical distillation, showing scaling with data volume and zero-shot real-world transfer on six tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20005","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fine-Tuning Without Forgetting via Loss-Adaptive Learning Rates","primary_cat":"cs.LG","submitted_at":"2026-05-19T15:36:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FINCH is a loss-adaptive learning-rate schedule that reduces forgetting by 93% on average during LLM fine-tuning while matching standard task performance across several benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19940","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Robotics-Inspired Guardrails for Foundation Models in Socially Sensitive Domains","primary_cat":"cs.AI","submitted_at":"2026-05-19T15:00:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces the Grounded Observer framework that applies robotics-inspired formal constructs for runtime constraint enforcement on foundation model interaction trajectories in socially sensitive domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19909","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fair-Aurora: Comparing Fairness Strategies for Reinforcement Learning-Based Congestion Control in Multi-Flow Environments","primary_cat":"cs.NI","submitted_at":"2026-05-19T14:38:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Compares reward shaping, observation augmentation, and loss-sensitivity tuning as post-hoc fairness fixes for Aurora RL congestion control, finding modest reward shaping best preserves throughput while improving fairness in multi-flow settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19485","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Attention-Guided Reward for Reinforcement Learning-based Jailbreak against Large Reasoning Models","primary_cat":"cs.AI","submitted_at":"2026-05-19T07:36:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An attention-guided RL reward combined with diverse persuasion strategies produces higher attack success rates against large reasoning models than prior jailbreak methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19461","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Mode Collapse: Distribution Matching for Diverse Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-19T07:13:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DMPO approximates forward KL minimization in on-policy RL by aligning the policy to a group-level reward-proportional target distribution, yielding 9-12% relative gains over GRPO on NP-Bench and smaller gains on math reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19447","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What and When to Distill: Selective Hindsight Distillation for Multi-Turn Agents","primary_cat":"cs.AI","submitted_at":"2026-05-19T07:00:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SERL selectively reweights learning using task success and environment feedback to reach 90.0% success on ALFWorld and 80.1% on WebShop, outperforming RL and distillation baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19436","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CEPO: RLVR Self-Distillation using Contrastive Evidence Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-19T06:46:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CEPO sharpens token credit in RLVR by requiring tokens to be favored by the correct answer and disfavored by wrong answers drawn from rejected rollouts, delivering accuracy gains on five multimodal math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19425","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When to Stop Reusing: Dynamic Gradient Gating for Sample-Efficient RLVR","primary_cat":"cs.LG","submitted_at":"2026-05-19T06:23:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dynamic Gradient Gating monitors lm_head gradient norms to safely reuse rollout batches in RLVR, achieving up to 2.93x sample efficiency and 2.14x wall-clock speedup across math, ALFWorld, WebShop, and QA tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19416","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LambdaPO: A Lambda Style Policy Optimization for Reasoning Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-19T06:10:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LambdaPO introduces pairwise preference-based advantage estimation and a semantic density reward to extract more optimization signal from trajectory groups than GRPO's monolithic baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19358","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Taming the Thinker: Conditional Entropy Shaping for Adaptive LLM Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-19T04:41:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CES applies conditional bidirectional entropy control on top of DAPO to improve accuracy and shorten responses on mathematical benchmarks for 7B and 1.5B LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20277","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Regulating Anatomy-Aware Rewards via Trajectory-Integral Feedback for Volumetric Computed Tomography Analysis","primary_cat":"cs.CV","submitted_at":"2026-05-19T04:33:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TIF-GRPO uses integral feedback on pseudo-temporal trajectories to regulate anatomy-aware rewards in RL for clinical faithfulness in volumetric CT analysis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19293","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Domain-Adaptive Communication-Rate Optimization for Sim-to-Real Humanoid-Robot Wireless XR Teleoperation","primary_cat":"cs.IT","submitted_at":"2026-05-19T03:13:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A PAC-Bayes-guided PPO method with density-ratio weighting optimizes communication energy for accurate humanoid robot trajectory reconstruction under sim-to-real shifts in wireless teleoperation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19282","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Muon Beyond Pretraining: Spectral Failures and High-Pass Remedies for VLA and RLVR","primary_cat":"cs.LG","submitted_at":"2026-05-19T03:00:26+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Pion modifies Muon's Newton-Schulz iterations into a controllable high-pass filter that anchors dominant singular values at 1 while suppressing noisy tails, outperforming Muon and AdamW in VLA and RLVR regimes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19235","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GAE Falls Short in Imperfect-Information Self-Play Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-19T01:07:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GAE suffers from amplified variance in imperfect-info self-play RL; VRPO with Q-boosting and multi-step Expected SARSA(λ) reduces it and improves performance on mid-to-large games.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19136","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Automatically Improving Simulation Physics for Articulated Objects","primary_cat":"cs.RO","submitted_at":"2026-05-18T21:34:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A simulator-in-the-loop multi-modal method refines physical properties of incomplete 3D articulated objects to improve simulation stability and downstream robot policy performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19108","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unleashing the Power of Tree-of-Thoughts for Edge-Enabled AIGC Service Provisioning","primary_cat":"cs.DC","submitted_at":"2026-05-18T20:50:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Models ToT prompting as a DAG and introduces DSAC to optimize thought assignment in edge-enabled AIGC, achieving up to 8.32% delay reduction over PPO in simulations while cutting latency over 80% versus local execution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18723","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WaveDriver: a Laser Guide Star AO System for HWO","primary_cat":"astro-ph.IM","submitted_at":"2026-05-18T17:50:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"WaveDriver is a laser guide star AO concept whose initial simulations indicate it may be required to meet HWO primary mirror segment stability and low-order wavefront stability requirements.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18721","ref_index":3,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"General Preference Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-18T17:50:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GPRL carries a k-dimensional skew-symmetric preference structure into policy updates with per-dimension advantages and a drift monitor, yielding 56.51% length-controlled win rate on AlpacaEval 2.0 from Llama-3-8B-Instruct while outperforming SimPO and SPPO on other benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18675","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"COOPO: Cyclic Offline-Online Policy Optimization Algorithm","primary_cat":"cs.LG","submitted_at":"2026-05-18T17:15:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"COOPO is a cyclic offline-online RL algorithm that repeatedly anchors the policy to a dataset via KL-regularized updates then fine-tunes online, claiming better sample efficiency and monotonic improvement under coverage assumptions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":100,"offset":0}}