{"total":26,"items":[{"citing_arxiv_id":"2605.21822","ref_index":34,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Implicit Safety Alignment from Crowd Preferences","primary_cat":"cs.AI","submitted_at":"2026-05-20T23:44:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A hierarchical framework extracts implicit safety criteria from crowd preferences and composes them via high-level policy to reduce safety violations in downstream RL tasks without explicit safety rewards.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21168","ref_index":41,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"ScenePilot: Controllable Boundary-Driven Critical Scenario Generation for Autonomous Driving","primary_cat":"cs.AI","submitted_at":"2026-05-20T13:39:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ScenePilot uses RSS-derived physical feasibility score and online-learned AV-risk predictor in constrained RL with feasibility-aware shielding to generate boundary-band scenarios, yielding +6.2 pp higher collision rates on SafeBench while preserving validity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19592","ref_index":47,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Implicit Action Chunking for Smooth Continuous Control","primary_cat":"cs.RO","submitted_at":"2026-05-19T09:35:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Dual-Window Smoothing uses an execution window for deterministic smoothness and a value window to correct critic bias, plus a first-order temporal regularizer, to achieve smoother RL control than explicit chunking or standard baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17928","ref_index":8,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Transfer Learning for Customized Car Racing Environments","primary_cat":"cs.RO","submitted_at":"2026-05-18T06:37:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"The study applies transfer learning to deep RL in OpenAI car racing, observing that model-based approaches outperform model-free methods and that transfer boosts target domain performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17486","ref_index":146,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DyGRO-VLA: Cross-Task Scaling of Vision-Language-Action Models via Dynamic Grouped Residual Optimization","primary_cat":"cs.RO","submitted_at":"2026-05-17T14:55:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DyGRO-VLA is a two-stage optimization framework for cross-task scaling of Vision-Language-Action models via dynamic grouped residual optimization in RL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17431","ref_index":44,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MATE: Solving Contextual Markov Decision Processes with Memory of Accumulated Transition Embeddings","primary_cat":"cs.LG","submitted_at":"2026-05-17T12:52:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MATE uses permutation-invariant sum-aggregated memory of transition embeddings to solve CMDPs with online adaptation and computational advantages over Transformers and RNNs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16720","ref_index":52,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Compositional Adversarial Training for Robust Visual Watermarking","primary_cat":"cs.CV","submitted_at":"2026-05-16T00:07:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAT trains watermark detectors against adaptive compositional adversaries using differentiable attack selection, yielding up to 63.5% capacity gains on hard attacks versus random-augmentation baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15935","ref_index":18,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Dynamic Plasma Shape Control with Arbitrary Sensor Subsets","primary_cat":"cs.RO","submitted_at":"2026-05-15T13:15:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reinforcement learning agent trained in DIII-D tokamak simulator achieves 2.01 cm mean shape error on held-out data, tracks dynamic targets, and remains functional under 30% random sensor dropout with direct transfer to experimental shots.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16429","ref_index":4,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"QuantFPFlow: Quantum Amplitude Estimation for Fokker--Planck Policy Optimisation in Continuous Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-14T18:35:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"QuantFPFlow uses quantum amplitude estimation in a Fokker-Planck RL framework to achieve O(1/ε) partition function estimation and reports improved global optimum discovery plus better scaling in continuous control tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14301","ref_index":13,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Language-Induced Priors for Domain Adaptation","primary_cat":"cs.LG","submitted_at":"2026-05-14T03:07:48+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Language-Induced Priors from LLMs guide source selection in cold-start domain adaptation through an EM algorithm, matching oracle MSE under a correct prior and remaining asymptotically consistent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12379","ref_index":51,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Discrete Flow Matching for Offline-to-Online Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-12T16:44:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DRIFT enables stable offline-to-online fine-tuning of CTMC policies in discrete RL via advantage-weighted discrete flow matching, path-space regularization, and candidate-set approximation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12058","ref_index":55,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Holder Policy Optimisation","primary_cat":"cs.LG","submitted_at":"2026-05-12T12:45:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HölderPO unifies token-level aggregation in GRPO via the Hölder mean with a tunable p parameter and annealing schedule, delivering 54.9% average accuracy on math benchmarks and 93.8% success on ALFWorld.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11491","ref_index":31,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Understanding and Preventing Entropy Collapse in RLVR with On-Policy Entropy Flow Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-12T04:08:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OPEFO prevents entropy collapse in RLVR by rescaling token updates according to their entropy change contributions, yielding more stable optimization and better results on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09824","ref_index":9,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Geometric Pareto Control: Riemannian Gradient Flow of Energy Function via Lie Group Homotopy","primary_cat":"eess.SY","submitted_at":"2026-05-11T00:01:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Geometric Pareto Control embeds Pareto solutions in a Lie group submanifold and navigates via Riemannian gradient flow to achieve 100% feasibility and low suboptimality in control tasks without retraining.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"indicators are mapped to urgency potentials via singular perturbation potential functions: ϕi(sk) = exp (ki·δi(sk) ϵi ) −1, k i >0, ϵi∈(0,1](8) wherek i encodes the intrinsic priority of objectiveiandϵi controls the steepness of the potential wall near the constraint boundary. The semantic coordinate is obtained by normalizing the urgency potentials with a positive baselineρ>0: σi(sk) = ϕi(sk) +ρ M∑ j=1 ( ϕj(sk) +ρ ) (9) The baselineρ >0prevents division by zero during nominal operation (allδi = 0⇒allϕi = 0), and sets the default coordinate to the uniform pointσ=1/M(equal economic priority) when no constraint is active. This construction ensuresσk∈∆M−1at all times. Asδi→1, the exponential growth ofϕi causes σi →1continuously (sinceϕi ≫ρnear the constraint boundary), without any threshold or rule-based"},{"citing_arxiv_id":"2605.11020","ref_index":24,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Trust Region Inverse Reinforcement Learning: Explicit Dual Ascent using Local Policy Updates","primary_cat":"cs.LG","submitted_at":"2026-05-10T15:32:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TRIRL enables explicit dual-ascent IRL via trust-region local policy updates that guarantee monotonic improvement without full RL solves per iteration, outperforming prior imitation methods by 2.4x aggregate IQM and recovering generalizable rewards.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09214","ref_index":38,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Fast Rates for Offline Contextual Bandits with Forward-KL Regularization under Single-Policy Concentrability","primary_cat":"cs.LG","submitted_at":"2026-05-09T23:17:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The paper establishes the first tilde O(epsilon^{-1}) upper bounds and matching lower bounds for forward-KL-regularized offline contextual bandits under single-policy concentrability in both tabular and general function approximation settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07837","ref_index":46,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Approximation-Free Differentiable Oblique Decision Trees","primary_cat":"cs.LG","submitted_at":"2026-05-08T15:04:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DTSemNet gives an exact, invertible neural-network encoding of hard oblique decision trees that supports direct gradient training for both classification and regression without probabilistic softening or quantized estimators.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07174","ref_index":40,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Repeated Deceptive Path Planning against Learnable Observer","primary_cat":"cs.AI","submitted_at":"2026-05-08T03:10:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Deceptive Meta Planning (DeMP) uses two-level optimization to sustain deception against learning observers by combining short-term adaptation with meta-level learning of observer updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06825","ref_index":27,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Randomness is sometimes necessary for coordination","primary_cat":"cs.AI","submitted_at":"2026-05-07T18:27:15+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Structured per-agent randomness via ranked masking in attention allows symmetric agents to break ties and coordinate, achieving perfect success on symmetric tasks where deterministic policies fail and enabling zero-shot transfer across team sizes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06139","ref_index":53,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Listwise Policy Optimization: Group-based RLVR as Target-Projection on the LLM Response Simplex","primary_cat":"cs.LG","submitted_at":"2026-05-07T12:38:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Listwise Policy Optimization explicitly performs target-projection on the LLM response simplex, unifying and improving group-based RLVR methods with monotonic improvement and flexible divergences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05373","ref_index":37,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Neural Co-state Policies: Structuring Hidden States in Recurrent Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-06T18:53:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Recurrent RL policies can have their hidden states aligned with PMP co-states through a derived loss, yielding robust performance on partially observable control tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05172","ref_index":51,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"When Life Gives You BC, Make Q-functions: Extracting Q-values from Behavior Cloning for On-Robot Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2026-05-06T17:40:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Q2RL extracts Q-values from a BC policy and applies Q-gating to enable efficient offline-to-online RL, outperforming baselines on D4RL/robomimic tasks and achieving up to 100% success on real-robot manipulation in 1-2 hours.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03065","ref_index":108,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"OGPO: Sample Efficient Full-Finetuning of Generative Control Policies","primary_cat":"cs.LG","submitted_at":"2026-05-04T18:36:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OGPO enables sample-efficient full-finetuning of generative control policies via off-policy critics and modified PPO, achieving SOTA on robot manipulation tasks while rescuing poorly initialized behavior cloning policies without expert data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01862","ref_index":72,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"QHyer: Q-conditioned Hybrid Attention-mamba Transformer for Offline Goal-conditioned RL","primary_cat":"cs.LG","submitted_at":"2026-05-03T13:11:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"QHyer replaces return-to-go with a state-conditioned Q-estimator and adds a gated hybrid attention-mamba backbone to achieve state-of-the-art performance in offline goal-conditioned RL on both Markovian and non-Markovian datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19641","ref_index":38,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Regulation Zero 2: A Flow-Centric Sequential Regulation Planning Framework to Counter Regulation Cascading in Pre-tactical Air Traffic Flow Management","primary_cat":"math.OC","submitted_at":"2026-04-21T16:30:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Regulation Zero 2 applies hierarchical MCTS with a local proposal engine and FPFS reward estimation to optimize sequences of flow regulations in ATFM, outperforming flight-centric baselines while limiting network impact.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18493","ref_index":33,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Too Correct to Learn: Reinforcement Learning on Saturated Reasoning Data","primary_cat":"cs.LG","submitted_at":"2026-04-20T16:43:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A parameter-free sampling strategy called CUTS combined with Mixed-CUTS training prevents mode collapse in RL for saturated LLM reasoning tasks and raises AIME25 Pass@1 accuracy by up to 15.1% over standard GRPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}