{"total":1917,"items":[{"citing_arxiv_id":"2607.02502","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DemoPSD: Disagreement-Modulated Policy Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-07-02T17:58:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DemoPSD uses a reverse-KL barycenter target modulated by distribution discrepancy for selective teacher guidance in LLM self-distillation, claiming leakage attenuation, exploration preservation, and superior performance on SciKnowEval and GPQA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02497","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Seek to Segment: Active Perception for Panoramic Referring Segmentation","primary_cat":"cs.CV","submitted_at":"2026-07-02T17:56:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces APRS task and PanoSeeker agent using VLM plus EgoSphere memory for active 360° search and segmentation, outperforming baselines on a new benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02479","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EAGLE-360: Embodied Active Global-to-Local Exploration in 360$^\\circ$","primary_cat":"cs.CV","submitted_at":"2026-07-02T17:47:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EAGLE-360 introduces a global-to-local exploration framework for 360° visual search, adapting RoPE Rolling, creating a new VQA dataset, and using SFT+GRPO training to claim SOTA performance with 8x accuracy gain.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02431","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WorldSample: Closed-loop Real-robot RL with World Modelling","primary_cat":"cs.RO","submitted_at":"2026-07-02T17:00:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"WorldSample generates synthetic transitions from a post-trained world model grounded in real rollouts and uses Policy-Paced Learning to improve RL policies, reporting 28% higher success rates and 59% fewer training steps on contact-rich robot tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02407","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Text-Driven 3D Indoor Scene Synthesis in Non-Manhattan Environments","primary_cat":"cs.AI","submitted_at":"2026-07-02T16:40:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"SPG-Layout combines statistical object priors with hierarchical large-object-first placement to produce physically plausible text-driven 3D scenes in non-Manhattan rooms and outperforms baselines on a new 500-scene benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02390","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DecompRL: Solving Harder Problems by Learning Modular Code Generation","primary_cat":"cs.LG","submitted_at":"2026-07-02T16:25:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DecompRL is an RL method that learns modular code decomposition for LLMs, enabling exponential candidate generation via recombination to solve harder coding problems with lower GPU cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02291","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Optimizing Visual Generative Models via Distribution-wise Rewards","primary_cat":"cs.LG","submitted_at":"2026-07-02T15:08:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Distribution-wise rewards with subset-replace strategy and post-hoc merging improve FID-50K on SiT (8.30 to 5.77) and EDM2 (3.74 to 3.52) while preserving diversity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02220","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DetailAnywhere: Fashion Detail Generation via Cross-Modal Feature Alignment Distillation","primary_cat":"cs.CV","submitted_at":"2026-07-02T14:26:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Formalizes Fashion Detail Generation task, releases FDBench benchmark with 40K+ pairs, and proposes CFAD distillation method plus RL consistency reward that outperforms open-source baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02073","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Evidence-State Rewards for Long-Context Reasoning","primary_cat":"cs.AI","submitted_at":"2026-07-02T12:11:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Maven is an RL method using answer-conditioned evidence-state values to assign rewards to add, link, and drop actions on evidence memory, outperforming outcome-only baselines on LongBench v2, LongReason, and RULER.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01927","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TUDUM: A Turkish-Thinking Reasoning Pipeline for Qwen3.5-27B","primary_cat":"cs.CL","submitted_at":"2026-07-02T09:22:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"TUDUM applies LoRA-based SFT on 15,991 Turkish reasoning examples followed by GRPO reinforcement learning on Turkish math problems to a 27B Qwen model, producing shorter Turkish reasoning traces with mixed benchmark results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01897","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rank-Then-Act: Reward-Free Control from Frame-Order Progress","primary_cat":"cs.LG","submitted_at":"2026-07-02T08:50:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RTA trains a VLM as a progress ordinal scorer via GRPO on shuffled expert frames and uses Spearman rank correlation with temporal indices as a bounded RL reward, matching or exceeding prior video reward methods on discrete and continuous control benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01784","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SpaceEra++: A Unified Framework Towards 3D Spatial Reasoning in Video","primary_cat":"cs.CV","submitted_at":"2026-07-02T06:56:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"SpaceEra++ adds ScenePick frame sampling and SpaceAlign pairwise constraints to the prior SpaceEra system, claiming consistent benchmark gains for 3D video spatial reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01764","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mastermind: Strategy-grounded Learning for Repository-Scale Vulnerability Reproduction","primary_cat":"cs.AI","submitted_at":"2026-07-02T06:27:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mastermind's dual-loop planner learns transferable strategies via SFT and milestone GRPO, raising GPT-5.5 executor pass rate on 200 held-out CyberGym tasks from 60% to 84.5%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01707","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LASER: A Corrective Lens for LVLMs via Visual Attention Preservation and Sink Suppression","primary_cat":"cs.CV","submitted_at":"2026-07-02T04:59:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LASER uses Visual Grounding Reward and Sink Suppression Reward to preserve visual attention trajectories and suppress sink tokens, reducing visual forgetting in LVLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01678","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SCAPE: Accurate and Efficient LLM Training with Extreme Sparse Communication","primary_cat":"cs.LG","submitted_at":"2026-07-02T04:10:42+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SCAPE enables 90-99% sparse gradient communication in sharded Adam-style LLM training by deriving masks from first-moment statistics, achieving up to 43.3% faster pre-training on Llama-500M with no loss in validation loss or downstream accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01490","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Don't Let Gains FADE: Breaking Down Policy Gradient Weights in RL","primary_cat":"cs.LG","submitted_at":"2026-07-01T21:39:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FADE is a self-adapting advantage for policy-gradient RL that reads training dynamics to balance positive/negative gradient mass and difficulty focus, yielding faster peak performance and better accuracy-diversity trade-offs than static baselines on LLM reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01480","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Procedural Memory Distillation: Online Reflection for Self-Improving Language Models","primary_cat":"cs.AI","submitted_at":"2026-07-01T21:20:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PMD extracts and distills cross-episode procedural knowledge from RL rollouts into LLM policies at three abstraction levels, yielding 3.8-13.6% gains over SDPO on SCIKNOWEVAL and LIVECODEBENCH via co-evolution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01470","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"World Feedback for Clinical Agents: Diagnosing RL in FHIR Environments","primary_cat":"cs.AI","submitted_at":"2026-07-01T21:02:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MedAgentBench-v3 shows capability ceilings and format-knowledge barriers limit pure RL to 18.2% while rule-based SFT reaches 34.1% on clinical protocol tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01465","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Next-Token Prediction: An RLVR Proof of Concept for Tool-Use Agents on Atlassian Workflows","primary_cat":"cs.AI","submitted_at":"2026-07-01T20:55:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"RLVR training on five synthetic Atlassian API environments raises average tool-use reward for Qwen models from 0.35-0.92 to 0.95-1.00 on four non-degenerate scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01455","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Token Geometry","primary_cat":"cs.LG","submitted_at":"2026-07-01T20:21:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Ember is a memory-efficient optimizer for token embeddings that exploits distinct gradient geometry and models token trajectories as 1D rays.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01440","ref_index":70,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FaithMed: Training LLMs For Faithful Evidence-Based Medical Reasoning","primary_cat":"cs.CL","submitted_at":"2026-07-01T20:02:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FaithMed applies reinforcement learning with process-level rewards derived from evidence-based medicine rubrics to improve both task performance and reasoning faithfulness in medical LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01232","ref_index":8,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Is One Layer Enough? Training A Single Transformer Layer Can Match Full-Parameter RL Training","primary_cat":"cs.LG","submitted_at":"2026-07-01T17:59:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A single middle transformer layer trained in isolation recovers most RL post-training gains in LLMs, with gains concentrated in middle layers across models, algorithms, and tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01191","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Perceive-to-Reason: Decoupling Perception and Reasoning for Fine-Grained Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-07-01T17:24:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"P2R decouples perception from reasoning in VLMs via a two-stage process and PRA-GRPO alternating RL training, reporting gains such as 93.2% on V-Star for the 4B model over its Qwen3-VL backbone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01181","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Right in the Right Way: LM Training with Verifiable Rewards and Human Demonstrations","primary_cat":"cs.LG","submitted_at":"2026-07-01T17:13:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Adversarial training combines verifiable RL rewards with a discriminator proxy for human outputs to jointly optimize accuracy and non-verifiable qualities like naturalness in language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01170","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Diffusion-GR2: Diffusion Generative Reasoning Re-ranker","primary_cat":"cs.IR","submitted_at":"2026-07-01T17:02:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Diffusion-GR2 converts an AR reasoning re-ranker to block-diffusion via CFT, OPD, and RL stages, recovering near-parity accuracy on Amazon Beauty with 2.4-3.5x decode speedup.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01083","ref_index":5,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Staleness-Learning Rate Scaling Laws for Asynchronous RLHF","primary_cat":"cs.LG","submitted_at":"2026-07-01T15:40:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Stale rollouts introduce O(S * eta) surrogate-gradient bias in async GRPO, yielding stability condition eta << min{R_batch / (S * G_upd), R_crit / (T * G_upd)} under smoothness assumptions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01050","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GeoSearcher: Anchor-Guided Progressive Reasoning for Remote Sensing Visual Grounding with Process Supervision","primary_cat":"cs.CV","submitted_at":"2026-07-01T15:12:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GeoSearcher introduces anchor-centric reasoning supervised fine-tuning and process-faithful group relative policy optimization to improve MLLM-based remote sensing visual grounding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00924","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Graph-Native Reinforcement Learning Enables Traceable Scientific Hypothesis Generation through Conceptual Recombination","primary_cat":"cs.AI","submitted_at":"2026-07-01T13:26:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Graph-PRefLexOR fine-tunes graph-native models with GRPO to organize reasoning into phases, yielding 40-65% gains in traceable hypothesis generation and 2-3x semantic diversity on 100 materials science questions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00881","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniView-Space: Reinforcing Spatial Reasoning via Multi-Perspective Spatial Mapping","primary_cat":"cs.CV","submitted_at":"2026-07-01T12:45:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OmniView-Space framework with MPSM, tool-guided reasoning, and distillation achieves SOTA on spatial reasoning benchmarks for MLLMs while reducing external geometry dependencies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00867","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EFlow: Learning Evidence Flow for Long-Video Reasoning with Adaptive Reflection","primary_cat":"cs.CV","submitted_at":"2026-07-01T12:32:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EFlow separates temporal grounding from logical reasoning via two CoT stages and adds confidence-aware reflection, trained via SFT and RL on custom trajectory data, yielding gains on five video benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00535","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Flow-Map GRPO: Reinforcement Learning for Few-Step Flow-Map Generators via Anchored Stochastic Composition","primary_cat":"cs.LG","submitted_at":"2026-07-01T07:25:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Flow-Map GRPO uses anchored stochastic flow map composition to enable GRPO-based RL alignment of deterministic few-step flow-map generators while preserving their marginal paths.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00531","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Active-GRPO: Adaptive Imitation and Self-Improving Reasoning for Molecular Optimization","primary_cat":"cs.LG","submitted_at":"2026-07-01T07:22:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Active-GRPO reaches 0.1773 average SRxSim on TOMG-Bench MOLOPT by adaptively switching between imitation and self-reinforcement while upgrading references, outperforming GRPO and RePO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00482","ref_index":76,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Know When to Stop: Segment-Level Credit Assignment for Reducing Overthinking","primary_cat":"cs.CL","submitted_at":"2026-07-01T06:09:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DASH assigns segment-level credit in reasoning traces using drift toward ground-truth answers, yielding 50.8% accuracy on AIME25 versus 45.4% for GRPO while reducing overthinking behaviors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00465","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StochasT: Learning with Stochastic Turn Depth for Visual Instruction Tuning","primary_cat":"cs.CV","submitted_at":"2026-07-01T05:34:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StochasT uses stochastic clustering of language tasks into varying turn depths for the same image to improve LVLMs on both single-turn and multi-turn scenarios without discarding data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00446","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VideoSearch-R1: Iterative Video Retrieval and Reasoning via Soft Query Refinement","primary_cat":"cs.CV","submitted_at":"2026-07-01T04:59:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VideoSearch-R1 achieves SOTA on VCMR across three datasets via iterative retrieval, latent-space soft query refinement, and GRPO training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00407","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Personalization as Inverse Planning: Learning Latent Design Intents for Agentic Slide Generation via Structural Denoising","primary_cat":"cs.AI","submitted_at":"2026-07-01T04:05:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SPIRE approximates page-level slide personalization by training agents to denoise corrupted slide structures via collaborative RL, claiming a proof of consistency as a surrogate for inverse planning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00361","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReShift: Aha-Moment-Driven Reasoning-Level Backdoor Attacks on Vision-Language Models","primary_cat":"cs.CR","submitted_at":"2026-07-01T02:59:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReShift is a reasoning-level backdoor framework for VLMs that uses poisoned data construction and joint optimization to shift CoT trajectories on trigger while preserving surface coherence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00260","ref_index":14,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Do Multimodal Large Language Models Need Reasoning to Classify Dementia from Speech?","primary_cat":"eess.AS","submitted_at":"2026-06-30T23:12:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DeTAiL uses internal representations from reasoning MLLMs via an adaptor and RL to outperform text-rationale methods and baselines for speech-based dementia classification on two datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00208","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SLIM-RL: Risk-Budgeted Random-Masking RL for Diffusion LLMs Without Trajectory Slicing","primary_cat":"cs.CL","submitted_at":"2026-06-30T21:38:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SLIM-RL matches or exceeds TraceRL performance on MATH500, GSM8K, MBPP and HumanEval for diffusion LLMs by risk-budgeted random-masking RL without trajectory slicing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00164","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Verifiable Rewards for Calibrated Probabilistic Forecasting","primary_cat":"cs.LG","submitted_at":"2026-06-30T20:42:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A verifiable empirical win rate reward combined with gradient masking enables RL training of a 7B model to reach betting-market calibration on NFL win probabilities using only outcome data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00152","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GRPO, Dr. GRPO, and DAPO Are Three Operations on One Number: The Group-Standard-Deviation Identity","primary_cat":"cs.LG","submitted_at":"2026-06-30T20:28:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GRPO, Dr. GRPO, and DAPO are three settings of one dial on the group standard deviation of binary rewards, unified by the group-standard-deviation identity where disagreement equals update magnitude.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00147","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RareDxR1: Autonomous Medical Reasoning for Rare Disease Diagnosis Beyond Human Annotation","primary_cat":"cs.AI","submitted_at":"2026-06-30T20:25:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RareDxR1 is an LLM trained via knowledge internalization, reflection-enhanced sampling, and curriculum RL to perform open-domain rare disease diagnosis from raw notes without human-labeled phenotypes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.32039","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GEAR: Guided End-to-End AutoRegression for Image Synthesis","primary_cat":"cs.CV","submitted_at":"2026-06-30T17:59:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GEAR jointly trains VQ tokenizer and AR generator end-to-end via dual hard/soft read-out and representation alignment, achieving up to 10x faster ImageNet gFID convergence than LlamaGen-REPA while generalizing across quantizers and to text-to-image.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.32034","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"QVal: Cheaply Evaluating Dense Supervision Signals for Long-Horizon LLM Agents","primary_cat":"cs.LG","submitted_at":"2026-06-30T17:58:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QVal is a new evaluation framework that directly measures dense supervision quality via Q-alignment to a reference policy, showing simple prompting baselines outperform 21 other methods across environments and models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.32032","ref_index":82,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforcement Learning with Metacognitive Feedback Elicits Faithful Uncertainty Expression in LLMs","primary_cat":"cs.CL","submitted_at":"2026-06-30T17:56:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RLMF uses quality of model self-judgments to refine RL rankings and select training data, achieving SOTA faithful calibration while preserving accuracy and outperforming standard RL by up to 63%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.32017","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TRIAGE: Role-Typed Credit Assignment for Agentic Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-06-30T17:48:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TRIAGE augments GRPO with role-typed segment rewards derived from a judge that detects regression and exploration, yielding higher success rates and fewer turns on ALFWorld, Search-QA, and WebShop.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.32012","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoMet: Context and Multiplicity Decomposition for Multimodal Uncertainty Estimation","primary_cat":"cs.LG","submitted_at":"2026-06-30T17:46:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CoMet decomposes MLLM uncertainty into context-specific and multiplicity-specific terms estimated by a trained post-hoc module, improving performance on open-ended multimodal benchmarks and hallucination detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31986","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoLT: Teaching Multi-Modal Models to Think with Chain of Latent Thoughts","primary_cat":"cs.CV","submitted_at":"2026-06-30T17:24:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoLT replaces text-based chain-of-thought in MLLMs with 3-step latent thought chains supervised by a removable external decoder in forward and backward modes, yielding 10.1x faster inference on eight benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31984","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GR2 Technical Report","primary_cat":"cs.IR","submitted_at":"2026-06-30T17:22:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GR2 applies mid-training on semantic IDs, reasoning distillation, RL with conditional verifiable rewards, and a context compressor to re-ranking in industrial recsys, reporting +18.7% R@1 over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31732","ref_index":120,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniCoder: Unified Visual-to-Code Generation via Symbolic Rewards and Reference-Guided Code Optimization","primary_cat":"cs.CV","submitted_at":"2026-06-30T14:29:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniCoder applies symbolic attribute alignment via an auxiliary LLM and reference-guided optimization in RL to achieve SOTA visual-to-code generation on ChartMimic, UniSVG, Design2Code, and ScreenBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}