{"total":127,"items":[{"citing_arxiv_id":"2605.13536","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HLS-Seek: QoR-Aware Code Generation for High-Level Synthesis via Proxy Comparative Reward Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-13T13:47:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HLS-Seek replaces full-synthesis RL with a comparative proxy reward model plus uncertainty-triggered real checks, yielding higher correctness and better QoR than larger models at 8.5x lower training cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13534","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scaling Retrieval-Augmented Reasoning with Parallel Search and Explicit Merging","primary_cat":"cs.AI","submitted_at":"2026-05-13T13:46:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MultiSearch uses parallel multi-query retrieval plus explicit merging inside a reinforcement-learning loop to improve retrieval-augmented reasoning, outperforming baselines on seven QA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13501","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reward-Weighted On-Policy Distillation with an Open Property-Equivalence Verifier for NL-to-SVA Generation","primary_cat":"cs.AR","submitted_at":"2026-05-13T13:23:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Reward-Weighted On-Policy Distillation with an open property-equivalence verifier produces a 7B model that surpasses prior SOTA on NL-to-SVA generation across pass@1/5/10 metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13292","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"IndicMedDialog: A Parallel Multi-Turn Medical Dialogue Dataset for Accessible Healthcare in Indic Languages","primary_cat":"cs.CL","submitted_at":"2026-05-13T10:06:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A parallel multi-turn medical dialogue dataset spanning English and nine Indic languages is created from synthetic consultations to enable personalized AI healthcare interactions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13026","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Understanding and Accelerating the Training of Masked Diffusion Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-13T05:29:38+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Bell-shaped time sampling accelerates masked diffusion language model training by roughly 4x on LM1B by countering locality bias in language data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12975","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Retrieval is Cheap, Show Me the Code: Executable Multi-Hop Reasoning for Retrieval-Augmented Generation","primary_cat":"cs.AI","submitted_at":"2026-05-13T04:14:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PyRAG turns multi-hop reasoning into executable Python code over retrieval tools for explicit, verifiable step-by-step RAG.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12947","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Should an AI Workflow Release? Always-Valid Inference for Black-Box Generate-Verify Systems","primary_cat":"stat.ML","submitted_at":"2026-05-13T03:30:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A wrapper for black-box generate-verify AI pipelines that uses a conservative hard-negative reference pool and e-processes to control the probability of releasing on infeasible tasks while permitting release on feasible ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12384","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scalable Token-Level Hallucination Detection in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-12T16:47:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TokenHD uses a scalable data synthesis engine and importance-weighted training to create token-level hallucination detectors that work on free-form text and scale from 0.6B to 8B parameters, outperforming larger reasoning models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12237","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UHR-Micro: Diagnosing and Mitigating the Resolution Illusion in Earth Observation VLMs","primary_cat":"cs.CV","submitted_at":"2026-05-12T15:07:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VLMs show a resolution illusion on UHR Earth observation imagery where higher resolution does not improve micro-target perception; UHR-Micro benchmark and MAP-Agent address this via evidence-centered active inspection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12227","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Combining On-Policy Optimization and Distillation for Long-Context Reasoning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-12T15:04:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"dGRPO merges outcome-based policy optimization with dense teacher guidance from on-policy distillation, yielding more stable long-context reasoning on the new LongBlocks synthetic dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12201","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Uncertainty Quantification for LLM-based Code Generation","primary_cat":"cs.SE","submitted_at":"2026-05-12T14:40:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RisCoSet applies multiple hypothesis testing to construct risk-controlling partial-program prediction sets for LLM code generation, achieving up to 24.5% less code removal than prior methods at equivalent risk levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11974","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Order Fairness: Mitigating LLMs Order Sensitivity through Dual Group Advantage Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-12T11:31:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DGAO uses reinforcement learning to optimize LLMs for both accuracy and order stability by balancing intra-group accuracy advantages and inter-group stability advantages.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11922","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StepCodeReasoner: Aligning Code Reasoning with Stepwise Execution Traces via Reinforcement Learning","primary_cat":"cs.SE","submitted_at":"2026-05-12T10:36:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"StepCodeReasoner aligns code reasoning with verifiable stepwise execution traces via print anchors and bi-level GRPO reinforcement learning, reaching SOTA results on CRUXEval (91.1%) and LiveCodeBench (86.5%) for a 7B model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10674","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Step Rejection Fine-Tuning: A Practical Distillation Recipe","primary_cat":"cs.LG","submitted_at":"2026-05-11T14:55:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Step Rejection Fine-Tuning masks loss on erroneous steps identified by a critic LLM in unresolved trajectories, raising SWE-bench Verified resolution rate by 3.7% to 32.2% versus 2.4% for trajectory-level rejection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10187","ref_index":67,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SciVQR: A Multidisciplinary Multimodal Benchmark for Advanced Scientific Reasoning Evaluation","primary_cat":"cs.CV","submitted_at":"2026-05-11T08:38:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SciVQR is a new multimodal benchmark covering 54 scientific subfields that evaluates MLLMs on visual comprehension and multi-step reasoning, revealing significant limitations in leading models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10032","ref_index":54,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PlantMarkerBench: A Multi-Species Benchmark for Evidence-Grounded Plant Marker Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-11T05:57:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PlantMarkerBench supplies 5,550 literature sentences annotated for plant marker gene evidence validity and type across Arabidopsis, maize, rice and tomato, showing frontier LLMs handle direct expression evidence but struggle with functional, indirect and weak-support cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09594","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Trust Me, Import This: Dependency Steering Attacks via Malicious Agent Skills","primary_cat":"cs.CR","submitted_at":"2026-05-10T15:13:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Malicious Skills induce coding agents to hallucinate and import attacker-controlled packages at high rates while evading detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09505","ref_index":48,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EpiGraph: Building Generalists for Evidence-Intensive Epilepsy Reasoning in the Wild","primary_cat":"cs.AI","submitted_at":"2026-05-10T12:27:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EpiGraph creates a heterogeneous epilepsy knowledge graph that boosts LLM performance on clinical reasoning tasks by 30-41% in pharmacogenomics when used with Graph-RAG.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09188","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DARE: Difficulty-Adaptive Reinforcement Learning with Co-Evolved Difficulty Estimation","primary_cat":"cs.LG","submitted_at":"2026-05-09T22:05:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DARE co-evolves difficulty estimation and policy in RL for LLMs to improve training efficiency, final performance, and inference speed by using tailored strategies for different difficulty levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09134","ref_index":84,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BoostAPR: Boosting Automated Program Repair via Execution-Grounded Reinforcement Learning with Dual Reward Models","primary_cat":"cs.AI","submitted_at":"2026-05-09T19:31:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BoostAPR boosts automated program repair by training a sequence-level assessor and line-level credit allocator from execution outcomes, then applying them in PPO to reach 40.7% on SWE-bench Verified.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08686","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Iterative Critique-and-Routing Controller for Multi-Agent Systems with Heterogeneous LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-09T04:51:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A critique-and-routing controller cast as a finite-horizon MDP with policy-gradient optimization outperforms one-shot routing baselines on reasoning benchmarks while using the strongest agent for under 25% of calls.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08568","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Different Prompts, Different Ranks: Prompt-aware Dynamic Rank Selection for SVD-based LLM Compression","primary_cat":"cs.LG","submitted_at":"2026-05-09T00:02:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PARSE trains a prompt-aware linear router on dense-model outputs to select dynamic SVD ranks, improving accuracy up to 10% at 0.6 compression ratio on LLaMA-7B while delivering 2.5x prefill and 2.4x decode speedups.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08382","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SecureForge: Finding and Preventing Vulnerabilities in LLM-Generated Code via Prompt Optimization","primary_cat":"cs.CR","submitted_at":"2026-05-08T18:40:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SecureForge audits LLM code for vulnerabilities, builds a synthetic prompt corpus via Markovian sampling, and optimizes system prompts to cut security issues by up to 48% while preserving unit test performance, with zero-shot transfer to real prompts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08327","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Interactive Critique-Revision Training for Reliable Structured LLM Generation","primary_cat":"cs.LG","submitted_at":"2026-05-08T17:00:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DPA-GRPO trains a generator-verifier pair via group-relative policy optimization on paired counterfactual actions, improving structured output accuracy on TaxCalcBench over zero-shot and generator-only baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07961","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Graph Representation Learning Augmented Model Manipulation on Federated Fine-Tuning of LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-08T16:24:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Graph representation learning plus iterative augmented Lagrangian optimization creates stronger, harder-to-detect model manipulation attacks on federated LLM fine-tuning, cutting global accuracy by up to 26%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07775","ref_index":103,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"POETS: Uncertainty-Aware LLM Optimization via Compute-Efficient Policy Ensembles","primary_cat":"cs.LG","submitted_at":"2026-05-08T14:16:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"POETS uses compute-efficient LLM policy ensembles to implicitly perform KL-regularized Thompson sampling, delivering O(sqrt(T gamma_T)) regret bounds and state-of-the-art sample efficiency in scientific discovery tasks such as protein search and quantum circuit design.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07345","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mean-Pooled Cosine Similarity is Not Length-Invariant: Theory and Cross-Domain Evidence for a Length-Invariant Alternative","primary_cat":"cs.CL","submitted_at":"2026-05-08T06:48:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Mean-pooled cosine similarity grows with sequence length in anisotropic transformer embeddings independent of content, while CKA shows far less length dependence across code, translation, and vision tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07342","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mage: Multi-Axis Evaluation of LLM-Generated Executable Game Scenes Beyond Compile-Pass Rate","primary_cat":"cs.LG","submitted_at":"2026-05-08T06:46:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mage shows compile-pass rate is anti-correlated with functional correctness in LLM game scene generation; direct NL-to-C# yields 43% runtime but F1~0.12 structure, while IR conditioning recovers structure (F1 up to 1.0) but halves runtime, with granularity levels statistically equivalent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07248","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PaT: Planning-after-Trial for Efficient Test-Time Code Generation","primary_cat":"cs.CL","submitted_at":"2026-05-08T05:09:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PaT defers planning until after failed trials in LLM code generation, enabling heterogeneous cheap-plus-powerful model setups that match large-model performance at roughly 69% lower cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06761","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Weblica: Scalable and Reproducible Training Environments for Visual Web Agents","primary_cat":"cs.AI","submitted_at":"2026-05-07T17:17:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Weblica scales RL training for visual web agents by building thousands of reproducible environments through HTTP caching for stable replays and LLM synthesis from real sites, yielding an 8B model that beats similar open baselines on navigation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06111","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Schedule-and-Calibrate: Utility-Guided Multi-Task Reinforcement Learning for Code LLMs","primary_cat":"cs.SE","submitted_at":"2026-05-07T12:24:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ASTOR improves a single code LLM across four tasks by 9.0-9.5% over the best specialist and 7.5-12.8% over prior multi-task RL baselines via utility-driven data scheduling and adaptive KL regularization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06046","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Requests of a Feather Must Flock Together: Batch Size vs. Prefix Homogeneity in LLM Inference","primary_cat":"cs.LG","submitted_at":"2026-05-07T11:34:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Feather uses reinforcement learning and a Chunked Hash Tree to balance batch size against prefix homogeneity in LLM inference, delivering 2-10x higher throughput than existing schedulers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05833","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Role of Language Representations in Auto-Bidding: Findings and Implications","primary_cat":"cs.AI","submitted_at":"2026-05-07T08:08:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SemBid injects LLM-encoded Task, History, and Strategy semantics as tokens into offline bidding trajectories and uses self-attention to outperform numerical-only baselines in performance, constraint satisfaction, and robustness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05741","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HyperLens: Quantifying Cognitive Effort in LLMs with Fine-grained Confidence Trajectory","primary_cat":"cs.AI","submitted_at":"2026-05-07T06:32:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HyperLens reveals that deeper transformer layers magnify small confidence changes into fine-grained trajectories, allowing quantification of cognitive effort where complex tasks demand more and standard SFT can reduce it.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05285","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Attribution-Guided Continual Learning for Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-06T17:38:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"An attribution-based continual learning framework for LLMs modulates per-parameter gradients using task-specific importance scores to reduce forgetting of prior tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04980","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Conceptors for Semantic Steering","primary_cat":"cs.LG","submitted_at":"2026-05-06T14:32:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Conceptors as soft projection matrices from bipolar activations offer a multidimensional, compositional, and geometrically principled method for semantic steering in LLMs that outperforms single-vector baselines in multi-dimensional subspaces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04903","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Delta-Based Neural Architecture Search: LLM Fine-Tuning via Code Diffs","primary_cat":"cs.LG","submitted_at":"2026-05-06T13:32:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Fine-tuned 7B LLMs generating unified diffs for neural architecture refinement achieve 66-75% valid rates and 64-66% mean first-epoch accuracy, outperforming full-generation baselines by large margins while cutting output length by 75-85%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04894","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SynConfRoute: Syntax-Aware Routing for Efficient Code Completion with Small CodeLLMs","primary_cat":"cs.SE","submitted_at":"2026-05-06T13:25:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SynConfRoute routes code completions using syntax validation and token confidence, improving pass@1 by up to 31% on hard tasks and reducing accelerator usage by 58% versus always using the largest model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04763","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"How Does Chunking Affect Retrieval-Augmented Code Completion? A Controlled Empirical Study","primary_cat":"cs.SE","submitted_at":"2026-05-06T11:09:42+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Function-based chunking underperforms other strategies in RAG code completion by 3.57-5.64 points, with context length as the dominant factor.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03849","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Stream-R1: Reliability-Perplexity Aware Reward Distillation for Streaming Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-05T15:15:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Stream-R1 improves distillation of autoregressive streaming video diffusion models by adaptively weighting supervision with a reward model at both rollout and per-pixel levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03179","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Validated Prompt Bank for Malicious Code Generation: Separating Executable Weapons from Security Knowledge in 1,554 Consensus-Labeled Prompts","primary_cat":"cs.CR","submitted_at":"2026-05-04T21:42:10+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper releases a 1,554-prompt consensus-labeled bank separating executable malicious code requests from security knowledge requests, validated by five-model majority labeling with Fleiss' kappa of 0.876.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03117","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ARISE: A Repository-level Graph Representation and Toolset for Agentic Fault Localization and Program Repair","primary_cat":"cs.SE","submitted_at":"2026-05-04T19:59:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ARISE adds a data-flow-augmented repository graph and three-tier tool API to LLM agents, raising Function Recall@1 by 17 points, Line Recall@1 by 15 points, and Pass@1 repair rate to 22% on SWE-bench Lite.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02860","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Standing on the Shoulders of Giants: Stabilized Knowledge Distillation for Cross--Language Code Clone Detection","primary_cat":"cs.AI","submitted_at":"2026-05-04T17:37:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Reasoning-oriented knowledge distillation from DeepSeek-R1 plus response stabilization improves reliability and often performance of compact models for cross-language code clone detection on pairs like Python-Java and Rust-Java.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02859","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Sensors to Insight: Rapid, Edge-to-Core Application Development for Sensor-Driven Applications","primary_cat":"cs.DC","submitted_at":"2026-05-04T17:36:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A pattern-based workflow engineering approach with AI assistance enables rapid development of sensor-driven applications across edge-to-core infrastructures, shown via reusable templates in environmental monitoring case studies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01394","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LiveFMBench: Unveiling the Power and Limits of Agentic Workflows in Specification Generation","primary_cat":"cs.SE","submitted_at":"2026-05-02T11:31:33+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LiveFMBench shows that direct LLM prompting for C program formal specs overestimates accuracy by ~20% due to unfaithful behaviors like deceiving provers, while agentic workflows help under low sampling but overall performance remains far below human-authored specs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00433","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Improving LLM Code Generation via Requirement-Aware Curriculum Reinforcement Learning","primary_cat":"cs.SE","submitted_at":"2026-05-01T06:10:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"REC RL improves LLM code generation by automatically assessing and optimizing requirement difficulty with adaptive curriculum sampling, yielding 1.23-5.62% Pass@1 gains over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00425","ref_index":41,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AEM: Adaptive Entropy Modulation for Multi-Turn Agentic Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2026-05-01T05:54:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"AEM adaptively modulates response-level entropy in agentic RL to improve credit assignment and exploration-exploitation balance, yielding gains on ALFWorld, WebShop, and SWE-bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27115","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Exploring the Limits of Pruning: Task-Specific Neurons, Model Collapse, and Recovery in Task-Specific Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-29T19:08:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Selective pruning of low-activation neurons in task-specific LLMs preserves accuracy better than random pruning, but removing roughly 10% of highly selective neurons triggers total collapse, with fine-tuning recovering much of the lost performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25903","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Carbon-Taxed Transformers: A Green Compression Pipeline for Overgrown Language Models","primary_cat":"cs.SE","submitted_at":"2026-04-28T17:48:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"CTT is a compression pipeline for LLMs that achieves up to 49x memory reduction, 10x faster inference, 81% lower CO2 emissions, and retains 68-98% accuracy on code clone detection, summarization, and generation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25599","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PLMGH: What Matters in PLM-GNN Hybrids for Code Classification and Vulnerability Detection","primary_cat":"cs.SE","submitted_at":"2026-04-28T13:05:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Controlled experiments show PLM-GNN hybrids improve code tasks over GNN-only baselines, with PLM source having larger impact than GNN backbone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}