{"total":61,"items":[{"citing_arxiv_id":"2605.13779","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MinT: Managed Infrastructure for Training and Serving Millions of LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-13T16:59:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MinT enables efficient management of million-scale LoRA-adapted LLM policies over shared 1T-parameter base models by moving only small adapters through training and serving pipelines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13643","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Prefix Teach, Suffix Fade: Local Teachability Collapse in Strong-to-Weak On-Policy Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-13T15:05:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Local teachability collapse in trajectory suffixes makes uniform dense supervision suboptimal in strong-to-weak OPD; truncating at BIC-style change points on teacher margin improves performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13139","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SWE-Cycle: Benchmarking Code Agents across the Complete Issue Resolution Cycle","primary_cat":"cs.SE","submitted_at":"2026-05-13T08:05:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SWE-Cycle benchmark shows sharp drops in code agent success rates from isolated tasks to full autonomous issue resolution, highlighting cross-phase dependency 
issues.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12882","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CiteVQA: Benchmarking Evidence Attribution for Trustworthy Document Intelligence","primary_cat":"cs.CL","submitted_at":"2026-05-13T01:54:42+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"CiteVQA requires models to cite specific document regions with bounding boxes alongside answers and finds that even the strongest MLLMs frequently cite the wrong region, with top SAA scores of only 76.0 for closed models and 22.5 for open-source ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12481","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ToolCUA: Towards Optimal GUI-Tool Path Orchestration for Computer Use Agents","primary_cat":"cs.AI","submitted_at":"2026-05-12T17:57:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ToolCUA introduces a trajectory scaling pipeline and staged RL to optimize GUI-tool switching, reaching 46.85% accuracy on OSWorld-MCP for a 66% relative gain over baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12070","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Missing Old Logits in Asynchronous Agentic RL: Semantic Mismatch and Repair Methods for Off-Policy Correction","primary_cat":"cs.LG","submitted_at":"2026-05-12T12:57:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Missing old logits in async agentic RL entangle discrepancy and staleness terms in PPO off-policy 
correction; exact acquisition methods and revised PPO-EWMA restore decoupled updates with reported gains in speed and performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10912","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WildClawBench: A Benchmark for Real-World, Long-Horizon Agent Evaluation","primary_cat":"cs.CL","submitted_at":"2026-05-11T17:49:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"A new native-runtime benchmark reveals that current frontier AI agents succeed on at most 62 percent of realistic long-horizon CLI tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10876","ref_index":85,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AssayBench: An Assay-Level Virtual Cell Benchmark for LLMs and Agents","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:27:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AssayBench is a new gene-ranking benchmark for phenotypic CRISPR screens that shows zero-shot generalist LLMs outperform both biology-specific LLMs and trainable baselines on adjusted nDCG.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10442","ref_index":44,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StereoTales: A Multilingual Framework for Open-Ended Stereotype Discovery in LLMs","primary_cat":"cs.CY","submitted_at":"2026-05-11T12:12:28+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"StereoTales shows that all tested LLMs emit harmful stereotypes in open-ended stories, with associations adapting to 
prompt language and targeting locally salient groups rather than transferring uniformly across languages.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10379","ref_index":56,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Not All Proofs Are Equal: Evaluating LLM Proof Quality Beyond Correctness","primary_cat":"cs.CL","submitted_at":"2026-05-11T11:23:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLM proofs for hard math problems show large differences in quality metrics like conciseness and cognitive simplicity that correctness-only tests miss, along with trade-offs between quality and correctness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10365","ref_index":72,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agent-ValueBench: A Comprehensive Benchmark for Evaluating Agent Values","primary_cat":"cs.AI","submitted_at":"2026-05-11T11:09:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Agent-ValueBench is the first dedicated benchmark for agent values, showing they diverge from LLM values, form a homogeneous 'Value Tide' across models, and bend under harnesses and skill steering.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"We benchmark 14 proprietary and open-weights frontier models drawn from 10 families, spanning two paradigms: Proprietary, including Claude Haiku 4.5 [61], Claude Sonnet 4.6 [62], Gemini 3 Flash Preview [63], Gemini 3.1 Pro Preview [64], GPT-5.4 [65], GPT-5.4 Mini [66], and Grok 4.20 [67]; and Open-weights, including DeepSeek V3.2 [68], GLM 5.1 [69], Kimi K2.5 [70], Llama 3.3 70B Instruct [71], MiniMax M2.7 [72], and Qwen3 30B A3B [73], Qwen3.5 
397B A17B [74]. Harnesses. We consider four mainstream harnesses, namely vanilla ReAct [4], Codex [75], Claude Code [76], and OpenClaw [7]. Metrics. Following prior work [19, 20, 30, 77], we characterize agent values along two orthogonal axes. Value Adherence captures how strongly an agent conforms to a given value dimension across its"},{"citing_arxiv_id":"2605.10189","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ProteinOPD: Towards Effective and Efficient Preference Alignment for Protein Design","primary_cat":"cs.LG","submitted_at":"2026-05-11T08:38:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ProteinOPD uses token-level on-policy distillation from multiple preference-specific teacher models into a shared student to balance competing objectives in protein design, delivering gains on targets without losing designability and an 8x speedup over RL baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09725","ref_index":11,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On-Policy Distillation with Best-of-N Teacher Rollout Selection","primary_cat":"cs.CV","submitted_at":"2026-05-10T19:49:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BRTS improves on-policy distillation by sampling multiple teacher rollouts and selecting the best one via a correctness-first then alignment priority rule, yielding gains on AIME and AMC math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08766","ref_index":63,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UserGPT Technical 
Report","primary_cat":"cs.IR","submitted_at":"2026-05-09T07:51:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UserGPT introduces a generative LLM framework with a behavior simulation engine, semantization module, and DF-GRPO post-training that scores 0.7325 on tag prediction and 0.7528 on summary generation on HPR-Bench while compressing records by up to 97.9%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08741","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Training with Harnesses: On-Policy Harness Self-Distillation for Complex Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-09T07:06:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPHSD uses harness-augmented models as teachers to distill reasoning capabilities into base LLMs, yielding strong standalone performance on classification and math tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08063","ref_index":19,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Flow-OPD: On-Policy Distillation for Flow Matching Models","primary_cat":"cs.CV","submitted_at":"2026-05-08T17:50:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Flow-OPD applies on-policy distillation to flow matching models, achieving GenEval of 92 and OCR accuracy of 94 on Stable Diffusion 3.5 Medium while avoiding the seesaw effect of multi-reward optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08013","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Learning CLI Agents with 
Structured Action Credit under Selective Observation","primary_cat":"cs.AI","submitted_at":"2026-05-08T17:02:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CLI agents trained with RL benefit from selective observation via σ-Reveal and structured credit assignment via A³ that leverages AST action sub-chains and trajectory margins.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07865","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"KL for a KL: On-Policy Distillation with Control Variate Baseline","primary_cat":"cs.LG","submitted_at":"2026-05-08T15:24:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"vOPD stabilizes on-policy distillation gradients by subtracting a closed-form per-token negative reverse KL baseline as a detached control variate, preserving unbiasedness while lowering variance and matching expensive full-vocabulary methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07363","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MISA: Mixture of Indexer Sparse Attention for Long-Context LLM Inference","primary_cat":"cs.LG","submitted_at":"2026-05-08T07:19:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MISA routes to a small subset of indexer heads via block statistics, matching full DSA performance on LongBench with 4-8x fewer heads and 3.82x speedup while recovering over 92% of selected 
tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07039","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PACEvolve++: Improving Test-time Learning for Evolutionary Search Agents","primary_cat":"cs.LG","submitted_at":"2026-05-07T23:38:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PACEvolve++ uses a phase-adaptive reinforcement learning advisor to decouple hypothesis selection from execution in LLM-driven evolutionary search, delivering faster convergence than prior frameworks on load balancing, recommendation, and protein tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06615","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When and Why SignSGD Outperforms SGD: A Theoretical Study Based on $\\ell_1$-norm Lower Bounds","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:32:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"SignSGD provably beats SGD by a factor of d under sparse noise via matched ℓ1-norm upper and lower bounds, with an equivalent result for Muon on matrices, and this predicts faster GPT-2 pretraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06230","ref_index":96,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Safactory: A Scalable Agentic Infrastructure for Training Trustworthy Autonomous Intelligence","primary_cat":"cs.AI","submitted_at":"2026-05-07T13:21:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Safactory integrates three platforms for simulation, data management, and agent 
evolution to create a unified pipeline for training trustworthy autonomous AI.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06125","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Breaking, Stale, or Missing? Benchmarking Coding Agents on Project-Level Test Evolution","primary_cat":"cs.SE","submitted_at":"2026-05-07T12:31:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TEBench is a new project-level benchmark for test evolution showing coding agents achieve only 45-49% F1 on identifying tests needing changes, with stale tests hardest due to reliance on execution failures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05846","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LoopTrap: Termination Poisoning Attacks on LLM Agents","primary_cat":"cs.CR","submitted_at":"2026-05-07T08:21:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LoopTrap is an automated red-teaming framework that crafts termination-poisoning prompts to amplify LLM agent steps by 3.57x on average (up to 25x) across 8 agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05835","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Evaluation Awareness in Language Models Has Limited Effect on Behaviour","primary_cat":"cs.CL","submitted_at":"2026-05-07T08:09:43+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Verbalised evaluation awareness in large reasoning models has only small effects on their outputs across safety and alignment 
tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05696","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Irminsul: MLA-Native Position-Independent Caching for Agentic LLM Serving","primary_cat":"cs.DC","submitted_at":"2026-05-07T05:36:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Irminsul recovers up to 83% of prompt tokens above exact-prefix matching and delivers 63% prefill energy savings per cache hit on MLA-MoE models by content-hashing CDC chunks and applying closed-form kr correction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03677","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Uni-OPD: Unifying On-Policy Distillation with a Dual-Perspective Recipe","primary_cat":"cs.LG","submitted_at":"2026-05-05T12:15:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Uni-OPD unifies on-policy distillation across LLMs and MLLMs with dual-perspective strategies that promote student exploration and enforce order-consistent teacher supervision based on outcome rewards.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02351","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MolViBench: Evaluating LLMs on Molecular Vibe Coding","primary_cat":"cs.CL","submitted_at":"2026-05-04T08:54:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MolViBench is the first benchmark designed to evaluate LLMs on generating executable programs for molecular tasks in drug 
discovery.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02187","ref_index":66,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When Alignment Isn't Enough: Response-Path Attacks on LLM Agents","primary_cat":"cs.CR","submitted_at":"2026-05-04T03:35:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A malicious relay can strategically rewrite aligned LLM outputs in BYOK agent architectures to achieve up to 99.1% attack success on benchmarks like AgentDojo and ASB.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00663","ref_index":82,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Affordance Agent Harness: Verification-Gated Skill Orchestration","primary_cat":"cs.RO","submitted_at":"2026-05-01T13:45:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Affordance Agent Harness is a verification-gated orchestration system that unifies skills via an evidence store, episodic memory priors, an adaptive router, and a self-consistency verifier to improve accuracy-cost tradeoffs in open-world affordance grounding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00342","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Making Every Verified Token Count: Adaptive Verification for MoE Speculative Decoding","primary_cat":"cs.CL","submitted_at":"2026-05-01T01:52:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EVICT adaptively truncates draft trees in MoE speculative decoding by combining drafter signals with profiled costs to retain only cost-effective 
prefixes, delivering up to 2.35x speedup over autoregressive decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00072","ref_index":204,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"XekRung Technical Report","primary_cat":"cs.CR","submitted_at":"2026-04-30T11:50:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"XekRung achieves state-of-the-art performance on cybersecurity benchmarks among same-scale models via tailored data synthesis and multi-stage training while retaining strong general capabilities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27083","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Co-Evolving Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-04-29T18:24:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoPD integrates multiple expert capabilities by running parallel RLVR training with bidirectional online policy distillation among experts, outperforming mixed RLVR and sequential OPD while surpassing domain-specific experts on text-image-video reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26752","ref_index":50,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GLM-5V-Turbo: Toward a Native Foundation Model for Multimodal Agents","primary_cat":"cs.CV","submitted_at":"2026-04-29T14:49:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GLM-5V-Turbo integrates multimodal perception as a core part of reasoning and execution for agentic tasks, reporting strong results in visual tool use and 
multimodal coding while keeping text-only performance competitive.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25806","ref_index":58,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MAIC-UI: Making Interactive Courseware with Generative UI","primary_cat":"cs.CL","submitted_at":"2026-04-28T16:15:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAIC-UI provides a zero-code authoring system for generating and iteratively editing interactive courseware from educational materials via structured analysis and incremental generation, with lab and classroom evaluations showing usability gains and learning improvements.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22577","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"QuantClaw: Precision Where It Matters for OpenClaw","primary_cat":"cs.AI","submitted_at":"2026-04-24T14:10:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"QuantClaw dynamically routes precision in agent workflows to cut cost by up to 21.4% and latency by 15.7% while keeping or improving task performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21916","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MathDuels: Evaluating LLMs as Problem Posers and Solvers","primary_cat":"cs.CL","submitted_at":"2026-04-23T17:57:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Self-play between LLMs for problem authoring and solving, scored via Rasch modeling, shows that authoring and solving skills are partially decoupled 
and that the benchmark difficulty evolves with new models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21454","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reasoning Primitives in Hybrid and Non-Hybrid LLMs","primary_cat":"cs.CL","submitted_at":"2026-04-23T09:13:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Reasoning augmentation extends the difficulty range for both architectures, but hybrid models stay robust longer than transformers as sequential dependence increases in state-based recall tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20156","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Temporally Extended Mixture-of-Experts Models","primary_cat":"cs.LG","submitted_at":"2026-04-22T03:50:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Temporally extended MoE layers using the option-critic framework with deliberation costs cut switching rates below 5% while retaining most capability on MATH, MMLU, and MMMLU.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22840","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AeSlides: Incentivizing Aesthetic Layout in LLM-Based Slide Generation via Verifiable Rewards","primary_cat":"cs.CV","submitted_at":"2026-04-21T11:59:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AeSlides is a GRPO-based RL framework that uses verifiable aesthetic metrics to optimize LLM slide generation, achieving large gains in layout quality metrics and human scores with only 
5K prompts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18543","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ClawEnvKit: Automatic Environment Generation for Claw-Like Agents","primary_cat":"cs.AI","submitted_at":"2026-04-20T17:36:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ClawEnvKit automates generation of diverse verified environments for claw-like agents from natural language, producing the Auto-ClawEval benchmark of 1,040 environments that matches human-curated quality at 13,800x lower cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17529","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Single-Language Evidence Is Insufficient for Automated Logging: A Multilingual Benchmark and Empirical Study with LLMs","primary_cat":"cs.SE","submitted_at":"2026-04-19T16:43:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MultiLogBench shows that LLM performance on automated logging varies substantially across programming languages, demonstrating that single-language evidence is insufficient for general claims about model behavior or tool design.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16972","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MCPO: Mastery-Consolidated Policy Optimization for Large Reasoning Models","primary_cat":"cs.AI","submitted_at":"2026-04-18T11:43:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MCPO fixes vanishing training signals and shrinking weights in GRPO by using a 
hinge-KL regularizer on mastered prompts and prioritizing majority-correct prompts, yielding higher pass@1 and pass@k on math tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13847","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SparseBalance: Load-Balanced Long Context Training with Dynamic Sparse Attention","primary_cat":"cs.LG","submitted_at":"2026-04-15T13:18:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SparseBalance dynamically adjusts sparsity and batches workloads to load-balance sparse attention training, delivering up to 1.33x speedup and 0.46% better long-context performance on LongBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16514","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"BARD: Bridging AutoRegressive and Diffusion Vision-Language Models Via Highly Efficient Progressive Block Merging and Stage-Wise Distillation","primary_cat":"cs.CV","submitted_at":"2026-04-15T09:17:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BARD bridges autoregressive and diffusion VLMs with progressive block merging plus stage-wise intra-diffusion distillation, delivering 3x speedup and new SOTA on open dVLMs using under 4.4M data points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13018","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Toward Autonomous Long-Horizon Engineering for ML 
Research","primary_cat":"cs.CL","submitted_at":"2026-04-14T17:55:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AiScientist improves ML research benchmarks by 10.54 points on PaperBench and reaches 81.82% Any Medal on MLE-Bench Lite through hierarchical control plus durable file-based state instead of conversational handoffs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13016","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking On-Policy Distillation of Large Language Models: Phenomenology, Mechanism, and Recipe","primary_cat":"cs.LG","submitted_at":"2026-04-14T17:54:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"On-policy distillation works when student and teacher models share thinking patterns and the teacher adds new capabilities, with success tied to alignment on a small set of high-probability tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12421","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentic Insight Generation in VSM Simulations","primary_cat":"cs.CL","submitted_at":"2026-04-14T08:11:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A two-step agentic system for extracting insights from VSM simulations achieves up to 86% accuracy with top LLMs by using progressive data discovery and slim context.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11790","ref_index":44,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ClawGuard: A Runtime Security Framework for Tool-Augmented LLM Agents 
Against Indirect Prompt Injection","primary_cat":"cs.CR","submitted_at":"2026-04-13T17:55:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ClawGuard enforces deterministic, user-derived access constraints at tool boundaries to block indirect prompt injection without changing the underlying LLM.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10866","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OccuBench: Evaluating AI Agents on Real-World Professional Tasks via Language Environment Simulation","primary_cat":"cs.CL","submitted_at":"2026-04-13T00:27:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"OccuBench is a new benchmark for AI agents on real-world occupational tasks via LLM-driven simulators, showing no model dominates all industries, implicit faults are hardest, and larger models with more reasoning perform better.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}