{"total":10,"items":[{"citing_arxiv_id":"2606.16497","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"daVinci-kernel: Co-Evolving Skill Selection, Summarization, and Utilization via RL for GPU Kernel Optimization","primary_cat":"cs.LG","submitted_at":"2026-06-15T09:58:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"daVinci-kernel is a multi-agent RL system that co-evolves skill selection, policy generation, and summarization via shared LLM and REINFORCE to optimize GPU kernels, reporting higher KernelBench scores than prior RL models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04847","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MusaCoder: Native GPU Kernel Generation with Full-Stack Training on Moore Threads GPU","primary_cat":"cs.CV","submitted_at":"2026-06-03T13:15:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MusaCoder combines kernel-oriented data synthesis, diversity-preserving fine-tuning, and stabilized RL with MooreEval to produce correct, fast GPU kernels, with its 27B model setting new SOTA on KernelBench and a MUSA variant.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28213","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning When to Optimize: Verified Optimization Skills from Expert GPU-Kernel Lineages","primary_cat":"cs.AI","submitted_at":"2026-05-27T09:32:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"KLineage derives verified optimization skills from backward lineages of expert GPU kernels to guide LLM agents toward higher-quality and more efficient kernels than memory-based baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23215","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FastKernels: Benchmarking GPU Kernel Generation in Production","primary_cat":"cs.LG","submitted_at":"2026-05-22T04:19:04+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"FastKernels is a production-aligned benchmark covering 96.2% of HuggingFace Transformers that reveals state-of-the-art kernel agents deliver at most 0.94x aggregate speedup.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17978","ref_index":76,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AutoVecCoder: Teaching LLMs to Generate Explicitly Vectorized Code","primary_cat":"cs.CL","submitted_at":"2026-05-18T07:33:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AutoVecCoder combines VecPrompt for automated intrinsic knowledge synthesis and VecRL for efficiency-aligned RL to train an 8B LLM that achieves SOTA on SimdBench SSE/AVX subsets and sometimes exceeds -O3 compiler results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16819","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AgentKernelArena: Generalization-Aware Benchmarking of GPU Kernel Optimization Agents","primary_cat":"cs.CL","submitted_at":"2026-05-16T05:25:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AgentKernelArena is a new open benchmark that measures complete AI agent workflows on 196 GPU kernel tasks with correctness, performance, and generalization checks to unseen configurations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30359","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Kernel Foundry: A Diagnosis-driven Evolutionary Kernel Optimizer with Multi-Experts","primary_cat":"cs.NE","submitted_at":"2026-05-08T03:41:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Kernel Foundry improves GPU kernel correctness and performance on KernelBench via expert-guided evolutionary search with diagnostic feedback and an experience library, reaching 100% correctness on Level 2.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04956","ref_index":5,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"KernelBenchX: A Comprehensive Benchmark for Evaluating LLM-Generated GPU Kernels","primary_cat":"cs.LG","submitted_at":"2026-05-06T14:18:36+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"KernelBenchX benchmark shows task category explains nearly three times more variance in LLM kernel correctness than method choice, iterative refinement boosts correctness but reduces performance, and quantization remains unsolved.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.28342","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Kernel-Smith: A Unified Recipe for Evolutionary Kernel Optimization","primary_cat":"cs.CL","submitted_at":"2026-03-30T12:12:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Kernel-Smith combines evolutionary search with RL post-training to generate optimized GPU kernels, achieving SOTA speedups on KernelBench that beat Gemini-3.0-pro and Claude-4.6-opus on NVIDIA Triton and generalize to MetaX MACA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.23566","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AscendOptimizer: Episodic Agent for Ascend NPU Operator Optimization","primary_cat":"cs.LG","submitted_at":"2026-03-24T08:54:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AscendOptimizer combines kernel rewinding for reusable experience with evolutionary search on hardware feedback to optimize Ascend NPU operators, delivering 1.21x geometric-mean speedup and faster performance on 53.47% of 101 tested operators versus baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}