{"total":13,"items":[{"citing_arxiv_id":"2605.10195","ref_index":65,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Breaking the Reward Barrier: Accelerating Tree-of-Thought Reasoning via Speculative Exploration","primary_cat":"cs.LG","submitted_at":"2026-05-11T08:45:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPEX accelerates Tree-of-Thought LLM reasoning 1.2-3x via speculative path selection, dynamic budget allocation across queries, and adaptive early termination, with up to 4.1x when combined with token speculative decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08524","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Unleashing Scalable Context Parallelism for Foundation Models Pre-Training via FCP","primary_cat":"cs.DC","submitted_at":"2026-05-08T22:16:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FCP shards sequences at block level with flexible P2P communication and bin-packing to achieve near-linear scaling up to 256 GPUs and 1.13x-2.21x higher attention MFU in foundation model pre-training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08467","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CUDAHercules: Benchmarking Hardware-Aware Expert-level CUDA Optimization for LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-08T20:35:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CUDAHercules benchmark demonstrates that leading LLMs generate functional CUDA code but fail to recover expert-level optimization strategies needed for peak performance on Ampere, Hopper, and Blackwell GPUs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07985","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Dooly: Configuration-Agnostic, Redundancy-Aware Profiling for LLM Inference Simulation","primary_cat":"cs.DC","submitted_at":"2026-05-08T16:44:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Dooly reduces LLM inference profiling costs by 56.4% via configuration-agnostic taint-based labeling and selective database reuse, delivering simulation accuracy within 5% MAPE for TTFT and 8% for TPOT across 12 models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06068","ref_index":80,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VibeServe: Can AI Agents Build Bespoke LLM Serving Systems?","primary_cat":"cs.AI","submitted_at":"2026-05-07T11:54:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"VibeServe demonstrates that AI agents can synthesize bespoke LLM serving systems end-to-end, remaining competitive with vLLM in standard settings while outperforming it in six non-standard scenarios involving unusual models, workloads, or hardware.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00616","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLM-Emu: Native Runtime Emulation of LLM Inference via Profile-Driven Sampling","primary_cat":"cs.DC","submitted_at":"2026-05-01T12:35:21+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLM-Emu is a serving-native emulator for vLLM that replaces GPU execution with profile-driven latency sampling and achieves under 5% error on TPOT, ITL, E2E latency, and throughput across multiple models, GPUs, and workloads.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27476","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EdgeFM: Efficient Edge Inference for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-30T06:18:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EdgeFM is an agent-driven framework that strips non-essential features from VLMs and packages reusable optimized kernels, achieving up to 1.49x speedup over TensorRT-Edge-LLM on NVIDIA Orin while enabling first end-to-end deployment on Horizon Journey hardware.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26039","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RaMP: Runtime-Aware Megakernel Polymorphism for Mixture-of-Experts","primary_cat":"cs.LG","submitted_at":"2026-04-28T18:20:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RaMP uses a hardware-derived performance region analysis and a four-parameter wave cost model to select optimal polymorphic kernel configurations for MoE inference from runtime expert histograms, delivering 1.22x kernel and 1.30x end-to-end speedups with 0.93% mean regret after brief profiling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19241","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"UniEP: Unified Expert-Parallel MoE MegaKernel for LLM Training","primary_cat":"cs.DC","submitted_at":"2026-04-21T08:49:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UniEP fuses MoE communication and computation into unified MegaKernels with deterministic token ordering, delivering 1.03x-1.38x speedups over prior work while preserving training accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18348","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AdaCluster: Adaptive Query-Key Clustering for Sparse Attention in Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-20T14:43:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AdaCluster delivers a training-free adaptive query-key clustering framework for sparse attention in video DiTs, yielding 1.67-4.31x inference speedup with negligible quality loss on CogVideoX-2B, HunyuanVideo, and Wan-2.1.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14825","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Nautilus: An Auto-Scheduling Tensor Compiler for Efficient Tiled GPU Kernels","primary_cat":"cs.PL","submitted_at":"2026-04-16T09:55:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Nautilus auto-compiles math-like tensor descriptions into optimized GPU kernels, delivering up to 42% higher throughput than prior compilers on transformer models across NVIDIA GPUs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14141","ref_index":95,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Geometric Context Transformer for Streaming 3D Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-04-15T17:58:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LingBot-Map is a streaming 3D reconstruction model built on a geometric context transformer that combines anchor context, pose-reference window, and trajectory memory to deliver accurate, drift-resistant results at 20 FPS over sequences longer than 10,000 frames.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.02922","ref_index":115,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RetroInfer: A Vector Storage Engine for Scalable Long-Context LLM Inference","primary_cat":"cs.LG","submitted_at":"2025-05-05T18:01:17+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}