{"total":14,"items":[{"citing_arxiv_id":"2606.18042","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Latency Prediction for LLM Inference on NPU Systems","primary_cat":"cs.DC","submitted_at":"2026-06-16T15:18:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LENS predicts NPU LLM inference latency with 2.15% mean error by profiling each bucket with two E2E measurements and composing results to capture bucketing non-linearity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11541","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"WHET: Welding Homomorphic Encryption to Accelerator Architectures","primary_cat":"cs.CR","submitted_at":"2026-06-10T01:04:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WHET applies fine-grained coefficient-to-slot transforms, plaintext compression, and modulus raising plus lightweight hardware tweaks to FHE accelerators, delivering 1.38-8.74x per-area gains and sub-millisecond CKKS bootstrapping.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22936","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ACALSim: A Scalable Parallel Simulation Framework for High-Performance System Design Space Exploration","primary_cat":"cs.AR","submitted_at":"2026-05-21T18:10:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ACALSim is a new simulation framework with customizable threading, event-driven execution, and shared-memory model that reports over 14x speedup versus SST and enables simulation of large LLaMA models that SST cannot complete.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21952","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"NasZip: Software and Hardware Co-Design to Accelerate Approximate Nearest Neighbor Search with DIMM-Based Near-Data Processing","primary_cat":"cs.AR","submitted_at":"2026-05-21T03:36:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NasZip delivers up to 8.4x speedup over CPU baselines and 1.69x over prior NDP accelerators for ANNS by combining near-data processing with statistics-based PCA early exiting, dynamic-float encoding, and data-aware neighbor mapping.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19005","ref_index":4,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rewrite System Showdown: Stochastic Search vs. EqSat","primary_cat":"cs.PL","submitted_at":"2026-05-18T18:27:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical comparison of equality saturation versus stochastic search on five benchmarks to evaluate if e-graphs are superior for rewrite-based optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16637","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HexAGenT: Efficient Agentic LLM Serving via Workflow- and Heterogeneity-Aware Scheduling","primary_cat":"cs.DC","submitted_at":"2026-05-15T21:09:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HexAGenT reduces the SLO scale required for timely agentic LLM workflow completion by an average of 20.1% at 95% attainment and 33.0% at 99% attainment on heterogeneous A100/H100/H200 clusters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20105","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EnergAIzer: Fast and Accurate GPU Power Estimation Framework for AI Workloads","primary_cat":"cs.AR","submitted_at":"2026-04-22T02:02:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EnergAIzer predicts module-level GPU utilization from structured kernel patterns and feeds it into a power model to estimate dynamic power with 8% error on Ampere GPUs and 7% on H100 forecasts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19503","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReaLB: Real-Time Load Balancing for Multimodal MoE Inference","primary_cat":"cs.DC","submitted_at":"2026-04-21T14:22:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ReaLB balances multimodal MoE inference loads by switching vision-heavy experts to lower FP4 precision per device rank, hiding the change in the dispatch phase to deliver 1.10-1.32x speedup with <1% accuracy degradation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18529","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HybridGen: Efficient LLM Generative Inference via CPU-GPU Hybrid Computing","primary_cat":"cs.PF","submitted_at":"2026-04-20T17:25:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HybridGen achieves 1.41x-3.2x average speedups over six prior KV cache methods for LLM inference by using attention logit parallelism, a feedback-driven scheduler, and semantic-aware KV cache mapping.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14626","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ELMoE-3D: Leveraging Intrinsic Elasticity of MoE for Hybrid-Bonding-Enabled Self-Speculative Decoding in On-Premises Serving","primary_cat":"cs.LG","submitted_at":"2026-04-16T05:12:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ELMoE-3D achieves 6.6x average speedup and 4.4x energy efficiency gain for MoE serving on 3D hardware by scaling expert and bit elasticity for elastic self-speculative decoding.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"DGX Spark and Apple Mac Studio [ 1, 46] can host such models locally, but serving MoE poses a fundamental memory bottleneck. Since activated experts are independently selected per token, batch- ing or prefill operations activate nearly all experts collectively. Al- though computation remains sparse per token, memory activation becomes effectively dense across the batch [2, 70]. To address this bandwidth bottleneck, memory-centric archi- tectures have been explored. PIM [19, 70] and NMP [35, 48] alle- viate raw bandwidth limitations, but MoE's low arithmetic inten- sity at expert layers limits their compute utilization. 3D-IC with hybrid bonding [67, 69] places high-bandwidth memory directly above the compute die, but its limited capacity leads to signifi-"},{"citing_arxiv_id":"2604.03425","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AEGIS: Scaling Long-Sequence Homomorphic Encrypted Transformer Inference via Hybrid Parallelism on Multi-GPU Systems","primary_cat":"cs.CR","submitted_at":"2026-04-03T19:47:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AEGIS reduces inter-GPU communication by up to 81.3% in self-attention and reaches 96.62% scaling efficiency with 3.86x speedup on four GPUs for 2048-token encrypted Transformer inference.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"alone accounts for59%of runtime, with key switching contributing an additional34%. Down to the computation graph, in BSGS matrix multiplication [22, 34], each modulus-switching step triggers an AllGather to synchronize RNS limbs across devices (Figure 5(a)). The total rotation cost accumulates as 𝑡rot =𝑛 rot · (𝜏ks +𝜏 comm), and long-sequence inference requires tens of thousands of rota- tions [32, 34, 48, 54]. Consequently, communication dominates la- tency; Figure 5(b) shows that59%of a single matrix multiplication is spent waiting on communication. Beyond communication costs, limb parallelism is fundamentally limited by modulus depth. A level-𝑙 ciphertext can be partitioned across at most 𝑙 devices, since each RNS limb corresponds to one"},{"citing_arxiv_id":"2602.22457","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CCCL: Node-Spanning GPU Collectives with CXL Memory Pooling","primary_cat":"cs.DC","submitted_at":"2026-02-25T22:38:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CCCL delivers 1.34-1.94x faster cross-node GPU collectives via CXL memory pooling than 200 Gbps InfiniBand RDMA, with 1.11x LLM training speedup and 2.75x hardware cost reduction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.22267","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Aquas: Enhancing Domain Specialization through Holistic Hardware-Software Co-Optimization based on MLIR","primary_cat":"cs.AR","submitted_at":"2025-11-27T09:43:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Aquas delivers a holistic hardware-software co-optimization framework on MLIR that models memory interfaces with cache effects and uses an e-graph retargetable compiler, achieving up to 15.61x speedup with 14.5% area overhead across four domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.20782","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Optimism in Equality Saturation","primary_cat":"cs.PL","submitted_at":"2025-11-25T19:19:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new abstract interpretation algorithm enables sound optimistic analysis of e-graphs during equality saturation, unifying it with non-destructive rewriting and improving precision on cyclic SSA programs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}