{"total":20,"items":[{"citing_arxiv_id":"2606.28565","ref_index":49,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"KernelSight-LM: A Kernel-Level LLM Inference Simulator","primary_cat":"cs.PF","submitted_at":"2026-06-26T19:43:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"KernelSight-LM simulates LLM inference at kernel granularity with cross-generation (12.1% per-kernel error) and target-measured (3.8% error) tiers, yielding end-to-end median errors of 15.4%/12.8%/3.0% and 14.3%/6.2%/2.7% for TTFT/TPOT/throughput across six model families.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10440","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ASTRA-sim 3.0: Next-Level Distributed Machine Learning Simulations via High-Fidelity GPU and Infrastructure Modeling","primary_cat":"cs.DC","submitted_at":"2026-06-09T05:36:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ASTRA-sim 3.0 introduces cache-line load-store simulation, a detailed GPU execution model, and InfraGraph to support high-fidelity distributed machine learning infrastructure simulations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09200","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Resource-aware Computation-Communication Overlap for multi-GPU ML Workloads","primary_cat":"cs.DC","submitted_at":"2026-06-08T08:33:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A method using shared-memory occupancy shaping and elevated communication priority achieves up to 25.5% faster multi-GPU ML execution on NVIDIA and AMD GPUs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07019","ref_index":61,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PCCL: Process Group-Aware Scalable and Generic Collective Algorithm Synthesizer","primary_cat":"cs.DC","submitted_at":"2026-06-05T08:08:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PCCL synthesizes near-optimal topology-aware collective algorithms for arbitrary patterns while being process group-aware and scalable to subsets of devices.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26930","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Revisiting Bruck: Phase-Efficient All-to-All Communication in Reconfigurable Networks","primary_cat":"cs.DC","submitted_at":"2026-05-26T12:24:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ReTri achieves all-to-all in ⌈log₃ n⌉ phases for ORNs by co-designing bidirectional exchanges and reconfiguration strategy, with simulations showing up to 10× improvement over static and 2.1× over prior reconfigurable Bruck.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22936","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ACALSim: A Scalable Parallel Simulation Framework for High-Performance System Design Space Exploration","primary_cat":"cs.AR","submitted_at":"2026-05-21T18:10:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ACALSim is a new simulation framework with customizable threading, event-driven execution, and shared-memory model that reports over 14x speedup versus SST and enables simulation of large LLaMA models that SST cannot complete.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20047","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Taking Cryptography Out of the Data Path via Near-Memory Processing in DRAM","primary_cat":"cs.CR","submitted_at":"2026-05-19T16:06:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Real-world PIM on UPMEM accelerates cryptographic algorithms when computation is distributed across multiple DRAM ranks, outperforming CPUs at full scale.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19169","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Modeling the Impact of Fiber Latency on Compute-Communication Overlap in Geo-Distributed Multi-Datacenter AI Training","primary_cat":"cs.PF","submitted_at":"2026-05-18T22:46:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Discrete-event simulation finds optimal 10-100 km separation between AI clusters where hollow-core fiber provides 25% higher compute-communication overlap in geo-distributed data-parallel training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17164","ref_index":17,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Charon: A Unified and Fine-Grained Simulator for Large-Scale LLM Training and Inference","primary_cat":"cs.DC","submitted_at":"2026-05-16T21:28:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Charon is a unified modular simulator that predicts LLM training and inference performance with under 5.35% error and identifies throughput improvements over baselines in a real deployment case.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15617","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Few GPUs, A Whole Lotta Scale: Faithful LLM Training Emulation with PrismLLM","primary_cat":"cs.DC","submitted_at":"2026-05-15T04:58:20+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PrismLLM constructs a sliced execution graph and uses hybrid emulation to faithfully reproduce performance and memory behavior of up to 8192-GPU LLM training runs on fewer than 1% of the original GPUs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14249","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EnergyLens: Predictive Energy-Aware Exploration for Multi-GPU LLM Inference Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-14T01:37:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EnergyLens predicts multi-GPU LLM inference energy consumption with 9-13% MAPE and identifies configurations with up to 52x energy efficiency differences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12766","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Bridge: Optimizing Collective Communication Schedules in Reconfigurable Networks with Reusable Subrings","primary_cat":"cs.NI","submitted_at":"2026-05-12T21:30:46+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Bridge reduces All-to-All completion time by typically 3x to 10x and improves AllReduce by up to 6.6x over Ring by reusing optical subrings across multiple steps in reconfigurable networks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17550","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Flint: Compiler Enabled Cluster-Free Design Space Exploration for Distributed ML","primary_cat":"cs.DC","submitted_at":"2026-04-19T17:41:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Flint generates compiler-derived workload graphs that support cluster-free design space exploration for distributed machine learning systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16043","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Evaluating SYCL as a Unified Programming Model for Heterogeneous Systems","primary_cat":"cs.DC","submitted_at":"2026-04-17T13:16:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Current SYCL implementations show inconsistencies in memory management (USM vs buffers) and kernel models (NDRange vs hierarchical) that reduce cross-platform reliability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11109","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Record-Remix-Replay: Hierarchical GPU Kernel Optimization using Evolutionary Search","primary_cat":"cs.DC","submitted_at":"2026-04-13T07:25:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"R^3 optimizes full scientific applications on GPUs better than tuning kernel parameters or compiler flags alone while running nearly an order of magnitude faster than modern evolutionary search methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04750","ref_index":108,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DeepStack: Scalable and Accurate Design Space Exploration for Distributed 3D-Stacked AI Accelerators","primary_cat":"cs.AR","submitted_at":"2026-04-06T15:16:35+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeepStack introduces a fast performance model and hierarchical search method for co-optimizing 3D DRAM stacking, interconnects, and distributed scheduling in AI accelerators, delivering up to 9.5x throughput gains over baselines.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"PDK [17] and extracted post-synthesis area figures as reference anchors. During exploration, compute-unit areas are linearly scaled by throughput [ 51, 113]. On-chip SRAM areas are decomposed into capacity-proportional and bandwidth-proportional parts, cal- ibrated from Memory Compiler-generated macros [74]. Memory controller area scales with peak DRAM bandwidth based on [108], with additional DRAM peripheral overhead derived from published logic-die breakdowns [51, 74]. On-chip interconnect area is scaled from the baseline with sub-linear bandwidth elasticity. Total die area includes a 15% overhead for control and routing, observed from our baseline synthesis. Designs exceeding the area budget are pruned before performance evaluation."},{"citing_arxiv_id":"2604.02473","ref_index":110,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Analyzing Reverse Address Translation Overheads in Multi-GPU Scale-Up Pods","primary_cat":"cs.DC","submitted_at":"2026-04-02T19:08:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Simulation study shows cold TLB misses in reverse address translation dominate latency for small collectives in multi-GPU pods, causing up to 1.4x degradation, while larger ones see diminishing returns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.15172","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Turbo-Charged Mapper: Fast and Optimal Mapping for Energy-efficient and Low-latency Accelerator Design","primary_cat":"cs.AR","submitted_at":"2026-02-16T20:21:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"TCM finds provably optimal DNN accelerator mappings by pruning the search space up to 32 orders of magnitude with a new dataplacement concept, delivering 1.2-6.5x better energy-delay-product in 17 seconds instead of hours.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.08193","ref_index":86,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Lifetime-Aware Design for Item-Level Intelligence at the Extreme Edge","primary_cat":"cs.AR","submitted_at":"2025-09-09T23:53:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FlexiFlow optimizes carbon footprint for item-level intelligence on flexible electronics by modeling lifetime variation, delivering 1.62X microarchitectural and 14.5X algorithmic reductions plus a 30.9 kHz tape-out.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"carbon footprint per wafer is derived from a cradle-to-gate LCA following ISO 14040 and ISO 14044 guidelines. The im- pact assessment uses the ReCiPe midpoint (H) method [41], which incorporates the methodology of the Intergovernmen- tal Panel on Climate Change to measure carbon footprints. Life cycle modeling is conducted using SimaPro 9.2 soft- ware [86] integrated with the Ecoinvent v3.8 database [115]. For the Pragmatic Semiconductor fabrication facilities, car- bon accounting is done on the per-wafer scale, so embodied carbon is calculated as follows: CEmbodied (kg CO2e)= Die Area Active Wafer Area×Wafer Yield ×kg CO 2e/wafer This value reflects one-time emissions incurred during man- ufacturing, independent of application lifetime."},{"citing_arxiv_id":"2404.11591","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The EDGE Language: Extended General Einsums for Graph Algorithms","primary_cat":"cs.DS","submitted_at":"2024-04-17T17:42:48+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}