{"total":15,"items":[{"citing_arxiv_id":"2606.11541","ref_index":3,"ref_count":3,"confidence":0.9,"is_internal_anchor":false,"paper_title":"WHET: Welding Homomorphic Encryption to Accelerator Architectures","primary_cat":"cs.CR","submitted_at":"2026-06-10T01:04:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WHET applies fine-grained coefficient-to-slot transforms, plaintext compression, and modulus raising plus lightweight hardware tweaks to FHE accelerators, delivering 1.38-8.74x per-area gains and sub-millisecond CKKS bootstrapping.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11536","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VIPIR: A Versatile GPU Framework for Integrating Private Information Retrieval Protocols","primary_cat":"cs.CR","submitted_at":"2026-06-10T00:40:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VIPIR introduces two new PIR protocols, ExpPack compression, and GPU optimizations for NTT and GEMM that deliver orders-of-magnitude higher throughput than prior systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10440","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ASTRA-sim 3.0: Next-Level Distributed Machine Learning Simulations via High-Fidelity GPU and Infrastructure Modeling","primary_cat":"cs.DC","submitted_at":"2026-06-09T05:36:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ASTRA-sim 3.0 introduces cache-line load-store simulation, a detailed GPU execution model, and InfraGraph to support high-fidelity distributed machine learning infrastructure simulations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07159","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Distributed Persistence Domain for Persistent Memory Pooling","primary_cat":"cs.ET","submitted_at":"2026-06-05T11:19:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes Distributed Persistence Domain and Persistent CXL Switch to enable low-latency persistence operations at CXL switch level while maintaining crash consistency in disaggregated memory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19405","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A complete discussion on fully reconfigurable, digital, scalable, graph and sparsity-aware near-memory accelerator for graph neural networks","primary_cat":"cs.AR","submitted_at":"2026-05-19T05:59:47+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15638","ref_index":66,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ITHICA: Intra-Thread Instruction Checking Approach for Defect-Induced Silent Data Corruptions","primary_cat":"cs.AR","submitted_at":"2026-05-15T05:43:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ITHICA generates functional tests via intra-thread instruction duplication and comparison, detecting 39% more defective servers than baseline methods on over 3000 real CPUs while revealing new defect behaviors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19932","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Efficient Page Migration in Hybrid Memory Systems","primary_cat":"cs.AR","submitted_at":"2026-04-21T19:21:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Duon eliminates TLB shootdown and cache invalidation costs during page migration in flat-address hybrid memory systems by updating mappings in-place, delivering 3.87% IPC gains over prior methods.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Notable designs in this category include Tagless [21], HSCC [23], and Banshee [38]. However, these OS-managed DRAM Cache designs come with a significant drawback: the application thread experiences a stall whenever there is a tag-miss, which can impact overall system performance. To address this limitation, non-blocking software-managed DRAM Cache designs, such as NOMAD [ 17] has been developed, which aim to minimize the performance overhead caused by tag-misses. This innovative approach eliminates the need for stalling application threads, thereby improving the efficiency of software-managed caching systems. 3.2 Hybrid Cache-Flat Memory Design This hybrid approach seeks to balance the trade-offs between performance and memory availability."},{"citing_arxiv_id":"2604.14626","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ELMoE-3D: Leveraging Intrinsic Elasticity of MoE for Hybrid-Bonding-Enabled Self-Speculative Decoding in On-Premises Serving","primary_cat":"cs.LG","submitted_at":"2026-04-16T05:12:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ELMoE-3D achieves 6.6x average speedup and 4.4x energy efficiency gain for MoE serving on 3D hardware by scaling expert and bit elasticity for elastic self-speculative decoding.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Sparsity Exploitation. In2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA). 69-80. https://doi.org/10.1109/HPCA56546.2023. 10071031 [25] JEDEC Solid State Technology Association. 2019. JESD209-5: Low Power Dou- ble Data Rate 5 (LPDDR5). https://www.jedec.org/standards-documents/docs/ jesd209-5. Standard specification. [26] Dongyun Kam, Myeongji Yun, Sunwoo Yoo, Seungwoo Hong, Zhengya Zhang, and Youngjoo Lee. 2025. Panacea: Novel DNN Accelerator using Accuracy- Preserving Asymmetric Quantization and Energy-Saving Bit-Slice Sparsity . In 2025 IEEE International Symposium on High Performance Computer Architecture (HPCA). IEEE Computer Society, Los Alamitos, CA, USA, 701-715."},{"citing_arxiv_id":"2604.09956","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Logical Compilation for Multi-Qubit Iceberg Patches","primary_cat":"quant-ph","submitted_at":"2026-04-10T23:31:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"A new heuristic compiler for multi-qubit iceberg patches reduces circuit depth by 34 percent, cuts gate counts, and improves fidelity metrics on 71 benchmarks compared with naive mapping.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04745","ref_index":61,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Energy Cost of Execution-Idle in GPU Clusters","primary_cat":"cs.DC","submitted_at":"2026-04-06T15:10:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Execution-idle accounts for 19.7% of GPU execution time and 10.7% of energy in a large cluster, motivating power management that treats it as a distinct operating state.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Power over time under SM-only and SM+memory execution-idle-aware frequency control. These systems improve utilization by packing complemen- tary workloads onto fewer devices, for example by co-serving online and offline jobs [42], serving multiple LLMs concur- rently [60], or co-serving fine-tuning and inference [38]. On the other hand,autoscaling systemssuch as BlitzS- cale [61], ServerlessLLM [ 10], and INFaaS [ 45] have less clear energy implications. Their primary goal is elasticity and SLO preservation rather than direct energy minimization. By scaling in excess capacity and consolidating load, they may reduce execution-idle indirectly. However, aggressive scale-out to avoid latency degradation can also increase the number of active GPUs and thereby hurt energy efficiency."},{"citing_arxiv_id":"2604.03425","ref_index":3,"ref_count":6,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AEGIS: Scaling Long-Sequence Homomorphic Encrypted Transformer Inference via Hybrid Parallelism on Multi-GPU Systems","primary_cat":"cs.CR","submitted_at":"2026-04-03T19:47:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AEGIS reduces inter-GPU communication by up to 81.3% in self-attention and reaches 96.62% scaling efficiency with 3.86x speedup on four GPUs for 2048-token encrypted Transformer inference.","context_count":2,"top_context_role":"background","top_context_polarity":"background","context_text":"Not to mention, the resulting dy- namic adjustment of modulus-chain length can affect the scalability of RNS under limb-parallel execution, as it changes the number of remaining moduli available for partitioned evaluation. Hardware Acceleration.Prior work has explored accelerat- ing individual HE primitives on GPUs [8, 15, 20] and on dedicated ASIC and FPGA platforms [ 3, 36, 56, 57, 63], demonstrating the feasibility of deploying RLWE-based HE schemes on commodity and specialized hardware. At the system level, early multi-device designs [6, 61] exploit limb- and coefficient-parallelism but incur costly format conversions and scale poorly for rotation-intensive workloads. Application-aware FPGA/ASIC architectures such as"},{"citing_arxiv_id":"2604.02473","ref_index":66,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Analyzing Reverse Address Translation Overheads in Multi-GPU Scale-Up Pods","primary_cat":"cs.DC","submitted_at":"2026-04-02T19:08:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Simulation study shows cold TLB misses in reverse address translation dominate latency for small collectives in multi-GPU pods, causing up to 1.4x degradation, while larger ones see diminishing returns.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Dies 8x compute, 2x I/O [6] Compute Unit 2.2 GHz, 256 per GPU [6] HBM 150ns access latency [73] Reverse Translation Config [26, 66] L1 Link TLB 32-entry [61], fully-assoc, 50 ns hit lat [50], private/UALink Station, 256-entry MSHR L2 Link TLB 512 entry [61], 2-way set assoc, 100 ns hit lat [50], LRU replacement policy, shared across UALink stations per GPU Link MMU [66] 5-level page table with page walk cache (16,32,64,128 entries [61], 2-way, 50ns latency) , shared walker, 100 parallel PTWs Inter-GPU UALink Configuration UALink Switch Single level clos, 300ns latency [65, 98] UALink Station 16 per GPU, 4 lanes per station (combined as 1 x4 port or link), 200Gbps effective BW/lane [98] UALink Link 800 Gbps cumulative bandwidth, 300 ns"},{"citing_arxiv_id":"2602.15172","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Turbo-Charged Mapper: Fast and Optimal Mapping for Energy-efficient and Low-latency Accelerator Design","primary_cat":"cs.AR","submitted_at":"2026-02-16T20:21:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"TCM finds provably optimal DNN accelerator mappings by pruning the search space up to 32 orders of magnitude with a new dataplacement concept, delivering 1.2-6.5x better energy-delay-product in 17 seconds instead of hours.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.15166","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Fast and Fusiest: An Optimal Fusion-Aware Mapper for Accelerator Design","primary_cat":"cs.AR","submitted_at":"2026-02-16T20:08:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FFM finds optimal fused mappings for tensor accelerators over 10,000 times faster than prior mappers while cutting energy-delay product by up to 1.8x versus hand-tuned designs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.17265","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DISCA: A Digital In-memory Stochastic Computing Architecture Using A Compressed Bent-Pyramid Format","primary_cat":"cs.AR","submitted_at":"2025-11-21T14:13:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DISCA achieves 3.59 TOPS/W per bit energy efficiency for matrix multiplication at 500 MHz in 180 nm CMOS using a compressed Bent-Pyramid stochastic format.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}