{"total":55,"items":[{"citing_arxiv_id":"2605.13421","ref_index":178,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Combining pre-trained models via localized model averaging","primary_cat":"stat.ME","submitted_at":"2026-05-13T12:16:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Localized model averaging with covariate-dependent weights achieves asymptotic optimality and weight consistency for combining pre-trained models under a general loss framework.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12476","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Routers Learn the Geometry of Their Experts: Geometric Coupling in Sparse Mixture-of-Experts","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:55:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Routers in SMoE models form geometric alignments with their experts through shared gradient directions, enabling effective specialization that auxiliary load-balancing losses tend to disrupt.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11277","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sieve: Dynamic Expert-Aware PIM Acceleration for Evolving Mixture-of-Experts Models","primary_cat":"cs.AR","submitted_at":"2026-05-11T22:00:39+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Sieve dynamically schedules MoE experts across GPU and PIM hardware to handle bimodal token distributions, achieving 1.3x to 1.6x gains in throughput and interactivity over static prior PIM systems on three large models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11093","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enabling Performant and Flexible Model-Internal Observability for LLM Inference","primary_cat":"cs.LG","submitted_at":"2026-05-11T18:01:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DMI-Lib delivers 0.4-6.8% overhead for offline batch LLM inference and ~6% for moderate online serving while exposing rich internal signals across backends, cutting latency overhead 2-15x versus prior observability baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11005","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DisagMoE: Computation-Communication overlapped MoE Training via Disaggregated AF-Pipe Parallelism","primary_cat":"cs.LG","submitted_at":"2026-05-10T05:57:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DisagMoE achieves up to 1.8x faster MoE training by disaggregating attention and FFN layers into disjoint GPU groups with a multi-stage uni-directional pipeline and roofline-based bandwidth 
balancing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08842","ref_index":63,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"XPERT: Expert Knowledge Transfer for Effective Training of Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-09T09:53:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"XPERT extracts and reuses cross-domain expert knowledge from pre-trained MoE LLMs via inference analysis and tensor decomposition to improve performance and convergence in downstream language model training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08568","ref_index":62,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Different Prompts, Different Ranks: Prompt-aware Dynamic Rank Selection for SVD-based LLM Compression","primary_cat":"cs.LG","submitted_at":"2026-05-09T00:02:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PARSE trains a prompt-aware linear router on dense-model outputs to select dynamic SVD ranks, improving accuracy up to 10% at 0.6 compression ratio on LLaMA-7B while delivering 2.5x prefill and 2.4x decode speedups.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07731","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Benchmarking EngGPT2-16B-A3B against Comparable Italian and International Open-source LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-08T13:36:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"EngGPT2MoE-16B-A3B matches or beats other Italian models on most international benchmarks but trails top international models such as GPT-5 nano and Qwen3-8B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07494","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DIMoE-Adapters: Dynamic Expert Evolution for Continual Learning in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-08T09:32:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DIMoE-Adapters uses self-calibrated expert evolution and prototype-guided selection to dynamically grow and allocate experts, outperforming prior continual learning methods on vision-language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08292","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hierarchical Mixture-of-Experts with Two-Stage Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-08T09:21:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Hi-MoE uses two-level hierarchical routing objectives to enforce group-level balance while promoting within-group specialization, yielding better perplexity and expert utilization than prior MoE baselines in NLP and vision 
tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07363","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MISA: Mixture of Indexer Sparse Attention for Long-Context LLM Inference","primary_cat":"cs.LG","submitted_at":"2026-05-08T07:19:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MISA routes to a small subset of indexer heads via block statistics, matching full DSA performance on LongBench with 4-8x fewer heads and 3.82x speedup while recovering over 92% of selected tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07260","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When Are Experts Misrouted? Counterfactual Routing Analysis in Mixture-of-Experts Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-08T05:26:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Standard top-k routers in MoE language models often select suboptimal routes for difficult tokens, and updating only the final router layer raises pass@K on AIME and HMMT benchmarks across multiple models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06665","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniPool: A Globally Shared Expert Pool for Mixture-of-Experts","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A shared global expert pool in MoE improves validation loss over per-layer experts and allows sublinear expert-parameter growth with depth.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06374","ref_index":28,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ResiHP: Taming LLM Training Failures with Dynamic Hybrid Parallelism","primary_cat":"cs.DC","submitted_at":"2026-05-07T14:52:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ResiHP introduces a workload-aware failure detector and dynamic scheduler for hybrid-parallel LLM training that achieves 1.04-4.39x higher throughput than prior resilient systems under failures on a 256-GPU cluster.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05888","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MoE-Hub: Taming Software Complexity for Seamless MoE Overlap with Hardware-Accelerated Communication on Multi-GPU Systems","primary_cat":"cs.AR","submitted_at":"2026-05-07T08:58:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MoE-Hub enables seamless MoE communication overlap via hardware-accelerated destination-agnostic data transmission, delivering 1.40x-3.08x per-layer and 1.21x-1.98x end-to-end speedups over prior 
systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05607","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Accelerating MoE with Dynamic In-Switch Computing on Multi-GPUs","primary_cat":"cs.AR","submitted_at":"2026-05-07T02:53:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DySHARP accelerates MoE expert parallelism via dynamic multimem addressing and token-centric kernel fusion to cut redundant traffic and deliver up to 1.79x speedup over prior in-switch solutions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05164","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Geometry-Aware State Space Model: A New Paradigm for Whole-Slide Image Representation","primary_cat":"cs.CV","submitted_at":"2026-05-06T17:33:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BatMIL uses hybrid hyperbolic-Euclidean geometry, an S4 state-space backbone, and chunk-level mixture-of-experts to outperform prior multiple-instance learning methods on seven whole-slide image datasets across six cancers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05049","ref_index":13,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Piper: Efficient Large-Scale MoE Training via Resource Modeling and Pipelined Hybrid Parallelism","primary_cat":"cs.DC","submitted_at":"2026-05-06T15:47:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Piper introduces resource modeling and pipelined hybrid parallelism for MoE training, delivering 2-3.5X higher MFU than prior frameworks and 1.2-9X better all-to-all bandwidth.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04754","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AxMoE: Characterizing the Impact of Approximate Multipliers on Mixture-of-Experts DNN Architectures","primary_cat":"cs.LG","submitted_at":"2026-05-06T11:02:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Approximate multipliers degrade MoE and dense DNNs at different rates; ResNet-20 recovers fully after retraining while VGG models often fail at aggressive approximations except Cluster MoE, and Hard MoE can outperform dense on ViT under cost-matched aggressive approximation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04357","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Coral: Cost-Efficient Multi-LLM Serving over Heterogeneous Cloud GPUs","primary_cat":"cs.DC","submitted_at":"2026-05-05T23:25:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Coral cuts multi-LLM serving costs by up to 2.79x and raises goodput by up to 2.39x on heterogeneous GPUs through adaptive joint optimization and a lossless two-stage decomposition that solves 
quickly.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03426","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Replacing Parameters with Preferences: Federated Alignment of Heterogeneous Vision-Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-05T07:02:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MoR lets clients train local reward models on private preferences and uses a learned Mixture-of-Rewards with GRPO on the server to align a shared base VLM without exchanging parameters, architectures, or raw data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02960","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ZeRO-Prefill: Zero Redundancy Overheads in MoE Prefill Serving","primary_cat":"cs.LG","submitted_at":"2026-05-03T03:10:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ZeRO-Prefill achieves 1.35-1.59x higher throughput for MoE prefill serving by replacing per-layer activation AllToAll with overlapped asynchronous weight AllGather and prefix-aware routing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01420","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Artificial Jagged Intelligence as Uneven Optimization Energy Allocation Capability Concentration, Redistribution, and Optimization Governance","primary_cat":"cs.AI","submitted_at":"2026-05-02T12:37:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"AJI frames jagged AI capabilities as lower bounds on performance dispersion arising from concentrated optimization energy allocation under anisotropic objectives, with theorems on tradeoffs and redistribution interventions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01280","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Position: LLM Serving Needs Mathematical Optimization and Algorithmic Foundations, Not Just Heuristics","primary_cat":"cs.DC","submitted_at":"2026-05-02T06:31:11+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LLM serving requires mathematical optimization and algorithms with provable guarantees rather than generic heuristics that fail unpredictably on LLM workloads.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00342","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Making Every Verified Token Count: Adaptive Verification for MoE Speculative Decoding","primary_cat":"cs.CL","submitted_at":"2026-05-01T01:52:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EVICT adaptively truncates draft trees in MoE speculative decoding by combining drafter signals with profiled costs to retain only cost-effective prefixes, delivering up to 2.35x speedup over autoregressive 
decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26881","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FaaSMoE: A Serverless Framework for Multi-Tenant Mixture-of-Experts Serving","primary_cat":"cs.DC","submitted_at":"2026-04-29T16:47:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FaaSMoE treats MoE experts as on-demand FaaS functions with configurable granularity, using under one-third the resources of a full-model baseline under multi-tenant workloads.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23996","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SMoES: Soft Modality-Guided Expert Specialization in MoE-VLMs","primary_cat":"cs.CV","submitted_at":"2026-04-27T03:23:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SMoES improves MoE-VLM performance and efficiency via soft modality-guided expert routing and inter-bin mutual information regularization, yielding 0.9-4.2% task gains and 56% communication reduction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23036","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Preserving Long-Tailed Expert Information in Mixture-of-Experts Tuning","primary_cat":"cs.LG","submitted_at":"2026-04-24T21:48:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new SFT framework for MoE models combines bias-driven sparsification with gated condenser experts to retain long-tailed expert information, outperforming DenseMixer and ESFT by over 2.5% on math reasoning and commonsense QA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21724","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond N-gram: Data-Aware X-GRAM Extraction for Efficient Embedding Parameter Scaling","primary_cat":"cs.CL","submitted_at":"2026-04-23T14:27:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"X-GRAM applies data-aware dynamic token injection with hybrid hashing and local feature extraction to achieve up to 4.4 accuracy point gains over vanilla backbones and 3.2 over retrieval baselines at 0.73B-1.15B scales using 50% smaller tables.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21264","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enhancing Online Recruitment with Category-Aware MoE and LLM-based Data Augmentation","primary_cat":"cs.AI","submitted_at":"2026-04-23T04:17:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LLM chain-of-thought rewriting of job postings plus category-aware MoE improves person-job fit AUC by 2.4%, GAUC by 7.5%, and live click-through conversion by 
19.4%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19654","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FEPLB: Exploiting Copy Engines for Nearly Free MoE Load Balancing in Distributed Training","primary_cat":"cs.DC","submitted_at":"2026-04-21T16:43:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FEPLB reduces token and GEMM stragglers in MoE training by 50-70% using nearly free Copy Engine communication on Hopper architecture.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19344","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Quadruped Parkour Learning: Sparsely Gated Mixture of Experts with Visual Input","primary_cat":"cs.RO","submitted_at":"2026-04-21T11:27:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Sparsely gated MoE policies double the success rate of a real Unitree Go2 quadruped on large-obstacle parkour versus matched-active-parameter MLP baselines while cutting inference time compared with a scaled-up MLP.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19147","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Nexusformer: Nonlinear Attention Expansion for Stable and Inheritable Transformer Scaling","primary_cat":"cs.LG","submitted_at":"2026-04-21T06:54:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Nexusformer uses a three-stage nonlinear mapping in attention to enable stable, inheritable scaling of transformers, matching baseline perplexity with up to 41.5% less compute when growing from 240M to 440M parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19835","ref_index":28,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Expert Upcycling: Shifting the Compute-Efficient Frontier of Mixture-of-Experts","primary_cat":"cs.LG","submitted_at":"2026-04-21T05:53:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Expert upcycling duplicates experts in an existing MoE checkpoint and continues pre-training to match fixed-size baseline performance with 32% less compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18788","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Efficient Mixture-of-Experts LLM Inference with Apple Silicon NPUs","primary_cat":"cs.LG","submitted_at":"2026-04-20T19:52:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NPUMoE accelerates MoE LLM inference on Apple Silicon NPUs via offline-calibrated static expert tiers, grouped execution, and load-aware graph residency, delivering 1.32x-5.55x lower latency and 1.81x-7.37x better energy 
efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18473","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Train Separately, Merge Together: Modular Post-Training with Mixture-of-Experts","primary_cat":"cs.LG","submitted_at":"2026-04-20T16:24:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BAR trains independent domain experts via separate mid-training, SFT, and RL pipelines then composes them with a MoE router to match monolithic retraining performance at lower cost and without catastrophic forgetting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18255","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WiFo-MiSAC: A Wireless Foundation Model for Multimodal Sensing and Communication Integration via Synesthesia of Machines (SoM)","primary_cat":"eess.SP","submitted_at":"2026-04-20T13:30:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WiFo-MiSAC is a task-agnostic foundation model that unifies multimodal wireless signals via tokenization and self-supervised learning with SS-DMoE to achieve strong few-shot performance on beam prediction and channel estimation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17286","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Depth Adaptive Efficient Visual Autoregressive Modeling","primary_cat":"cs.CV","submitted_at":"2026-04-19T06:59:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DepthVAR adaptively allocates per-token computational depth in VAR models using a cyclic rotated scheduler and dynamic layer masking to achieve 2.3-3.1x inference speedup with minimal quality loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15645","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PINNACLE: An Open-Source Computational Framework for Classical and Quantum PINNs","primary_cat":"cs.LG","submitted_at":"2026-04-17T02:42:08+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PINNACLE is an open-source framework for classical and quantum PINNs that supplies modular training methods and benchmarks showing high sensitivity to architecture choices plus parameter-efficiency gains in some hybrid quantum regimes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04079","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Efficient Handwriting-Based Alzheimer,s Disease Diagnosis Using a Low-Rank Mixture of Experts Deep Learning Framework","primary_cat":"cs.LG","submitted_at":"2026-04-14T16:32:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A low-rank mixture of experts model trained on handwriting data delivers strong Alzheimer's diagnosis performance with substantially reduced parameter activation during 
inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08936","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"M-IDoL: Information Decomposition for Modality-Specific and Diverse Representation Learning in Medical Foundation Model","primary_cat":"cs.CV","submitted_at":"2026-04-10T04:06:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"M-IDoL learns modality-specific and diverse representations by maximizing inter-modality entropy and minimizing intra-modality uncertainty through information decomposition in MoE subspaces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07753","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Symbiotic-MoE: Unlocking the Synergy between Generation and Understanding","primary_cat":"cs.CV","submitted_at":"2026-04-09T03:19:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Symbiotic-MoE introduces modality-aware expert disentanglement and progressive training in a multimodal MoE to achieve synergistic generation and understanding without task interference or extra parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06715","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HQF-Net: A Hybrid Quantum-Classical Multi-Scale Fusion Network for Remote Sensing Image Segmentation","primary_cat":"cs.CV","submitted_at":"2026-04-08T06:21:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HQF-Net reports mIoU gains on three remote-sensing benchmarks by adding quantum circuits to skip connections and a mixture-of-experts bottleneck inside a classical U-Net fused with a DINOv3 backbone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05960","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Mixture of Experts Foundation Model for Scanning Electron Microscopy Image Analysis","primary_cat":"cs.LG","submitted_at":"2026-04-07T14:52:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A mixture-of-experts transformer foundation model pretrained on diverse SEM images enables generalization across materials and outperforms SOTA on unsupervised defocus-to-focus restoration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04750","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DeepStack: Scalable and Accurate Design Space Exploration for Distributed 3D-Stacked AI Accelerators","primary_cat":"cs.AR","submitted_at":"2026-04-06T15:16:35+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeepStack introduces a fast performance model and hierarchical search method for co-optimizing 3D DRAM stacking, interconnects, and distributed scheduling in AI accelerators, delivering up to 9.5x throughput gains over 
baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03044","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"JoyAI-LLM Flash: Advancing Mid-Scale LLMs with Token Efficiency","primary_cat":"cs.CL","submitted_at":"2026-04-03T13:52:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"JoyAI-LLM Flash delivers a 48B MoE LLM with 2.7B active parameters per token via FiberPO RL and dense multi-token prediction, released with checkpoints on Hugging Face.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.02276","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Kimi K2.5: Visual Agentic Intelligence","primary_cat":"cs.CL","submitted_at":"2026-02-02T16:17:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Kimi K2.5 combines joint text-vision training with an Agent Swarm parallel orchestration framework to reach claimed state-of-the-art results on coding, vision, reasoning, and agent tasks while cutting latency up to 4.5 times.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.10925","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"gpt-oss-120b & gpt-oss-20b Model Card","primary_cat":"cs.CL","submitted_at":"2025-08-08T19:24:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OpenAI releases two open-weight reasoning models, gpt-oss-120b and gpt-oss-20b, trained via distillation and RL with claimed strong results on math, coding, and safety benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.20534","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Kimi K2: Open Agentic Intelligence","primary_cat":"cs.LG","submitted_at":"2025-07-28T05:35:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Kimi K2 is a 1-trillion-parameter MoE model that leads open-source non-thinking models on agentic benchmarks including 65.8 on SWE-Bench Verified and 66.1 on Tau2-Bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.24164","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","primary_cat":"cs.LG","submitted_at":"2024-10-31T17:22:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"π₀ is a vision-language-action flow model trained on diverse multi-platform robot data that supports zero-shot task performance, language instruction following, and efficient fine-tuning for dexterous tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}