{"total":14,"items":[{"citing_arxiv_id":"2606.31991","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Amplifying Membership Signal Through Chained Regeneration","primary_cat":"cs.LG","submitted_at":"2026-06-30T17:29:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MADreMIA amplifies membership inference signals by showing that memorized samples maintain higher coherence and slower degradation in chained regeneration trajectories than non-members.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23626","ref_index":87,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DiT-Reward: Generative Representations for Text-to-Image Reward Modeling","primary_cat":"cs.LG","submitted_at":"2026-06-22T17:19:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DiT-Reward converts pretrained DiT models into reward predictors that outperform HPSv3 on four benchmarks while providing 1.65x inference speedup.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21868","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"WiSP: A Working-Set View of Mixture-of-Experts Serving on Extremely Low-Resource Hardware","primary_cat":"cs.LG","submitted_at":"2026-06-20T04:10:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WiSP achieves up to 1.95x decode throughput on low-resource MoE serving by dynamically paging reused experts and using MV-WSA to allocate VRAM between experts and KV cache, with the offline policy performing well on both prefill and decode.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21033","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MoECodec: Image Compression for joint human and machine perception via Mixture-of-Experts","primary_cat":"eess.IV","submitted_at":"2026-06-19T01:56:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MoECodec replaces FFN layers with token-wise MoE plus stable routing and GShMLP experts to support multiple downstream tasks in a single image compression model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02090","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FocusDiT: Masking Queries in Diffusion Transformers for Fine-grained Image Generation","primary_cat":"cs.CV","submitted_at":"2026-06-01T11:18:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"FocusDiT masks non-critical query tokens before they enter the FFN in DiT models, directing capacity toward complex visual details and reporting improved text-to-image results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30045","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GenEraser: Generalizable Video Object Removal via Balanced Text-Mask Guidance and Decoupled Locator-Preserver","primary_cat":"cs.CV","submitted_at":"2026-05-28T14:58:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GenEraser proposes MC-MoE with bipartite text guidance, LD-CFG fusion, and a decoupled locator-preserver architecture for generalizable video object and effect removal, claiming 2.16 dB and 1.44 dB gains on ROSE and VOR-Eval benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19378","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Sparse Mixture-of-Experts Routing in Visual Diffusion Transformers:Diagnosis, Boundary Calibration and Evolutionary Roadmap from Routing Collapse to Selective Deadlock","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:57:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Diagnoses five failure modes in Token-Choice MoE routing for visual diffusion transformers and proposes the Functional Redundancy Hypothesis to explain selective deadlock.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08712","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Articulated Kinematics to Routed Visual Control for Action-Conditioned Surgical Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-09T05:48:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A kinematic-to-visual lifting paradigm combined with hierarchically routed control generates action-conditioned surgical videos with better faithfulness, fidelity, and efficiency.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"This objective encourages the router to allocate capacity according to physically meaningful action cues, rather than forcing all modality experts to be used uniformly. Spatiotemporal routing consistency.Existing MoE diffusion models process tokens within a batch- level global pool, ignoring the temporal correlation inherent in sequential video frames [17, 109]. We introduce a Spatiotemporal Routing Consistency (SRC) loss to enforce temporal coherence in the continuous routing distributions, localized to moving instruments: LSRC = 1 T−1 T−1X t=1 X h,w M(t) tool(h, w)∥R t(h, w)−R t−1(h, w)∥2 2 ,(4) where R∈R H×W×N denotes the continuous routing probabilities at frame t, and M(t) tool ∈ {0,1} H×W is the binary spatial mask indicating surgical tool presence (derived from Ksem"},{"citing_arxiv_id":"2605.02641","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mamoda2.5: Enhancing Unified Multimodal Model with DiT-MoE","primary_cat":"cs.CV","submitted_at":"2026-05-04T14:26:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Mamoda2.5 is a 25B-parameter DiT-MoE unified AR-Diffusion model that reaches top video generation and editing benchmarks with 4-step inference up to 95.9x faster than baselines.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"tiotemporal modeling, Mixture-of-Experts (MoE) offers a scalable solution [8]. Large language models have successfully employed routing mechanisms for sparse activation, scaling capacity without proportional compute cost [9, 10]. DeepSeekMoE's fine-grained expert segmentation further enhances specialization and scalability [11]. MoE has also demonstrated significant potential in image generation; for instance, DiT-MoE [12] suc- cessfully scaled the Diffusion Transformer to tens of billions of parameters, while Race-DiT [13] and DiffMoE [14] optimized routing strategies to further improve generation quality and training efficiency. In the video domain, WanVideo 2.2 [4] has explored a coarse-grained two-expert MoE that routes by denoising timestep. Nevertheless,fine-grainedMoE designs, with many specialized experts and learned token-level routing, have"},{"citing_arxiv_id":"2604.19636","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CoInteract: Physically-Consistent Human-Object Interaction Video Synthesis via Spatially-Structured Co-Generation","primary_cat":"cs.CV","submitted_at":"2026-04-21T16:25:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoInteract adds a human-aware mixture-of-experts and spatially-structured co-generation to a diffusion transformer to synthesize videos with stable structures and physically plausible human-object contacts.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"structure supervision to the RGB generator even when the HOI branch is re- moved. 3.2 Human-Aware Mixture-of-Experts While structured co-generation injects interaction priors into the backbone, hands and faces may still exhibit artifacts due to their high-frequency detail and articulation complexity. We therefore incorporate aHuman-Aware Mixture- of-Experts (MoE)module [8,37] that routes tokens to region-specialized ex- perts via a spatially supervised routerR(Fig. 2(c)). We include aSharedexpert that reuses the original DiT FFN as a shortcut path, and three lightweight ex- perts (Head,Hand,Base) implemented as small FFNs, introducing a modest parameter overhead. This enables dedicated capacity for anatomically sensitive"},{"citing_arxiv_id":"2604.07210","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VersaVogue: Visual Expert Orchestration and Preference Alignment for Unified Fashion Synthesis","primary_cat":"cs.CV","submitted_at":"2026-04-08T15:31:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VersaVogue unifies garment generation and virtual dressing via trait-routing attention with mixture-of-experts and an automated multi-perspective preference optimization pipeline that uses DPO without human labels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.10079","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Large Spikes in Stochastic Gradient Descent: A Large-Deviations View","primary_cat":"cs.LG","submitted_at":"2026-03-10T09:27:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Large loss spikes in SGD are polynomially likely and serve as the dominant mechanism for escaping sharp minima toward flatter solutions in the NTK regime.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.21788","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"InstructMoLE: Instruction-Guided Mixture of Low-rank Experts for Multi-Conditional Image Generation","primary_cat":"cs.CV","submitted_at":"2025-12-25T21:37:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"InstructMoLE replaces per-token routing with instruction-guided global routing for mixture-of-low-rank-experts in diffusion transformers and adds an output-space orthogonality loss to improve multi-conditional image generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.06608","ref_index":130,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TripoSG: High-Fidelity 3D Shape Synthesis using Large-Scale Rectified Flow Models","primary_cat":"cs.CV","submitted_at":"2025-02-10T16:07:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TripoSG generates high-fidelity 3D meshes from input images via a large-scale rectified flow transformer and hybrid-trained 3D VAE on a custom 2-million-sample dataset, claiming state-of-the-art fidelity and generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}