{"total":22,"items":[{"citing_arxiv_id":"2606.06060","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReCache: Learning Budget-Aware Caching Schedules for Diffusion Models via REINFORCE","primary_cat":"cs.CV","submitted_at":"2026-06-04T11:59:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReCache learns recomputation schedules via policy gradients to maximize quality under a target compute budget for any caching mechanism in diffusion models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01412","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GPTQ-intrinsic LoRA: A Near-optimal Algorithm for Low-precision Quantization with Low-rank Adaptation","primary_cat":"cs.LG","submitted_at":"2026-05-31T19:17:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"GPTQ-intrinsic LoRA augments GPTQ with intrinsic low-rank compensation via Hessian modification to achieve layer-wise reconstruction bounds that match information-theoretic lower bounds under structural assumptions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00573","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LASER: Loss-Aware Singular-value Decomposition and Rank Allocation for Efficient Low-Precision Vision-Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-30T06:53:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LASER introduces curvature-weighted SVD from second-order loss approximation and loss-aware rank allocation to compress VLMs, reporting over 2.3x decoding speedup under low-precision settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00535","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DREAM-S: Speculative Decoding with Searchable Drafting and Target-Aware Refinement for Multimodal Generation","primary_cat":"cs.LG","submitted_at":"2026-05-30T05:05:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DREAM-S combines neural architecture search, target-aware supernet training, and attention-entropy-guided distillation to accelerate speculative decoding in VLMs, reporting up to 3.85x speedup over standard methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28803","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"{\\Omega}-QVLA: Robust Quantization for Vision-Language-Action Models via Composite Rotation and Per-step Scaling","primary_cat":"cs.CV","submitted_at":"2026-05-27T17:55:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Omega-QVLA is a post-training quantization framework achieving uniform W4A4 for VLA models' LLM backbone and DiT action head via composite SVD-Hadamard rotation and per-step scaling, matching FP16 success rates on LIBERO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26632","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RT-Lynx: Putting the GEMM Sparsity In a Right Way for Diffusion Models","primary_cat":"cs.LG","submitted_at":"2026-05-26T07:09:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RT-Lynx shifts DiT sparsity from weights to activations, reports up to 1.55x linear-layer speedup while preserving generation quality across multiple diffusion models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19929","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Breaking Modality Heterogeneity in Low-Bit Quantization for Large Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-19T14:49:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SplitQ improves low-bit PTQ for VLMs by isolating modality-specific outlier channels via MOCD and applying dual-branch adaptive calibration via ACC, outperforming prior methods on six datasets across W4A8 to W3A2 settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18739","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LongLive-2.0: An NVFP4 Parallel Infrastructure for Long Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T17:57:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LongLive-2.0 delivers an NVFP4 parallel infrastructure that enables direct training of long multi-shot autoregressive diffusion video models and achieves up to 2.15x training and 1.84x inference speedups on Blackwell and other GPUs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16901","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CAR-SAM: Cross-Attention Reconstruction for Post-Training Quantization of the Segment Anything Model","primary_cat":"cs.CV","submitted_at":"2026-05-16T09:25:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CAR-SAM introduces MatMul-Aware Compensation and Joint Cross-Attention Reconstruction to enable stable 4-bit post-training quantization of SAM, outperforming prior PTQ methods by 14.6% mAP on SAM-B and 6.6% on SAM-L.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14513","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HASTE: Training-Free Video Diffusion Acceleration via Head-Wise Adaptive Sparse Attention","primary_cat":"cs.CV","submitted_at":"2026-05-14T07:57:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HASTE delivers up to 1.93x speedup on Wan2.1 video DiTs via head-wise adaptive sparse attention using temporal mask reuse and error-guided per-head calibration while preserving video quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12464","ref_index":132,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Search Your Block Floating Point Scales!","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:50:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ScaleSearch optimizes block floating point scales via fine-grained search to cut quantization error by 27% for NVFP4, improving PTQ by up to 15 points on MATH500 for Qwen3-8B and attention PPL by 0.77 on Llama 3.1 70B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00140","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Technical Report: Activation Residual Hessian Quantization (ARHQ) for Low-Bit LLM Quantization","primary_cat":"cs.LG","submitted_at":"2026-04-30T18:55:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ARHQ isolates error-sensitive weight directions in LLMs via truncated SVD on the scaled matrix W G_x^{1/2} from activation residuals, improving SNR and preserving performance under aggressive low-bit quantization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22748","ref_index":221,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Agentic World Modeling: Foundations, Capabilities, Laws, and Beyond","primary_cat":"cs.AI","submitted_at":"2026-04-24T17:48:47+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22577","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"QuantClaw: Precision Where It Matters for OpenClaw","primary_cat":"cs.AI","submitted_at":"2026-04-24T14:10:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"QuantClaw dynamically routes precision in agent workflows to cut cost by up to 21.4% and latency by 15.7% while keeping or improving task performance.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[22] Ji-Fu Li, Manyi Zhang, Xiaobo Xia, Han Bao, Haoli Bai, Zhenhua Dong, and Xianzhi Yu. Batquant: Outlier- resilient mxfp4 quantization via learnable block-wise optimization. arXiv preprint arXiv:2603.16590, 2026. [23] Xiaohao Liu, Xiaobo Xia, Manyi Zhang, Ji-Fu Li, Xianzhi Yu, Fei Shen, Xiu Su, See-Kiong Ng, and Tat-Seng Chua. Freeact: Freeing activations for llm quantization. arXiv preprint arXiv:2603.01776, 2026. [24] Muyang Li, Yujun Lin, Zhekai Zhang, Tianle Cai, Xiuyu Li, Junxian Guo, Enze Xie, Chenlin Meng, Jun-Yan Zhu, and Song Han. Svdquant: Absorbing outliers by low-rank components for 4-bit diffusion models. arXiv preprint arXiv:2411.05007, 2024. [25] Zechun Liu, Changsheng Zhao, Igor Fedorov, Bilge Soran, Dhruv Choudhary, Raghuraman Krishnamoorthi, Vikas Chandra, Yuandong Tian, and Tijmen Blankevoort."},{"citing_arxiv_id":"2604.18348","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AdaCluster: Adaptive Query-Key Clustering for Sparse Attention in Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-20T14:43:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AdaCluster delivers a training-free adaptive query-key clustering framework for sparse attention in video DiTs, yielding 1.67-4.31x inference speedup with negligible quality loss on CogVideoX-2B, HunyuanVideo, and Wan-2.1.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Diffusion model quantization.Quantization has been widely studied to accelerate diffusion models and reduce memory usage. For example, PTQD [12] and PQD [53] in- vestigate post-training quantization for diffusion pipelines, while Q-DM [21] and Q-Diffusion [17] target low-bit quan- tization. On the other hand, low-rank quantization meth- ods such as SVDQuant [18] and IntLoRA [10] exploit a low-rank adapter in conjunction with quantization to further improve efficiency. These methods are orthogonal to ours: they lower numerical precision, while we reduce attention workload through token clustering and selection. Training-based sparse attention.VSA [57] learns dy- namic sparsity via two-stage attention: coarse tiling se-"},{"citing_arxiv_id":"2604.09742","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Efficient Matrix Implementation for Rotary Position Embedding","primary_cat":"cs.LG","submitted_at":"2026-04-10T00:17:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"RoME reformulates RoPE as matrix operations to eliminate dimension-specific vector overhead and enable fused execution on modern hardware while remaining mathematically equivalent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06916","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FP4 Explore, BF16 Train: Diffusion Reinforcement Learning via Efficient Rollout Scaling","primary_cat":"cs.LG","submitted_at":"2026-04-08T10:14:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sol-RL decouples FP4-based candidate exploration from BF16 policy optimization in diffusion RL, delivering up to 4.64x faster convergence with maintained or superior alignment performance on models like FLUX.1 and SD3.5.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"the exact FP4 format implemented in the NVIDIA Blackwell architecture, achieving remarkable precision without introducing complex mechanism designs. Quantization of diffusion models is also extensively explored for inference acceleration. To address distribution shifts across denoising timesteps, early works [57, 58, 59] designed timestep-aware calibration and correlation-based noise correction. Recent advancements like SVDQuant [60] have successfully bridged the gap to 4-bit inference by absorbing activation outliers through Singular Value Decomposition (SVD). Quantized inference has been introduced into reinforcement learning to alleviate the massive computational bottle- neck. Frameworks such as FlashRL and QeRL [61, 62] have demonstrated substantial speedups via quantized rollout."},{"citing_arxiv_id":"2605.02905","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"eOptShrinkQ: Near-Lossless KV Cache Compression Through Optimal Spectral Denoising and Quantization","primary_cat":"cs.LG","submitted_at":"2026-04-06T02:05:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"eOptShrinkQ compresses KV caches to ~2.2 bits per entry via optimal spectral shrinkage and quantization, outperforming prior methods on LongBench while matching FP16 on multi-needle retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02570","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"WSVD: Weighted Low-Rank Approximation for Fast and Efficient Execution of Low-Precision Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-02T22:49:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"WSVD delivers over 1.8x faster VLM decoding via weighted low-rank approximation at fine granularity plus quantization, without accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.20309","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"QuantVLA: Scale-Calibrated Post-Training Quantization for Vision-Language-Action Models","primary_cat":"cs.LG","submitted_at":"2026-02-23T19:55:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QuantVLA is the first post-training quantization framework for VLA models that quantizes the diffusion transformer action head and reports higher task success rates than full-precision baselines with roughly 70% memory savings on the quantized components.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.02010","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Four Over Six: More Accurate NVFP4 Quantization with Adaptive Block Scaling","primary_cat":"cs.CL","submitted_at":"2025-12-01T18:59:45+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Four Over Six adaptively scales blocks in NVFP4 quantization to smaller FP4 values, making representable value distributions more uniform and reducing quantization error especially for near-maximal values.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.09505","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Combating the Memory Walls: Optimization Pathways for Long-Context Agentic LLM Inference","primary_cat":"cs.AR","submitted_at":"2025-09-11T14:49:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PLENA introduces a co-designed system with three optimization pathways for long-context agentic LLM inference, claiming up to 2.23x throughput over A100 and 4.04x energy efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}