{"total":27,"items":[{"citing_arxiv_id":"2605.23045","ref_index":13,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"The TIME Machine: On The Power of Motion for Efficient Perception","primary_cat":"cs.CV","submitted_at":"2026-05-21T21:22:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TIME is a motion-based embedding from point tracks, trained only on synthetic data via masked autoencoding, that matches state-of-the-art video model performance with up to 10,000x less training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21661","ref_index":10,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Hierarchical Variational Policies for Reward-Guided Diffusion","primary_cat":"cs.LG","submitted_at":"2026-05-20T19:13:28+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A hierarchical variational formulation amortizes test-time guidance in diffusion models to achieve strong quality-speed tradeoffs with significantly reduced inference compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18464","ref_index":5,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"PERL: Parameter Efficient Reasoning in CLIP Latent Space","primary_cat":"cs.CV","submitted_at":"2026-05-18T14:25:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PERL augments frozen CLIP with a shared recurrent reasoning module of roughly 6K parameters that iteratively refines representations via latent token injection, delivering strong base-to-novel and transfer performance across 15 benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17633","ref_index":6,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"SparseSAM: Structured Sparsification of Activations in Segment Anything Models","primary_cat":"cs.CV","submitted_at":"2026-05-17T19:54:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SparseSAM achieves 2x faster inference and 2.8x memory reduction in SAM with only 0.004 mIoU loss at 0.4 density via Stripe-Sort Attention and Residual-Consistency MLP.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16147","ref_index":38,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Registers Matter for Pixel-Space Diffusion Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-15T16:27:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Register tokens enhance pixel-space DiT training and output quality via cleaner high-noise feature maps, and a dual-stream design adds further gains with little overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16423","ref_index":32,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Nonlinear Bipolar Compensation: Handling Outliers in Post-Training Quantization","primary_cat":"cs.CV","submitted_at":"2026-05-14T14:55:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Nonlinear Bipolar Compensation with Bipolar Logarithmic Transformation reduces outlier effects in post-training quantization by performing compensation in a compressed transformed space.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14413","ref_index":4,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MahaVar: OOD Detection via Class-wise Mahalanobis Distance Variance under Neural Collapse","primary_cat":"cs.LG","submitted_at":"2026-05-14T05:58:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MahaVar augments the Mahalanobis OOD score with class-wise distance variance, which is theoretically higher for in-distribution samples under relaxed Neural Collapse geometry.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12678","ref_index":18,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"No One Knows the State of the Art in Geospatial Foundation Models","primary_cat":"cs.CV","submitted_at":"2026-05-12T19:29:51+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An audit of 152 papers reveals that geospatial foundation models lack standardized evaluations, training controls, and weight releases, so no one knows the state of the art.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12491","ref_index":146,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Elastic Attention Cores for Scalable Vision Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:59:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VECA learns effective visual representations using core-periphery attention where patches interact exclusively via a resolution-invariant set of learned core embeddings, achieving linear O(N) complexity while maintaining competitive performance.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"datasets compared to the full model with 64 core tokens. Overall, these results indicate that VECA effectively captures high-quality dense information through a compact set of learnable core tokens with linear-time inference. Image Classification.Image classification examines a model's capability to understand global information. Table 2 includes the results on ImageNet-1K (IN1K) [146], ImageNetV2 (INv2) [147], ImageNet-ReaL (IN-Real) [ 148], Places365 (Places) [ 149], Food101 (Food) [ 150], SUN397 (SUN) [151], Oxford-Pets (Oxford) [ 152], CUB-200 (CUB) [ 153]. Note that for fairness, INv2 and IN-Real are direct probe transfers from IN1k without further training. Here, VECA demonstrates strong performance on image classification across all datasets."},{"citing_arxiv_id":"2605.11563","ref_index":9,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"TCP-SSM: Efficient Vision State Space Models with Token-Conditioned Poles","primary_cat":"cs.CV","submitted_at":"2026-05-12T05:49:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TCP-SSM conditions stable poles on visual tokens to explicitly control memory decay and oscillation in SSMs, cutting computation up to 44% while matching or exceeding accuracy on classification, segmentation, and detection.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Here Lblk is the number of distilled layers, Mℓ the token count, Eℓ the channel dimension, and sg(·) the stop-gradient. The total loss L=L task +λ distillLdistill leverages this intermediate guidance to encourage our efficient operator to emulate the teacher's representations while jointly optimizing the final downstream task objectiveL task. 4 Experiments 4.1 Image Classification Settings:We evaluate on the ImageNet-1K dataset [ 9] and report single-crop top-1 and top-5 validation accuracies. For fair comparison, we follow the DeiT [ 54] training recipe with AdamW optimization, cosine learning-rate decay, and standard augmentations including random resized cropping, horizontal flipping, label smoothing, and MixUp [70]. All experiments are performed on six RTX PRO 6000 Blackwell GPUs."},{"citing_arxiv_id":"2605.11107","ref_index":6,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Birds of a Feather Flock Together: Background-Invariant Representations via Linear Structure in VLMs","primary_cat":"cs.CV","submitted_at":"2026-05-11T18:13:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Exploiting linear structure in VLM embeddings, a synthetic-data pre-training method yields background-invariant representations that exceed 90% worst-group accuracy on Waterbirds even under 100% spurious correlation with no minority examples in training.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"We evaluate Equation 2 on 300,000 scene composites (Ia,b) and their constituents (Ia, Ib), constructed from random MS-COCO objects paired with random Places365 backgrounds [20, 47]. Specifically, we compare CLIP and SigLIP2 vision encoders against supervised vision encoders (ImageNet-1K/21K) and self-supervised (SSL) representatives like MAE and DINOv2 [6, 11, 26, 31]. Finally, we evaluate both ViT and ConvNeXt backbones [8, 22], with results reported in Table 1. To ensure parity with VLM designs, all embeddings are unit-normalized for a fair comparison. Table 1: S scores (mean ± std over 300,000 composites) across VLM, supervised, and SSL backbones; as S approaches 1, the degree of linear additivity between foreground and background representations"},{"citing_arxiv_id":"2605.10756","ref_index":8,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"TINS: Test-time ID-prototype-separated Negative Semantics Learning for OOD Detection","primary_cat":"cs.CV","submitted_at":"2026-05-11T15:54:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TINS improves OOD detection by learning negative semantics at test time with ID-prototype separation, cutting average FPR95 from 14.04% to 6.72% on the Four-OOD benchmark with ImageNet-1K.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"negative semantics, we randomly sample M entries from the merged set to form the updated bank instead of always keeping the most ID-prototype-separated candidates. The buffer is then cleared, i.e.,Q new ←∅. An overview of our method is shown in Figure 2 and summarized in Algorithm 1 of Appendix C. 4 Experiments 4.1 Setup Datasets.We mainly use ImageNet-1K [ 8] as the ID dataset. Following common practice [6, 25, 34, 55], we evaluate on Four-OOD benchmark [7, 46, 53, 62] and additionally report results under the OpenOOD benchmark [58]. We also examine the generality of our method on diverse ID datasets, including Food-101 [4], ImageNet-Sketch [50], ImageNet-R [21], and ImageNet-V2 [41]. Implementation Details."},{"citing_arxiv_id":"2605.09296","ref_index":9,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Micro-Defects Expose Macro-Fakes: Detecting AI-Generated Images via Local Distributional Shifts","primary_cat":"cs.CV","submitted_at":"2026-05-10T03:44:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MDMF detects AI-generated images by learning patch-level forensic signatures and quantifying their distributional discrepancies with MMD, yielding larger separation than global methods when micro-defects are present.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"(see Section 2.4). This analysis provides a principled explanation for why aggregating localized evidence at the distribution level leads to reliable separation, even when individual artifacts are weak. We conduct extensive experiments to evaluate the effectiveness and generalization of MDMF. Our evaluation covers widely used benchmarks, including ImageNet [9], LSUN-Bedroom [51], GenImage [61], the in-the-wild WildRF [ 3], and the recent LDMFakeDetect [ 31]. Across them, MDMF consistently achieves strong and stable detection performance, demonstrating robustness to diverse generative architectures and training paradigms. To further stress-test the method, we conduct case studies on OpenSora-generated videos [ 57], where many existing detectors degrade substantially"},{"citing_arxiv_id":"2605.07915","ref_index":16,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"What Matters for Diffusion-Friendly Latent Manifold? Prior-Aligned Autoencoders for Latent Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-08T15:52:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Prior-Aligned AutoEncoders shape latent manifolds with spatial coherence, local continuity, and global semantics to improve latent diffusion, achieving SOTA gFID 1.03 on ImageNet 256x256 with up to 13x faster convergence.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"[14] Junyu Chen, Dongyun Zou, Wenkun He, Junsong Chen, Enze Xie, Song Han, and Han Cai. Dc-ae 1.5: Accelerating diffusion model convergence with structured latent space, 2025. URL https://arxiv.org/abs/2508.00413. [15] Timothée Darcet, Maxime Oquab, Julien Mairal, and Piotr Bojanowski. Vision transformers need registers.arXiv preprint arXiv:2309.16588, 2023. [16] Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. Imagenet: A large- scale hierarchical image database. In2009 IEEE conference on computer vision and pattern recognition, pages 248-255. Ieee, 2009. [17] Guanfang Dong, Luke Schultz, Negar Hassanpour, and Chao Gao. Repack then refine: Efficient diffusion transformer with vision foundation model, 2026."},{"citing_arxiv_id":"2605.07359","ref_index":9,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"UniISP: A Unified ISP Framework for Both Human and Machine Vision","primary_cat":"cs.CV","submitted_at":"2026-05-08T07:13:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniISP unifies ISP processing with a Hybrid Attention Module and Feature Adapter to produce images that are both visually pleasing for humans and informative for computer vision models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"However, these methods lacked consideration of the physical sensor noise in the conversion process from photons to raw images, especially under low-light conditions [26, 31, 46]. Moreover, training from scratch on raw data would forgo the current visual models pre-trained on large-scale sRGB data, especially since existing raw image datasets [34, 57] are far fewer than RGB datasets [9, 25]. Therefore, subsequent research has mainly focused on finding methods to jointly optimize the ISP and backend computer vision models [ 10, 33, 35]. [ 36] designed a sequential CNN model that repeatedly adjusts the hyperparameters of the ISP to adapt to downstream tasks, demonstrating the advantages of this approach. [ 7] considered the physical sensor noise model"},{"citing_arxiv_id":"2605.06905","ref_index":30,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Conservative Flows: A New Paradigm of Generative Models","primary_cat":"cs.LG","submitted_at":"2026-05-07T20:06:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Conservative flows generate by running probability-preserving stochastic dynamics initialized at data points rather than noise, using corrected Langevin or predictor-corrector mechanisms on top of any pretrained flow model and showing gains on Swiss-roll, ImageNet-256 and Oxford Flowers-102.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06610","ref_index":25,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"SoftSAE: Dynamic Top-K Selection for Adaptive Sparse Autoencoders","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:28:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SoftSAE replaces fixed-K sparsity in autoencoders with a learned, input-dependent number of active features via a soft top-k operator.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"the vision setting, we extract image embeddings from a pretrained CLIP encoder (ViT-B/16) and train our model directly on these representations, following the setup of [7]. For the language setting, we operate on post-residual activations from layer 12 of Gemma-2-2B, following standard practices in mechanistic interpretability. For CLIP SAEs, we use CC3M for training, and Imagenet-1k/Imagenet-100 [25] for evaluation. The dictionary size is set to d =4096. All CLIP evaluations are conducted across five different target sparsity levels,k∈ {60,100,140,180,220}. Gemma-2-2B SAEs are trained on the FineWeb dataset [ 26], with the dictionary size set to d = 214 =16384. All Gemma-2-2B evaluations are performed across five different target sparsity levels,"},{"citing_arxiv_id":"2605.06357","ref_index":15,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Memory Efficient Full-gradient Attacks (MEFA) Framework for Adversarial Defense Evaluations","primary_cat":"cs.LG","submitted_at":"2026-05-07T14:35:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MEFA enables exact full-gradient white-box attacks on iterative stochastic purification defenses like diffusion and Langevin EBMs by trading recomputation for lower memory, revealing vulnerabilities missed by approximate-gradient methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05769","ref_index":44,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Adaptive Selection of LoRA Components in Privacy-Preserving Federated Learning","primary_cat":"cs.LG","submitted_at":"2026-05-07T07:01:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AS-LoRA adaptively chooses which LoRA factor to update per layer and round using a curvature-aware second-order score, eliminating reconstruction error floors and improving performance in DP federated learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02003","ref_index":31,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"RamanBench: A Large-Scale Benchmark for Machine Learning on Raman Spectroscopy","primary_cat":"cs.LG","submitted_at":"2026-05-03T18:12:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RamanBench unifies 74 datasets into the first large-scale reproducible benchmark for ML on Raman spectra, finding tabular foundation models outperform baselines but no method generalizes across datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01711","ref_index":9,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Linear-Time Global Visual Modeling without Explicit Attention","primary_cat":"cs.CV","submitted_at":"2026-05-03T04:51:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dynamic parameterization of standard layers can replace explicit attention for linear-time global visual modeling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01330","ref_index":25,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Colinearity Decay: Training Quantization-Friendly ViTs with Outlier Decay","primary_cat":"cs.CV","submitted_at":"2026-05-02T08:49:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Colinearity-Decay regularizer trains ViTs that maintain or improve full-precision accuracy while delivering higher accuracy after low-bit quantization on ImageNet and COCO tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25299","ref_index":8,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"The Thinking Pixel: Recursive Sparse Reasoning in Multimodal Diffusion Latents","primary_cat":"cs.CV","submitted_at":"2026-04-28T07:09:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A recursive sparse MoE framework integrated into diffusion models iteratively refines visual tokens via gated module selection to improve structured reasoning and image generation performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18094","ref_index":22,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Decision-Aware Attention Propagation for Vision Transformer Explainability","primary_cat":"cs.CV","submitted_at":"2026-04-20T11:10:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DAP improves ViT attribution maps by injecting decision-relevant gradients into attention propagation, producing more class-sensitive and faithful explanations than standard attention rollout.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04552","ref_index":3,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"StableTTA: Improving Vision Model Performance by Training-free Test-Time Adaptation Methods","primary_cat":"cs.CV","submitted_at":"2026-04-06T09:21:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StableTTA improves ImageNet-1K accuracy across 71 vision models by stabilizing logit aggregation under coherent-batch inference and enabling efficient single-forward-pass adaptation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.26037","ref_index":9,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"CoLLM-NAS: Collaborative Large Language Models for Efficient Knowledge-Guided Neural Architecture Search","primary_cat":"cs.AI","submitted_at":"2025-09-30T10:12:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CoLLM-NAS introduces a collaborative two-LLM framework with Navigator, Generator, and Coordinator modules to perform knowledge-guided neural architecture search, reporting state-of-the-art results on ImageNet and NAS-Bench-201 with 4-10x lower search cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2210.08402","ref_index":13,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"LAION-5B: An open large-scale dataset for training next generation image-text models","primary_cat":"cs.CV","submitted_at":"2022-10-16T00:08:18+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LAION-5B is an openly released dataset of 5.85 billion CLIP-filtered image-text pairs that enables replication of foundational vision-language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}