{"total":52,"items":[{"citing_arxiv_id":"2606.30516","ref_index":61,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"HASTE: A Framework for Training-Free, Dynamic, and Steerable Compression of Pre-Trained Convolutional Neural Networks","primary_cat":"cs.CV","submitted_at":"2026-06-29T16:24:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HASTE enables training-free dynamic compression of pre-trained CNNs by patch-wise LSH-based merging of redundant channels, reporting 46.2% FLOPs reduction on ResNet34 CIFAR-10 with 1.25% accuracy drop.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26749","ref_index":262,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Structure Before Collapse: Transient semantic geometry in next-token prediction","primary_cat":"cs.LG","submitted_at":"2026-06-25T08:33:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Semantic geometry emerges transiently early in next-token prediction training before collapsing to Neural Collapse symmetry in synthetic settings with latent semantic factors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26719","ref_index":11,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Full spectrum Unlearnable Examples via Spectral Equalization","primary_cat":"cs.CV","submitted_at":"2026-06-25T07:53:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FUSE creates full-spectrum unlearnable perturbations using random spectral masking during training and cross-band guidance to enforce consistency between frequency components.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28401","ref_index":94,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Vision-driven Preference Synthesis for Mitigating Hallucinations in VLMs","primary_cat":"cs.CV","submitted_at":"2026-06-24T11:06:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ViPSy constructs policy-aligned and visually grounded preference pairs for VLMs via visual cues from image variants, yielding SOTA hallucination reductions of 35.7% on AMBER and 24.5% on Object HalBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21590","ref_index":41,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Radial Basis Function Networks as Projection Heads in Self-Supervised Learning","primary_cat":"cs.CV","submitted_at":"2026-06-19T16:46:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RBFN projection heads serve as competitive replacements for MLP heads in SSL and enable SNS, a label-free metric from RBF parameters that correlates strongly with logistic regression evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11827","ref_index":41,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Jaguar: Fast Private CNN Inference with Power-of-Two Homomorphic Arithmetic","primary_cat":"cs.CR","submitted_at":"2026-06-10T09:04:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Jaguar replaces prime-modulus HE with power-of-two arithmetic to enable coefficient-domain convolution and local-shift truncation, reporting 2-3.7x lower latency than Cheetah and Rhombus on ResNet-18/50 and MobileNetV2.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08833","ref_index":6,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CSFlow: Aligning Flow Matching with Human Contrast Sensitivity","primary_cat":"cs.CV","submitted_at":"2026-06-07T20:52:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CSFlow derives inference-time timestep weights for flow matching by matching per-step frequency content to human CSF, yielding 4.7% FID reduction and smaller gains on IS and GenEval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05471","ref_index":32,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Formal Concept Lattices are Good Semantic Scaffolds for Concept-Based Learning","primary_cat":"cs.CV","submitted_at":"2026-06-03T21:50:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Formal concept lattices guide staged, hierarchical concept learning in deep networks to produce more interpretable and semantically structured representations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00746","ref_index":169,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Scaling Parallel Sequence Models to Foundation-Scale Vision Encoders","primary_cat":"cs.CV","submitted_at":"2026-05-30T14:29:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"C-GSPN scales 2D spatial propagation to foundation vision encoders via a fast CUDA kernel, compressed blocks, and two-stage distillation, matching ViT performance with 15% fewer parameters and 4x block speedup at 2K resolution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28966","ref_index":33,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"The Trust Paradox: How CS Researchers Engage LLM Leaderboards","primary_cat":"cs.CL","submitted_at":"2026-05-27T18:09:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CS researchers show pragmatic skepticism toward LLM leaderboards, using them despite distrust while preferring peer networks, arena leaderboards, and cost transparency as key missing feature.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23033","ref_index":57,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Uncovering the Latent Potential of Deep Intermediate Representations","primary_cat":"cs.LG","submitted_at":"2026-05-21T20:58:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces LOES, a constructive spectral method to select task-discriminative subspaces from intermediate layer embeddings, and GeoReg for enforcing simplicial class geometry during fine-tuning, with reported gains increasing with model depth across modalities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22200","ref_index":67,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"OSS: Open Suturing Skills Vision-Based Assessment Challenge 2024-2025","primary_cat":"cs.CV","submitted_at":"2026-05-21T09:04:17+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The OSS Challenge provides benchmarks showing spatiotemporal video models excel at open suturing skill classification and OSATS scoring but struggle with keypoint tracking under occlusion.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22132","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Accelerating Vision Foundation Models with Drop-in Depthwise Convolution","primary_cat":"cs.CV","submitted_at":"2026-05-21T08:07:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Replacing selected attention heads in pretrained ViTs with depthwise convolutions, identified by simple strategies and recovered via fine-tuning, delivers 17-20% inference speedup on image tasks with minimal accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22011","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Rethinking Token Reduction for Diffusion Models via Output-Similarity-Awareness","primary_cat":"cs.CV","submitted_at":"2026-05-21T05:18:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DiTo shifts token reduction in DiTs to output token similarity, reusing prior-step matches across timesteps with PMR scheduling and frequency-aware penalties to raise PSNR at given speedups.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21143","ref_index":115,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CoarseSoundNet: Building a reliable model for ecological soundscape analysis","primary_cat":"cs.SD","submitted_at":"2026-05-20T13:18:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper introduces CoarseSoundNet, a deep learning model for classifying biophony, geophony, and anthropophony in passive acoustic monitoring recordings, reporting performance gains from additional similar data, a silence class, and decision thresholds, plus a case study on acoustic index trends.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20871","ref_index":128,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Robustness Analysis of USmorph: II. Optimizing Feature Extraction, Dimensionality Reduction, and Clustering for Unsupervised Galaxy Morphology Classification","primary_cat":"astro-ph.GA","submitted_at":"2026-05-20T08:08:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Optimizes ImageNet-pretrained AlexNet, UMAP, and a bagging multi-cluster voting scheme with K-means, Birch and Agg for unsupervised galaxy morphology classification, reporting improved stability and consistency with galaxy evolution expectations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20347","ref_index":31,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Symmetrization of Loss Functions for Robust Training of Neural Networks in the Presence of Noisy Labels","primary_cat":"cs.LG","submitted_at":"2026-05-19T18:03:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Symmetrizing cross-entropy produces the unique convex multi-class unhinged loss, which locally approximates other symmetric losses, and enables new interpolating losses SGCE and alpha-MAE with competitive performance on noisy-label benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19371","ref_index":29,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Multi-Scale Generative Modeling with Heat Dissipation Flow Matching","primary_cat":"cs.CV","submitted_at":"2026-05-19T05:08:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HDFM adds a continuous heat-dissipation (blur) process to flow matching, aligns an interpolated path to fix ill-posed inverse heat dissipation, and uses x-prediction to ease high-dimensional regression, yielding better performance than most baselines on image datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16468","ref_index":74,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Mechanistically Interpretable Neural Encoding Reveals Fine-Grained Functional Selectivity in Human Visual Cortex","primary_cat":"cs.CV","submitted_at":"2026-05-15T11:28:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MINE uses mechanistic interpretability on language-aligned image representations to generate per-voxel feature descriptions, validated via image generation and counterfactual edits that causally shift brain activation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13842","ref_index":266,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"From DES to KiDS: Domain adaptation for cross-survey detection of low-surface-brightness galaxies","primary_cat":"astro-ph.GA","submitted_at":"2026-05-13T17:58:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Domain adaptation with an ensemble of CNN and transformer models trained on DES detects 20,180 LSBGs and 434 UDGs in KiDS DR5, with structural parameters and environmental trends consistent with known samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12967","ref_index":60,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ImageAttributionBench: How Far Are We from Generalizable Attribution?","primary_cat":"cs.CV","submitted_at":"2026-05-13T04:01:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ImageAttributionBench is a benchmark dataset demonstrating that state-of-the-art image attribution methods lack robustness to image degradation and fail to generalize to semantically disjoint domains.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"1 Introduction In recent years, the emergence and evolution of auto-regressive models [18, 44, 66] and diffusion models [32, 61, 16, 50, 58, 55, 56] have led to AI-generated content (AIGC) becoming increasingly realistic and widely applied across industries, bringing convenience to fields such as entertainment [51, 2, 63], advertising [ 39, 17], and medicine [ 60, 83]. This progress is particularly evident in AI- synthesized images, which have seen gradual improvements in resolution and semantic consistency, accompanied by more accessible generation methods for users. However, issues such as the spread of misinformation [74, 62, 81], privacy violations [48, 35], and fraud [27, 70] have become more serious. Therefore, the importance and necessity of research on"},{"citing_arxiv_id":"2605.12952","ref_index":28,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Debunking Grad-ECLIP: A Comprehensive Study on Its Incorrectness and Fundamental Principles for Model Interpretation","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:35:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Grad-ECLIP is an equivalent but flawed variant of attention-based interpretation, with two principles proposed to ensure model explanations reflect the original model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08976","ref_index":23,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Score-Based Generative Modeling through Anisotropic Stochastic Partial Differential Equations","primary_cat":"cs.CE","submitted_at":"2026-05-09T14:36:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Anisotropic SPDEs preserve geometric data structure over longer timescales in score-based generative modeling, yielding better image quality than standard SDE baselines and flow matching in unconditional and conditional tasks.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Finally, ifg 1 andg 2 both do not depend on the second argument, (3) is a linear parabolic SPDE with additive noise: dUt =α 1(t)∆Ut dt+α 2(t) dWt for allt∈I.(20) I Numerical simulation For the numerical simulation of the forward and backward processes, (3) and (1), we modeled the image space Λ as Λ = (0, d1)×(0, d 2)and decomposed the boundary∂Λaccording to ∂LΛ :={0} ×[0, d 2);(21) ∂T Λ := [0, d1)× {d 2};(22) ∂RΛ :={d 1} ×(0, d 2];(23) ∂BΛ := (0, d1]× {0}(24) into its left, top, right and bottom part. We discretized the derivatives using a mixture of forward, backward and central finite differences, respecting Neumann boundary conditions. I.1 Domain discretization After discretization, we decomposed the discretized domain D={0, . . . , d 1} × {0, . . . , d2} in the same spirit into its interior,"},{"citing_arxiv_id":"2605.08731","ref_index":13,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Single-Thread JPEG Decoder Benchmarks Mis-Evaluate ML Data Loaders","primary_cat":"cs.PF","submitted_at":"2026-05-09T06:34:17+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Single-thread JPEG benchmarks misrank decoders for ML DataLoader use, with rankings changing across CPUs and worker counts; torchvision and simplejpeg perform best in measured DataLoader tiers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06987","ref_index":233,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Response Time Enhances Alignment with Heterogeneous Preferences","primary_cat":"cs.LG","submitted_at":"2026-05-07T22:05:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Response times modeled as drift-diffusion processes enable consistent estimation of population-average preferences from heterogeneous anonymous binary choices.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06207","ref_index":50,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Taming the Entropy Cliff: Variable Codebook Size Quantization for Autoregressive Visual Generation","primary_cat":"cs.CV","submitted_at":"2026-05-07T13:13:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Variable codebook sizes that increase along the sequence in visual tokenizers reduce generation FID scores significantly for autoregressive models on ImageNet.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05331","ref_index":10,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ViTok-v2: Scaling Native Resolution Auto-Encoders to 5 Billion Parameters","primary_cat":"cs.CV","submitted_at":"2026-05-06T18:03:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ViTok-v2 is a 5B-parameter native-resolution image autoencoder using NaFlex and DINOv3 loss that matches or exceeds prior tokenizers at 256p and outperforms them at 512p and above while advancing the Pareto frontier in joint scaling with generators.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03812","ref_index":45,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"GPUBreach: Privilege Escalation Attacks on GPUs using Rowhammer","primary_cat":"cs.CR","submitted_at":"2026-05-05T14:40:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Unprivileged CUDA kernels can use Rowhammer to tamper with GPU page tables for targeted privilege escalation, leaking cryptographic keys and escalating to CPU root access by bypassing IOMMU.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02109","ref_index":30,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Detecting Adversarial Data via Provable Adversarial Noise Amplification","primary_cat":"cs.LG","submitted_at":"2026-05-04T00:08:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A provable adversarial noise amplification theorem under sufficient conditions enables a custom-trained detector that identifies adversarial examples at inference time using enhanced layer-wise noise signals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27704","ref_index":7,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A generalised pre-training strategy for deep learning networks in semantic segmentation of remotely sensed images","primary_cat":"cs.CV","submitted_at":"2026-04-30T10:48:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A novel pre-training strategy for ImageNet-initialized models achieves state-of-the-art semantic segmentation performance on four remote sensing datasets (iSAID, MFNet, PST900, Potsdam) by reducing domain-specific feature learning during pre-training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20268","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Opportunistic Bone-Loss Screening from Routine Knee Radiographs Using a Multi-Task Deep Learning Framework with Sensitivity-Constrained Threshold Optimization","primary_cat":"cs.CV","submitted_at":"2026-04-22T07:12:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"STR-Net achieves AUROC of 0.933 for binary bone-loss screening and 0.801 correlation for T-score estimation from knee X-rays on a held-out test set.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12780","ref_index":46,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Efficient Adversarial Training via Criticality-Aware Fine-Tuning","primary_cat":"cs.CV","submitted_at":"2026-04-14T14:17:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAAT selects critical parameters for adversarial robustness in ViTs and applies PEFT to tune only those, yielding a 4.3% robustness drop versus full AT while using ~6% of parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11720","ref_index":29,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"On the Robustness of Watermarking for Autoregressive Image Generation","primary_cat":"cs.CV","submitted_at":"2026-04-13T16:56:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Watermarking schemes for autoregressive image generation fail against removal and forgery attacks, enabling false detections and undermining synthetic content filtering.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11043","ref_index":42,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"EmergentBridge: Improving Zero-Shot Cross-Modal Transfer in Unified Multimodal Embedding Models","primary_cat":"cs.AI","submitted_at":"2026-04-13T06:15:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EmergentBridge enhances zero-shot cross-modal performance on unpaired modalities by learning noisy bridge anchors from existing alignments and enforcing proxy alignment only in the orthogonal subspace to avoid gradient interference.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"ImageNet Large Scale Visual Recognition Challenge.International Journal of Computer Vision (IJCV)115, 3 (2015), 211-252. doi:10.1007/s11263-015-0816-y [41] Shuran Song, Samuel P Lichtenberg, and Jianxiong Xiao. 2015. Sun rgb-d: A rgb-d scene understanding benchmark suite. InProceedings of the IEEE conference on computer vision and pattern recognition. 567-576. [42] Alex Tamkin, Mike Wu, and Noah D. Goodman. 2020. Viewmaker Networks: Learning Views for Unsupervised Representation Learning.ArXivabs/2010.07432 (2020). https://api.semanticscholar.org/CorpusID:222381644 [43] Yonglong Tian, Dilip Krishnan, and Phillip Isola. 2020. Contrastive multiview coding. InComputer Vision-ECCV 2020: 16th European Conference, Glasgow, UK,"},{"citing_arxiv_id":"2604.08230","ref_index":97,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Generalization Under Scrutiny: Cross-Domain Detection Progresses, Pitfalls, and Persistent Challenges","primary_cat":"cs.CV","submitted_at":"2026-04-09T13:21:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A survey that organizes methods for cross-domain object detection into a taxonomy, analyzes domain shift across detection stages, and outlines persistent challenges.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"T able 4Common datasets used in CDOD benchmarks, summarizing modality, scale, annotation volume, typical role, and dominant shift type. Acronyms: S = Source, T = Target. Symbol:∼ indicates approximate counts. Dataset Y ear Modality #Images #Cls #Anno Role Domain Shift PASCAL VOC [95] 2007-2012 RGB∼16.5K∼20∼40K S/T mild scene shift MS COCO [96] 2014 RGB∼330K∼80∼2.5M S scene diversity ImageNet DET [97] 2013 RGB∼450K∼200∼500K S fine-grained cate- gory Cityscapes [98] 2016 RGB∼3.0K∼8∼65K T urban scene shift Foggy Cityscapes [99] 2018 RGB∼3.0K∼8∼65K T weather (clear→fog) SIM10K [100] 2018 RGB (Synthetic)∼10K∼1∼58K S synth→real GTA5 [101] 2016 RGB (Synthetic)∼25K∼9∼300K S synth→real SYNTHIA [102] 2016 RGB (Synthetic)∼9.4K∼9∼200K S synth→real BDD100K [103] 2020 RGB / Video∼100K∼10∼1."},{"citing_arxiv_id":"2604.06783","ref_index":68,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Insights from Visual Cognition: Understanding Human Action Dynamics with Overall Glance and Refined Gaze Transformer","primary_cat":"cs.CV","submitted_at":"2026-04-08T07:52:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The OG-ReG Transformer achieves state-of-the-art results on Kinetics-400, Something-Something v2, and Diving-48 by combining global glance and local gaze processing paths.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.20779","ref_index":50,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CHiQPM: Calibrated Hierarchical Interpretable Image Classification","primary_cat":"cs.LG","submitted_at":"2025-11-25T19:16:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CHiQPM is a hierarchical interpretable image classifier that maintains 99% of non-interpretable model accuracy while supplying contrastive global explanations, human-like hierarchical paths, and calibrated interpretable set predictions via conformal prediction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.01706","ref_index":12,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Representational Alignment Across Model Layers and Brain Regions with Multi-Level Optimal Transport","primary_cat":"cs.LG","submitted_at":"2025-10-02T06:25:06+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Multi-Level Optimal Transport (MOT) jointly infers soft layer couplings and neuron transport plans to produce global alignment scores and structured hierarchical correspondences between networks of varying depths.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.14165","ref_index":62,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Where Do Tokens Go? Understanding Pruning Behaviors in STEP at High Resolutions","primary_cat":"cs.CV","submitted_at":"2025-09-17T16:48:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"STEP uses dynamic superpatch merging via dCTS and early token exits to cut token count by 2.5x and computational complexity by up to 4x on ViT-Large for high-res segmentation, with at most 2% accuracy drop and 40% tokens halted early.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.06656","ref_index":23,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ClusterMark: Towards Robust Watermarking for Autoregressive Image Generators with Visual Token Clustering","primary_cat":"cs.CV","submitted_at":"2025-08-08T19:14:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ClusterMark applies visual token clustering to create robust in-generation watermarks for autoregressive image models, improving detectability under perturbations compared to direct token biasing while preserving quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.07776","ref_index":57,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SCOOTER: A Human Evaluation Framework for Unrestricted Adversarial Examples","primary_cat":"cs.CV","submitted_at":"2025-07-10T13:56:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SCOOTER supplies best-practice guidelines, open tools, and a 3K-image benchmark with 34K+ human ratings showing that six tested unrestricted attacks produce images humans can detect as fake.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.04941","ref_index":28,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"TOAST: Transformer Optimization using Adaptive and Simple Transformations","primary_cat":"cs.LG","submitted_at":"2024-10-07T11:35:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TOAST approximates full transformer blocks in pretrained models via lightweight closed-form mappings to cut parameters and FLOPs without retraining or finetuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2308.06197","ref_index":22,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Complex Facial Expression Recognition Using Deep Knowledge Distillation of Basic Features","primary_cat":"cs.CV","submitted_at":"2023-08-11T15:42:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Continual learning via knowledge distillation achieves SOTA 74.28% accuracy on new compound facial expression classes and 100% in one-shot learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2210.08402","ref_index":68,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"LAION-5B: An open large-scale dataset for training next generation image-text models","primary_cat":"cs.CV","submitted_at":"2022-10-16T00:08:18+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LAION-5B is an openly released dataset of 5.85 billion CLIP-filtered image-text pairs that enables replication of foundational vision-language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1907.09236","ref_index":73,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"RGB-D image-based Object Detection: from Traditional Methods to Deep Learning Techniques","primary_cat":"cs.CV","submitted_at":"2019-07-22T11:18:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A survey of RGB-D object detection from traditional hand-crafted features with machine learning to deep learning techniques.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1907.00382","ref_index":11,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Adversarially Trained Deep Neural Semantic Hashing Scheme for Subjective Search in Fashion Inventory","primary_cat":"cs.CV","submitted_at":"2019-06-30T13:59:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Adversarial deep semantic hashing for fashion retrieval achieves 90.65% mAP, outperforming prior deep Cauchy hashing at 53.26%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1907.00103","ref_index":17,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Learning Effective Loss Functions Efficiently","primary_cat":"cs.LG","submitted_at":"2019-06-28T22:35:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An anytime algorithm for learning loss functions that is asymptotically optimal in the worst case and experimentally faster than prior methods for hyperparameter tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1906.11979","ref_index":21,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Utility-Preserving GAN for Face Obscuration","primary_cat":"cs.CV","submitted_at":"2019-06-27T22:01:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UP-GAN uses a GAN to obscure faces while preserving utility attributes like age, gender, pose, and expression better than blurring or pixelation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1906.10822","ref_index":19,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Gradient Noise Convolution (GNC): Smoothing Loss Function for Distributed Large-Batch SGD","primary_cat":"cs.LG","submitted_at":"2019-06-26T02:54:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GNC convolves stochastic gradient noise to smooth sharp minima in large-batch SGD, outperforming isotropic noise for better generalization in distributed deep learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1906.09868","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Pose Estimation for Non-Cooperative Rendezvous Using Neural Networks","primary_cat":"cs.CV","submitted_at":"2019-06-24T11:51:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SPN is a CNN that detects a spacecraft bounding box, classifies then regresses attitude, and optimizes position via Gauss-Newton, achieving degree-level attitude and cm-level position errors on real images after training only on synthetic data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}