{"total":27,"items":[{"citing_arxiv_id":"2605.23282","ref_index":1,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Discontinuous Galerkin Neural Operator for Pathology Defocus Deblurring","primary_cat":"eess.IV","submitted_at":"2026-05-22T06:50:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DGNO parameterizes integral kernels with discontinuous Galerkin elements for heterogeneous defocus deblurring in pathology images and reports superior performance over prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20751","ref_index":62,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"PACD-Net: Pseudo-Augmented Contrastive Distillation for Glycemic Control Estimation from SMBG","primary_cat":"cs.LG","submitted_at":"2026-05-20T05:50:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PACD-Net uses pseudo-augmented contrastive distillation with a hybrid Swin Transformer-CNN backbone to estimate TAR, TIR, and TBR from sparse SMBG data and outperforms prior methods in accuracy and stability under sparse conditions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20738","ref_index":204,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"STAR-IOD: Scale-decoupled Topology Alignment with Pseudo-label Refinement for Remote Sensing Incremental Object Detection","primary_cat":"cs.CV","submitted_at":"2026-05-20T05:43:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"STAR-IOD applies scale-decoupled topology alignment and K-Means-based pseudo-label refinement to reduce catastrophic forgetting in remote sensing incremental object detection, reporting 1.7% and 2.1% mAP gains on new DIOR-IOD and DOTA-IOD datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19822","ref_index":113,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"ST-TGExplainer: Disentangling Stability and Transition Patterns for Temporal GNN Interpretability","primary_cat":"cs.LG","submitted_at":"2026-05-19T13:16:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ST-TGExplainer disentangles stability and transition patterns in temporal graphs via a self-explainable TGNN guided by a disentangled information bottleneck objective to produce more faithful explanations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18156","ref_index":82,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Semi-LAR: Semi-supervised Contrastive Learning with Linear Attention for Removal of Nighttime Flares","primary_cat":"cs.CV","submitted_at":"2026-05-18T10:02:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Semi-LAR is a semi-supervised contrastive learning framework with linear attention for nighttime flare removal that refines pseudo-labels via quality assessment and uses flare-aware patch-level contrastive losses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17837","ref_index":110,"ref_count":2,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Temporal Aware Pruning for Efficient Diffusion-based Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T04:18:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TAPE applies temporal-aware token pruning with smoothing, reselection, and timestep scheduling to speed up video diffusion models while preserving visual fidelity and coherence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15684","ref_index":72,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"ElasticDiT: Efficient Diffusion Transformers via Elastic Architecture and Sparse Attention for High-Resolution Image Generation on Mobile Devices","primary_cat":"cs.CV","submitted_at":"2026-05-15T07:13:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ElasticDiT introduces an elastic DiT architecture with adjustable spatial compression and block depth plus Shift Sparse Block Attention and a distilled VAE to enable a single model to cover multiple fidelity-latency points for high-resolution image generation on mobile devices.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14689","ref_index":121,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Are Candidate Models Really Needed for Active Learning?","primary_cat":"cs.CV","submitted_at":"2026-05-14T11:03:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Active learning with randomly initialized models achieves comparable results to traditional candidate-model methods, with low-confidence sampling proving most effective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11800","ref_index":4,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"ROMER: Expert Replacement and Router Calibration for Robust MoE LLMs on Analog Compute-in-Memory Systems","primary_cat":"cs.LG","submitted_at":"2026-05-12T08:57:59+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ROMER cuts perplexity by up to 59% in noisy analog CIM environments for MoE LLMs via expert replacement and router recalibration calibrated on real-chip measurements.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"whereE dev is a noise matrix with i.i.d. entries drawn fromN(0,σ 2 dev). ADC Quantization Noise.The analog current accumulation along each bitline must be converted to a digital value by an analog-to-digital converter (ADC). Finite ADC resolution introduces quantization error, which we model as additive uniform noise on the output: ˜yADC =˜ydev +ϵ ADC,ϵ ADC,i ∼ U \u0012 − ∆ 2 , ∆ 2 \u0013 , (4) where ∆=V ref/(2b − 1) is the quantization step size determined by the ADC reference voltageV ref and bit-widthb. 3 Preprint. Under review. Figure 3: Bar chart(left) showing the Perplexity observed on both OLMOE-7B-A1B and Qwen3-30B-A3B models when applying varying degrees of perturbation to their expert selection functions under noisy conditions and expert activation heatmaps under clean"},{"citing_arxiv_id":"2605.11434","ref_index":25,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"FEFormer: Frequency-enhanced Vision Transformer for Generic Knowledge Extraction and Adaptive Feature Fusion in Volumetric Medical Image Segmentation","primary_cat":"eess.IV","submitted_at":"2026-05-12T02:32:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A frequency-enhanced Vision Transformer with FDSA, FGMLP, WAFF, and FCSB modules delivers superior volumetric medical image segmentation performance and efficiency over prior state-of-the-art methods.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"the input features𝑿 1 and𝑿 2 (𝑿1,𝑿 2 ∈ℝ 𝐶×𝐻×𝑊×𝐷 ) into eight sub-bands in the frequency domain by separat- ing low-frequency and high-frequency signals along three dimensions{𝑿 LLL 1 ,𝑿 LLH 1 , ...,𝑿 HHH 1 } ∈ℝ 𝐶× 𝐻 2 × 𝑊 2 × 𝐷 2 and {𝑿LLL 2 ,𝑿 LLH 2 , ...,𝑿 HHH 2 } ∈ℝ 𝐶× 𝐻 2 × 𝑊 2 × 𝐷 2 , respectively, as {𝑿LLL 1 ,𝑿 LLH 1 , ...,𝑿 HHH 1 } =DWT(𝑿 1),(25) {𝑿LLL 2 ,𝑿 LLH 2 , ...,𝑿 HHH 2 } =DWT(𝑿 2).(26) Each sub-band represents different semantics, and take𝑿1 for example: •Sub-band𝑿 LLL 1 represents pure low-frequency se- mantic features. •Sub-bands{𝑿 LLH 1 ,𝑿 LHL 1 ,𝑿 HLL 1 }represent relatively higher-frequency semantic features related to edges and boundaries along three dimensions(𝐷, 𝑊 , 𝐻). •Sub-bands{𝑿 LHH"},{"citing_arxiv_id":"2605.16384","ref_index":96,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Mutual Enhancement Between Global Tokens and Patch Tokens: From Theory to Practice","primary_cat":"cs.CV","submitted_at":"2026-05-11T10:51:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TaTok is a theoretically grounded adaptive tokenization method that uses global tokens and cumulative conditional entropy filtering to reduce redundancy while improving reconstruction quality over fixed-rate patch tokenization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16383","ref_index":3,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"A neurosymbolic Approach with Epistemic Deep Learning for Hierarchical Image Classification","primary_cat":"cs.CV","submitted_at":"2026-05-11T09:43:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A neurosymbolic model augments Swin Transformers with focal sets and fuzzy logic to produce calibrated hierarchical image classifications that respect logical constraints.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Given these focal-set families, the model predicts belief values ˆBel f (A)and ˆBel c (B)for allA∈O f andB∈O c using sigmoid activations on the output logitsy f A, yc B: ˆBel f (A) =σ(y f A), ˆBel c (B) =σ(y c B), σ(x) = 1 1+e−x .(2) Mass functions are then obtained through the restricted M¨obius inversion used in RS-NN: mf(A) = X B⊆A B∈O f (−1)|A|−|B| ˆBel f (B), m c(B) = X D⊆B D∈Oc (−1)|B|−|D| ˆBel c (D).(3) with soft regularisation encouraging non-negativity and normalisation of the predicted masses. Mass validity is encouraged through standard RS-NN penalties: Rf mass = X A∈Of max(0,−m f(A)),R c mass = X B∈O c max(0,−m c(B)),(4) Rf sum = max  0, X A∈Of mf(A)−1   ,R c sum = max 0, X B∈O c mc(B)−1 ! ,(5) with remaining mass placed on the whole collection of labels, respectivelyYf andY c."},{"citing_arxiv_id":"2605.10203","ref_index":55,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Polyphonia: Zero-Shot Timbre Transfer in Polyphonic Music with Acoustic-Informed Attention Calibration","primary_cat":"cs.SD","submitted_at":"2026-05-11T08:49:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Polyphonia improves zero-shot stem-specific timbre transfer in polyphonic music by 15.5% target alignment via acoustic-informed attention calibration that uses probabilistic priors to set coarse boundaries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10183","ref_index":53,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Fix the Loss, Not the Radius: Rethinking the Adversarial Perturbation of Sharpness-Aware Minimization","primary_cat":"cs.LG","submitted_at":"2026-05-11T08:34:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LE-SAM inverts SAM by fixing the loss budget instead of the parameter-space radius, yielding better generalization across benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09472","ref_index":33,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Positional LSH: Binary Block Matrix Approximation for Attention with Linear Biases","primary_cat":"cs.LG","submitted_at":"2026-05-10T10:58:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ALiBi bias is the expectation of positional LSH-induced block masks, yielding spectral and max-norm approximation bounds that reduce long-context biased attention to randomized short-context unbiased attention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08935","ref_index":183,"ref_count":2,"confidence":0.35,"is_internal_anchor":false,"paper_title":"PnP-Corrector: A Universal Correction Framework for Coupled Spatiotemporal Forecasting","primary_cat":"cs.AI","submitted_at":"2026-05-09T13:12:33+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08574","ref_index":58,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Post-hoc Selective Classification for Reliable Synthetic Image Detection","primary_cat":"cs.CV","submitted_at":"2026-05-09T00:25:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReSIDe generalizes logit-based confidence scores to intermediate layers of synthetic image detectors and uses preference optimization to aggregate them, cutting area under the risk-coverage curve by up to 69.55% under covariate shifts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08505","ref_index":29,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Scaling Limits of Long-Context Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-08T21:39:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"For uniform keys on the d-dimensional sphere, softmax attention becomes selective at inverse temperature scaling β_n* ≍ n^{2/(d-1)}, with explicit limiting laws for attention weights and outputs in each regime.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06380","ref_index":16,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Empirical Evidence for Simply Connected Decision Regions in Image Classifiers","primary_cat":"cs.CV","submitted_at":"2026-05-07T14:59:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Empirical tests with quad-mesh filling indicate that decision regions in modern image classifiers are simply connected.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05668","ref_index":71,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Large Vision-Language Models Get Lost in Attention","primary_cat":"cs.AI","submitted_at":"2026-05-07T04:45:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"In LVLMs, attention can be replaced by random Gaussian weights with little or no performance loss, indicating that current models get lost in attention rather than efficiently using visual context.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05026","ref_index":74,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Local Intrinsic Dimension Unveils Hallucinations in Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-05-06T15:22:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Hallucinations in diffusion models are driven by local intrinsic dimension instabilities on the manifold, which Intrinsic Quenching corrects by deflating it.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04682","ref_index":17,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"HEXST: Hexagonal Shifted-Window Transformer for Spatial Transcriptomics Gene Expression Prediction","primary_cat":"cs.LG","submitted_at":"2026-05-06T09:34:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HEXST applies a hexagonal shifted-window Transformer with rotary positional encodings, contrast-sensitive training objectives, and single-cell priors to predict gene expression from histology slides, outperforming prior models on seven datasets while preserving spatial heterogeneity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01466","ref_index":24,"ref_count":2,"confidence":0.35,"is_internal_anchor":false,"paper_title":"SplAttN: Bridging 2D and 3D with Gaussian Soft Splatting and Attention for Point Cloud Completion","primary_cat":"cs.CV","submitted_at":"2026-05-02T14:34:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SplAttN uses Gaussian soft splatting and attention to avoid sparse projection collapse in point cloud completion, achieving SOTA results and demonstrating genuine visual cue reliance on KITTI.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19724","ref_index":32,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Benign Overfitting in Adversarial Training for Vision Transformers","primary_cat":"cs.LG","submitted_at":"2026-04-21T17:48:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Adversarial training on simplified Vision Transformers achieves benign overfitting with near-zero robust loss and generalization error when signal-to-noise ratio and perturbation budget meet specific conditions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19028","ref_index":200,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Learning Posterior Predictive Distributions for Node Classification from Synthetic Graph Priors","primary_cat":"cs.LG","submitted_at":"2026-04-21T03:23:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NodePFN pre-trains on synthetic graphs with controllable homophily and causal feature-label models to achieve 71.27 average accuracy on 23 node classification benchmarks without graph-specific training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18452","ref_index":37,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"ESsEN: Training Compact Discriminative Vision-Language Transformers in a Low-Resource Setting","primary_cat":"cs.CV","submitted_at":"2026-04-20T16:10:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ESsEN is a parameter-efficient two-tower vision-language transformer that matches larger models on discriminative tasks after training end-to-end with limited data and resources.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.12508","ref_index":128,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"SalUn: Empowering Machine Unlearning via Gradient-based Weight Saliency in Both Image Classification and Generation","primary_cat":"cs.LG","submitted_at":"2023-10-19T06:17:17+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SalUn uses gradient-based weight saliency to achieve effective machine unlearning of data, classes, or concepts in image classification and generation, narrowing the gap to exact retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}