{"total":19,"items":[{"citing_arxiv_id":"2606.09746","ref_index":31,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Hybrid Robustness Verification for Spatio-Temporal Neural Networks","primary_cat":"cs.CV","submitted_at":"2026-06-08T17:06:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"STBP computes exact closed-form bounds for the first convolutional layer of spatio-temporal networks and propagates scalable approximations through the rest to certify robustness under subset-frame or patch perturbations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09246","ref_index":22,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SOMA: From Surface Observations to Muscle Anatomy","primary_cat":"cs.CV","submitted_at":"2026-06-08T09:20:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SOMA recovers spatio-temporal muscle behavior from multi-view RGB surface data and introduces the SKIM soft-tissue deformation dataset as the first such method from RGB observations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09081","ref_index":63,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Edge-Constrained UAV Small-Object Detection with P2 Enhancement and Quantum-Inspired Lightweight Structure Search","primary_cat":"cs.CV","submitted_at":"2026-06-08T06:27:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Adding a P2 branch to YOLOX-Nano raises small-object AP by 31.10% on VisDrone; QIEA screens structures balancing accuracy, FLOPs, latency, memory and recall.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03490","ref_index":76,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"TrAction: Action Recognition with Sparse Trajectories","primary_cat":"cs.CV","submitted_at":"2026-06-02T11:07:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Sparse 2.5D trajectory transformers with masked pretraining reach 45% top-1 on Something-Something V2 and 54% on EPIC-Kitchens while improving fusion with DINOv2 and V-JEPA by up to 8.7 points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02569","ref_index":28,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"AdaCodec: A Predictive Visual Code for Video MLLMs","primary_cat":"cs.CV","submitted_at":"2026-06-01T17:56:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AdaCodec introduces a predictive visual code that cuts visual token use in video MLLMs by sending full frames only on high predictive cost and otherwise encoding inter-frame changes as P-tokens, yielding better benchmark scores at lower budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23458","ref_index":46,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"One-Forcing: Towards Stable One-Step Autoregressive Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-22T10:16:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"One-Forcing augments DMD with a GAN loss to enable stable one-step causal autoregressive video generation, reporting a VBench score of 83.76 as SOTA among one-step methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19091","ref_index":8,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Chessformer: A Unified Architecture for Chess Modeling","primary_cat":"cs.LG","submitted_at":"2026-05-18T20:27:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Chessformer is a unified encoder-only transformer for chess that uses square tokens, geometric attention bias, and an attention-based policy head to set new records in human move prediction accuracy, playing strength, and interpretability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17160","ref_index":17,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"When Bits Break Recourse: Counterfactual-Faithful Quantization","primary_cat":"cs.LG","submitted_at":"2026-05-16T21:19:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CFQ trains quantizer parameters and mixed-precision allocation to preserve counterfactual recourse validity, cost, and direction on Adult, German Credit, and COMPAS while matching accuracy of standard quantizers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14145","ref_index":37,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Rethinking the Good Enough Embedding for Easy Few-Shot Learning","primary_cat":"cs.CV","submitted_at":"2026-05-13T21:52:05+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Frozen DINOv2-L features with k-NN classification and PCA/ICA refinement achieve state-of-the-art few-shot performance on four benchmarks without any backpropagation or fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11530","ref_index":25,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Multi-Narrow Transformation as a Single-Model Ensemble: Boundary Conditions, Mechanisms, and Failure Modes","primary_cat":"cs.LG","submitted_at":"2026-05-12T04:54:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Multi-narrow single-model ensembles outperform wide baselines in low-data image classification by learning diverse features but underperform in data-rich settings where training favors few paths.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"that SW and high-MN differ not only in capacity allocation but also in optimization sensitivity. 4.3.3. Robustness across architectures and datasets We next examine whether the observed data-regime dependenceisspecifictoaparticularmodelordataset.First, Fig.4showstherobustnessofthetrendacrossarchitectures, including ConvNeXt-Tiny [17], EfficientNet-B0 [25], Mo- bileNetV2 [23], RegNetY-400MF [22], and Wide ResNet- 50 [31]. Althoughtheexacttransitionpointvariesacrossmodels, manyarchitecturesexhibitthesameoveralltendency:higher MN strength becomes preferable as the amount of training data decreases. In particular, below IPC=100, high-MN configurations are often the best-performing or among the best-performing models."},{"citing_arxiv_id":"2605.06127","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Continuous Expert Assembly: Instance-Conditioned Low-Rank Residuals for All-in-One Image Restoration","primary_cat":"cs.CV","submitted_at":"2026-05-07T12:31:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CEA assembles per-token low-rank residual updates via dense affinities over hyper-adapter-generated components to improve all-in-one image restoration on spatially non-uniform degradations.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"arXiv preprint arXiv:2306.13653, 2023. doi: 10.48550/ARXIV .2306.13653. [34] Ben Mildenhall, Jonathan T. Barron, Jiawen Chen, Dillon Sharlet, Ren Ng, and Robert Carroll. Burst denoising with kernel prediction networks. In2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pages 2502-2510, 2018. doi: 10.1109/CVPR.2018. 00265. [35] Chong Mou, Qian Wang, and Jian Zhang. Deep generalized unfolding networks for image restoration. In2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2022. doi: 10.1109/CVPR52688.2022.01688. [36] Ozan Özdenizci and Robert Legenstein. Restoring vision in adverse weather conditions with patch-based denoising diffusion models."},{"citing_arxiv_id":"2605.02144","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Projection-Free Transformers via Gaussian Kernel Attention","primary_cat":"cs.LG","submitted_at":"2026-05-04T01:57:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Gaussian Kernel Attention replaces learned QKV projections with a Gaussian RBF kernel on per-head token features, using 0.42x parameters and 0.49x FLOPs while showing competitive language modeling performance at depth 20.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27105","ref_index":9,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Automated Detection of Mutual Gaze and Joint Attention in Dual-Camera Settings via Dual-Stream Transformers","primary_cat":"cs.CV","submitted_at":"2026-04-29T18:49:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A dual-stream Transformer using frozen GazeLLE backbones and custom token fusion detects mutual gaze and joint attention from dual-camera recordings, outperforming CNN baselines and a multimodal LLM on caregiver-infant data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25884","ref_index":23,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"QCalEval: Benchmarking Vision-Language Models for Quantum Calibration Plot Understanding","primary_cat":"quant-ph","submitted_at":"2026-04-28T17:28:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces QCalEval benchmark showing best zero-shot VLM score of 72.3 on quantum calibration plots, with fine-tuning and in-context learning effects varying by model type.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"multimodal in-context learning (MM-ICL): VL-ICL Bench [17] and recent analyses [18, 19] show that the effectiveness of in-context learning demonstrations is fragile and highly sensitive to prompt construction. Chart Understanding and Scientific Figures.Chart reasoning benchmarks evolved from Fig- ureQA [20] and DVQA [21] to PlotQA [22] and ChartQA [23], revealing that chart reasoning depends on OCR, numerical grounding, and structural relations [24]. Chart-specific models in- clude ChartOCR [25], DePlot [26], UniChart [27], ChartLlama [28], ChartInstruct [29], and Chart- Gemma [30], though evaluations show VLMs remain error-prone on scientific figures [31, 32]. Re- lated scientific-figure resources, such as SciCap [33] and Multimodal ArXiv [34], broaden this line"},{"citing_arxiv_id":"2604.25855","ref_index":22,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SIEVES: Selective Prediction Generalizes through Visual Evidence Scoring","primary_cat":"cs.CV","submitted_at":"2026-04-28T16:57:29+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SIEVES improves selective prediction coverage by up to 3x on OOD VQA benchmarks by training a selector to score the quality of visual evidence produced by reasoner models, generalizing across benchmarks and proprietary models without internal access or per-task retraining.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Rodriguez and M. Rohrbach Benchmark #Q Format Avg. res. Domain V* Bench [46] 191 MC 2246×1582 Natural images: Attribute recognition, relative position HR-Bench-8k [43] 800 MC 7680×4320 Attribute recognition, OCR, map & chart analysis, spatial reasoning MME-RW-L [52] 1,782 MC 2000×1500 Remote sensing, autonomous driving, monitoring, diagrams & tables, OCR VizWiz [22] 4,319 OE 1224×1224 Blind-user mobile photos: yes/no, counting, other (33% unanswerable) AdVQA [35] 10,000 OE 640×480 Adversarially crafted: counting, OCR, rare concepts, reasoning Table 1: Out-Of-Distribution (OOD) Benchmarks.We evaluate SIEVES on benchmarks of varying sizes (#Q), using Multiple-Choice (MC) and Open-Ended (OE) formats, high and lower resolution images, and very diverse domains."},{"citing_arxiv_id":"2512.23221","ref_index":8,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Holi-DETR: Holistic Fashion Item Detection Leveraging Contextual Information","primary_cat":"cs.CV","submitted_at":"2025-12-29T05:55:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Holi-DETR improves fashion item detection by integrating co-occurrence probabilities, inter-item spatial arrangements, and body keypoint relationships into the DETR architecture.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.05342","ref_index":52,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Delta Rectified Flow Sampling for Text-to-Image Editing","primary_cat":"cs.CV","submitted_at":"2025-09-01T21:51:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DRFS is a new inversion-free editing technique for rectified flow models that models source-target velocity discrepancies and applies a time-dependent shift to improve fidelity and unify prior methods like DDS and FlowEdit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.23323","ref_index":32,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"FA-Seg: A Fast and Accurate Diffusion-Based Method for Open-Vocabulary Segmentation","primary_cat":"cs.CV","submitted_at":"2025-06-29T16:41:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FA-Seg delivers state-of-the-art training-free open-vocabulary segmentation performance (43.8% mIoU average) on standard benchmarks by extracting and refining attention from a single forward pass of a pretrained diffusion model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.05679","ref_index":32,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Event-based Civil Infrastructure Visual Defect Detection: ev-CIVIL Dataset and Benchmark","primary_cat":"cs.CV","submitted_at":"2025-04-08T04:44:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Presents the ev-CIVIL dataset and benchmark showing that event-based cameras can support real-time detection of cracks and spalling in civil infrastructure under challenging lighting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}