{"total":24,"items":[{"citing_arxiv_id":"2606.26295","ref_index":46,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Beyond Aesthetics: Quantifying Information Loss in Turbid Scenes","primary_cat":"cs.CV","submitted_at":"2026-06-24T18:40:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces the TUB dataset of 1320 real turbid underwater images and PCD metric showing strong correlation with instance segmentation performance where standard metrics fail.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24449","ref_index":71,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"SENTRY: SAM2-Enhanced Neighbor-Aware and Temporally Reasoned Memory for Visual Tracking","primary_cat":"cs.CV","submitted_at":"2026-06-23T11:35:15+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SENTRY is a plug-and-play module that replaces confidence-based memory writes with neighbor-aware cycle-consistent validation in SAM2 trackers, yielding new zero-shot SOTA results on LaSOT, GOT-10k and other benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09076","ref_index":50,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Beyond Scalar Rewards by Internalizing Reasoning into Score Distributions","primary_cat":"cs.CV","submitted_at":"2026-06-08T06:20:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Z-Reward trains a 27B reasoning teacher VLM on score distributions via GDSO and distills it via RISD into a 9B student, reaching 89.6% and 88.6% human preference accuracy with 41.3% optimization gain over SFT baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08415","ref_index":3,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"CoVEBench: Can Video Editing Models Handle Complex Instructions?","primary_cat":"cs.CV","submitted_at":"2026-06-07T02:29:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoVEBench is a new benchmark showing that existing text-guided video editing models frequently fail on compositional instructions involving simultaneous subject, action, and camera changes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04857","ref_index":23,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Rethinking Incompleteness: Formalizing Protocol Divergence and Train-Once Learning for Robust IMVC","primary_cat":"cs.LG","submitted_at":"2026-06-03T13:24:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Formalizes incompleteness divergence across missing-data protocols in IMVC and proposes CRAFT, a mask-aware transformer enabling train-once robustness to diverse missing patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07658","ref_index":37,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"What neurosurgeons need to see: synthetic intra-operative MRI from ultrasound for brain-shift compensation in brain tumour surgery","primary_cat":"cs.CV","submitted_at":"2026-06-03T11:17:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"End-to-end pipeline uses ResViT-2.5D to synthesize post-resection MRI from ioUS then anchors deformable registration, yielding 5.86 mm TRE on 14 ReMIND subjects while producing an integrated whole-brain volume reflecting intraoperative state.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00122","ref_index":25,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Mathematical framework for perception-driven parameter choice in image denoising","primary_cat":"eess.IV","submitted_at":"2026-05-28T09:31:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Authors create psychometrically scaled image sets from human tests on denoised photos and provide a HaarPSI threshold for choosing denoising parameters based on perceived similarity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22192","ref_index":14,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Ultra-High-Definition Image Quality Assessment via Graph Representation Learning","primary_cat":"cs.CV","submitted_at":"2026-05-21T08:57:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UHD-GCN-BIQA models structural dependencies among sampled patches via a hybrid kNN graph and residual graph convolutions to achieve competitive PLCC and SRCC with the lowest RMSE on the UHD-IQA benchmark for blind ultra-high-definition image quality assessment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21301","ref_index":22,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Automatic Discovery of Disease Subgroups by Contrasting with Healthy Controls","primary_cat":"cs.LG","submitted_at":"2026-05-20T15:31:16+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Deep UCSL uses a contrastive EM loss on patient-control labels to isolate disease-driven subgroups in medical imaging by suppressing shared healthy variability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03885","ref_index":19,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Raising the Ceiling: Better Empirical Fixation Densities for Saliency Benchmarking","primary_cat":"cs.CV","submitted_at":"2026-05-05T15:45:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A mixture model with adaptive KDE and per-image cross-validation raises estimated human fixation consistency by 5-15% median log-likelihood and up to 2 AUC points over fixed-bandwidth Gaussian baselines.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Incontrast,foralsocommonlyusedmap-comparisonmetrics(e.g.,CC,KL- divergence), the empirical density is itself the target map; changing the density changes model scores and can therefore affect rankings and scientific conclusions about which models are better [38]. This dependence becomes more consequential as the field shifts from ag- gregate leaderboard comparisons towardsample-levelevaluation [19]: per-image analyses [54], inverse benchmarking [14] (finding stimuli of maximal model er- ror), and fine-grained failure mode characterization [9]. These settings require reliable per-image gold standards. If density estimates vary substantially in qual- ity across images, then sample-level conclusions can become unstable precisely for the images where detailed analysis matters most."},{"citing_arxiv_id":"2605.01283","ref_index":107,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Developing a Strong Pre-Trained Base Model for Plant Leaf Disease Classification","primary_cat":"cs.CV","submitted_at":"2026-05-02T06:33:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A DenseNet201 base model trained on a constructed plant leaf disease dataset outperforms baselines and enables faster, more robust transfer learning with less data than general models.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"As is, the dataset contains a total of 3 classes, all diseases, after removing the blossoms and fruits, as can be seen in Table A.13. Images are all taken in ﬁ eld (see Figure A.26), resulting in a rather well balanced dataset (see Figure A.25). This data has been utilized in a number of studies (e.g. [105, 106]). 4.3 Hybrid Datasets 4.3.1 plantDoc The plantDoc dataset [15, 107] is a multi-plant dataset that features a wide variety of plant across 28 total classes (see Table A.14). It includes images are taken in ﬁ eld, but when looking at ﬁ gure A.28, it becomes clear that the dataset also includes images that realistically should have been excluded from the dataset (e.g. PowerPoint slides). Class distribution is not too bad,"},{"citing_arxiv_id":"2604.27364","ref_index":53,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Hyperspectral Image Classification via Efficient Global Spectral Supertoken Clustering","primary_cat":"cs.CV","submitted_at":"2026-04-30T03:20:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DSCC groups spectrally similar and spatially close pixels into supertokens using multi-criteria distance and soft labels, then classifies at the token level to achieve 0.728 CF1 at 197.75 FPS on WHU-OHS.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19025","ref_index":42,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"RoomRecon: High-Quality Textured Room Layout Reconstruction on Mobile Devices","primary_cat":"cs.RO","submitted_at":"2026-04-21T03:20:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"RoomRecon delivers a real-time mobile system for high-quality textured 3D room reconstructions that combines AR-guided imaging with generative AI texturing focused on permanent structures and claims to outperform prior methods in quality and speed.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"This evaluation is referred to aslocal texture quality assessmentbecause it examines only the specific, limited areas of the scene captured by the purposely curated GT. Consequently, the evaluation focuses on partial areas, whether it be portions of a sin- gle plane or encompassing multiple planes. Masked GT images and masked rendered images are compared for four metrics, namely PSNR [42], SSIM [39], Shift Tolerant Learned Perceptual Image Patch Similarity (ST-LPIPS) [19], and Blurriness [11]. These met- rics are selected, as done by prior work [15, 45], to ascertain the similarity of textured rendered outputs to GT images (PSNR, SSIM, and ST-LPIPS), and to evaluate the sharpness and vividness of the textured outputs using the Blurriness measure, which is GT-"},{"citing_arxiv_id":"2604.07427","ref_index":50,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Personalizing Text-to-Image Generation to Individual Taste","primary_cat":"cs.CV","submitted_at":"2026-04-08T17:35:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PAMELA provides a multi-user rating dataset and personalized reward model that predicts individual image preferences more accurately than prior population-level aesthetic models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Score(absolute ratings) orPairwise(comparative preferences of 2 images) orRanking(ordinal ranking of multiple images). †Note that Pick-a-Pic v2 includes user IDs but was not designed for per-user analysis; per-user splits must be reconstructed post hoc. Dataset Y ear Label # Ratings # Images # Users Ratings per Image User-Level Labels Subjective Domains Image Source Classical IQA Datasets AVA [39] 2012 Score 255K 255K∼25K∼200✗ ✗Real photos LIVE [50] 2006 Score 779 779 29 1✗ ✗Real photos KADID-10K [32] 2019 Score 30K 10.1K 25 3✗ ✗Distorted AI-Generated IQA Datasets SAC [43] 2022 Score 238K 238K Crowd∼1✗ ✗AI-generated AGIQA-3K [29] 2023 Score 125,244 2,982 21 2✗ ✗AI-generated T2I Human Preference Datasets HPD v1 [57] 2023 Pairwise 98,807 98,807 2,659 1✗ ✗Stable Diff. ImageRewardDB [58] 2023 Pairwise 137K∼100K Expert 1✗ ✗Multi-model"},{"citing_arxiv_id":"2604.09704","ref_index":1,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Multi-Granularity Reasoning for Image Quality Assessment via Attribute-Aware Reinforcement Learning to Rank","primary_cat":"cs.CV","submitted_at":"2026-04-07T16:07:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MG-IQA trains vision-language models with attribute-aware RL2R and a multi-dimensional Thurstone reward model to jointly predict overall quality and fine-grained attributes, reporting 2.1% average SRCC gains on eight IQA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05375","ref_index":6,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"DAT: Dual-Aware Adaptive Transmission for Efficient Multimodal LLM Inference in Edge-Cloud Systems","primary_cat":"cs.MM","submitted_at":"2026-04-07T03:21:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DAT combines a small-large model cascade with fine-tuning and bandwidth-aware multi-stream transmission to deliver high-accuracy event recognition and low-latency alerts for video streams in edge-cloud systems.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"than delivering task-oriented multimodal outputs after collabora- tive inference. In contrast, machine vision systems prioritize seman- tically important information related to detection, recognition, and structured understanding. VCM argues that transmission should be organized around machine-task-relevant information rather than reconstructable pixels alone [6], and AITransfer suggests jointly considering content importance and network dynamics [49]. Nev- ertheless, few existing solutions provide a unified transmission ap- proach that simultaneously supports both task-oriented semantic understanding and the preservation of rich visual evidence required for downstream analysis. 3 Design of DAT Architecture"},{"citing_arxiv_id":"2509.22414","ref_index":16,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"LucidFlux: Caption-Free Photo-Realistic Image Restoration via a Large-Scale Diffusion Transformer","primary_cat":"cs.CV","submitted_at":"2025-09-26T14:39:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LucidFlux is a caption-free image restoration method that conditions a Flux.1 diffusion transformer with a dual-branch module from the degraded input and a proxy restoration plus SigLIP semantic features to outperform baselines on synthetic and real-world data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.20765","ref_index":233,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Looking Beyond the Obvious: A Survey on Abstract Concept Recognition for Video Understanding","primary_cat":"cs.CV","submitted_at":"2025-08-28T13:19:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A literature survey on abstract concept recognition in videos that catalogs prior tasks and datasets while advocating for foundation models and reuse of decades of community experience.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"will highlight existing studies in the image domain here. We hope that future work also addresses this in the video domain. In [230], the authors propose a method to generate persuasive story- lines for video ads using the Wundt curve [231] to model persuasiveness. They quantify information (via structural dissimilarity [232]), attractiveness (using NIMA [233]), and emotion (arousal via MobileNet [234] trained on [235]). Instead of pro- cessing video directly, image-based models extract these features to learn a simple function based on the Wundt curve. This approach shows how abstract concepts can be modeled and transferred from images to video, even in a low-data regime. Images and videos shared on different media"},{"citing_arxiv_id":"2409.06406","ref_index":4,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Monitoring road infrastructures from satellite images in Greater Maputo","primary_cat":"stat.AP","submitted_at":"2024-09-10T10:41:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Object-oriented RGB pixel distribution analysis from satellite images classifies paved versus unpaved roads in Greater Maputo.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2307.03017","ref_index":42,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"RealLiFe: Real-Time Light Field Reconstruction via Hierarchical Sparse Gradient Descent","primary_cat":"cs.CV","submitted_at":"2023-07-06T14:31:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RealLiFe optimizes multi-plane images with HSGD to deliver real-time light field reconstruction from sparse views, claiming 100x speedup over offline methods and 2 dB PSNR gain over online ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2204.12349","ref_index":18,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Error Correction for Discrete Tomography","primary_cat":"math.CO","submitted_at":"2022-04-26T14:41:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fewer than d/2 errors in line sums can be corrected in discrete tomography, with the bound shown to be optimal.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1907.11762","ref_index":72,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Multivariate Pointwise Information-Driven Data Sampling and Visualization","primary_cat":"cs.HC","submitted_at":"2019-07-26T19:32:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A pointwise multivariate information-driven sampling method generates reduced datasets that preserve statistical associations among variables for effective feature queries and analysis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1907.09236","ref_index":67,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"RGB-D image-based Object Detection: from Traditional Methods to Deep Learning Techniques","primary_cat":"cs.CV","submitted_at":"2019-07-22T11:18:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A survey of RGB-D object detection from traditional hand-crafted features with machine learning to deep learning techniques.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1907.04983","ref_index":29,"ref_count":1,"confidence":0.5,"is_internal_anchor":false,"paper_title":"Aesthetic Attributes Assessment of Images","primary_cat":"cs.CV","submitted_at":"2019-07-11T03:25:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper proposes the Aesthetic Multi-Attribute Network (AMAN) that jointly predicts captions and scores for five aesthetic attributes using a new weakly-labeled dataset created via knowledge transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}