{"total":46,"items":[{"citing_arxiv_id":"2605.21466","ref_index":60,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"StreamGVE: Training-Free Video Editing via Few-Step Streaming Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-20T17:52:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StreamGVE enables high-quality training-free video editing by converting the task to noise-to-data streaming generation with dual-branch fast sampling, self-attention bridges, cross-attention grounding, source-oriented guidance, and visual prompting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21381","ref_index":62,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Disentangling Generation and Regression in Stochastic Interpolants for Controllable Image Restoration","primary_cat":"cs.CV","submitted_at":"2026-05-20T16:41:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DiSI disentangles stochastic interpolants into separate generation and regression paths, allowing controllable transitions between regression and generative image restoration with a unified few-step sampler.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17980","ref_index":26,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Learning to Balance: Decoupled Siamese Diffusion Transformer for Reference-Based Remote Sensing Image Super-Resolution","primary_cat":"cs.CV","submitted_at":"2026-05-18T07:35:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DS-DiT decouples low-resolution and reference interactions in a siamese diffusion transformer and adds a patch-level weights module plus autoguidance to improve reference-based super-resolution for remote sensing images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17087","ref_index":33,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"The Learnability Gap in Medical Latent Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-16T17:07:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pretrained autoencoders in medical latent diffusion encode discriminative features well for reconstruction but structure their latent spaces in ways that hinder classifier learning, a gap that persists across architectures and is not closed by domain fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16515","ref_index":43,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"SeamCam: Quantifying Seamless Camouflage via Multi-Cue Visual Detectability","primary_cat":"cs.CV","submitted_at":"2026-05-15T18:08:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SeamCam quantifies camouflage by computing one minus the highest IoU recoverable from category-conditioned detection proposals against a ground-truth mask, achieving 78.82% agreement with human judgments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15181","ref_index":37,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"From Plans to Pixels: Learning to Plan and Orchestrate for Open-Ended Image Editing","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:58:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A planner-orchestrator system learns long-horizon image editing by maximizing outcome-based rewards from a vision-language judge and refining plans from successful trajectories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14487","ref_index":42,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Head Forcing: Long Autoregressive Video Generation via Head Heterogeneity","primary_cat":"cs.CV","submitted_at":"2026-05-14T07:27:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Head Forcing assigns tailored KV cache strategies to local, anchor, and memory attention heads plus head-wise RoPE re-encoding to extend autoregressive video generation from seconds to minutes without training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11444","ref_index":48,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Leveraging Multimodal Large Language Models for All-in-One Image Restoration via a Mixture of Frequency Experts","primary_cat":"cs.CV","submitted_at":"2026-05-12T02:55:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"An MLLM-guided architecture with a mixture of frequency experts and relational alignment loss achieves state-of-the-art all-in-one image restoration, outperforming prior methods by up to 1.35 dB on the CDD11 dataset.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Supplemental Material A Detailed Experimental Settings A.1 Network Architecture We adopt Restormer [67] as our backbone architecture, where the number of Transformer blocks from level-1 to level-4 is configured as[4,6,6,8], respectively. The multi-dconv transposed attention employs attention heads of[1,2,4,8]with corresponding channel dimensions of[48,96,192,384]. In the refinement stage, we utilize 4 blocks, and the gated-dconv feed-forward network operates with a channel expansion factor ofγ= 2.66. The MoFE module is integrated after the downsampling operations and before the upsampling layers. Additionally, the MGFB is positioned before the Transformer blocks in the encoder stages and after the Transformer blocks in the decoder stages."},{"citing_arxiv_id":"2604.27590","ref_index":44,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Fake3DGS: A Benchmark for 3D Manipulation Detection in Neural Rendering","primary_cat":"cs.CV","submitted_at":"2026-04-30T08:41:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Fake3DGS benchmark shows state-of-the-art 2D fake detectors fail on 3D-manipulated Gaussian Splatting images while a new multi-view coherence method improves detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26232","ref_index":23,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DepthPilot: From Controllability to Interpretability in Colonoscopy Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-29T02:24:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DepthPilot generates physically consistent and clinically interpretable colonoscopy videos by injecting depth priors into diffusion models through parameter-efficient fine-tuning and replacing linear denoising weights with adaptive splines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24575","ref_index":68,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Diffusion Model as a Generalist Segmentation Learner","primary_cat":"cs.CV","submitted_at":"2026-04-27T15:04:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DiGSeg repurposes diffusion U-Nets as generalist segmentation learners by conditioning on image-mask latents and multi-scale CLIP text features, achieving strong cross-domain performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21066","ref_index":25,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Optimizing Diffusion Priors in Image Reconstruction from a Single Observation","primary_cat":"cs.CV","submitted_at":"2026-04-22T20:18:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Combining diffusion priors as a product-of-experts and optimizing exponents via Bayesian evidence maximization enables prior tuning from one observation in inverse imaging problems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20730","ref_index":31,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Render-in-the-Loop: Vector Graphics Generation via Visual Self-Feedback","primary_cat":"cs.CV","submitted_at":"2026-04-22T16:15:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Render-in-the-Loop reformulates SVG generation as a step-wise visual-context-aware process using self-feedback from rendered intermediate states, VSF training, and RaV inference to outperform baselines on MMSVGBench for Text-to-SVG and Image-to-SVG.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20038","ref_index":37,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"FluSplat: Sparse-View 3D Editing without Test-Time Optimization","primary_cat":"cs.CV","submitted_at":"2026-04-21T22:45:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FluSplat trains a model with geometric alignment constraints on multi-view edits to produce consistent 3D scene edits from sparse views in a single forward pass without test-time optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19238","ref_index":32,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Allo{SR}$^2$: Rectifying One-Step Super-Resolution to Stay Real via Allomorphic Generative Flows","primary_cat":"cs.CV","submitted_at":"2026-04-21T08:44:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Allo{SR}^2 rectifies one-step super-resolution trajectories with allomorphic generative flows via SNR initialization, velocity supervision, and self-adversarial matching to deliver state-of-the-art fidelity and realism.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19009","ref_index":37,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Guiding Distribution Matching Distillation with Gradient-Based Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-21T02:57:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GDMD replaces raw-sample rewards with distillation-gradient rewards in RL-guided diffusion distillation, yielding 4-step models that surpass their multi-step teachers on GenEval and human preference metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18215","ref_index":37,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Memorize When Needed: Decoupled Memory Control for Spatially Consistent Long-Horizon Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-20T13:00:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A decoupled memory branch with hybrid cues, cross-attention, and gating improves spatial consistency and data efficiency in long-horizon camera-trajectory video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16114","ref_index":35,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Towards In-Context Tone Style Transfer with A Large-Scale Triplet Dataset","primary_cat":"cs.CV","submitted_at":"2026-04-17T14:49:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new 100k triplet dataset and in-context diffusion framework ICTone enable state-of-the-art tone style transfer by jointly conditioning on content and reference images with scorer-based reward learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13793","ref_index":40,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"From Synchrony to Sequence: Exo-to-Ego Generation via Interpolation","primary_cat":"cs.CV","submitted_at":"2026-04-15T12:32:25+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13509","ref_index":24,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DiT as Real-Time Rerenderer: Streaming Video Stylization with Autoregressive Diffusion Transformer","primary_cat":"cs.CV","submitted_at":"2026-04-15T05:52:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RTR-DiT distills a bidirectional DiT teacher into an autoregressive few-step model using Self Forcing and Distribution Matching Distillation, plus a reference-preserving KV cache, to enable stable real-time text- and reference-guided video stylization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11386","ref_index":39,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"ComSim: Building Scalable Real-World Robot Data Generation via Compositional Simulation","primary_cat":"cs.RO","submitted_at":"2026-04-13T12:25:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Compositional Simulation generates scalable real-world robot training data by combining classical simulation with neural simulation in a closed-loop real-sim-real augmentation pipeline.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"To validate the effectiveness of our proposed Neural Simulation in recovering real-world data distributions from simulation, we consider the following set of diverse comparative approaches: 1) Classical Simulation(Sim), denoting the canonical raw simulation pipeline without neural-driven refinement; 2) Baseline, a video-to-video generation model built on Stable Diffusion 1.5 [39] with temporal continuity post-processing [54]; 3) Zero-Shot, referring to the backbone model deployed without any sim-to-real fine-tuning; 4) Ours-CD, a variant of our proposed Neural Simulation framework, equipped with conditional generation capability guided solely by control dynamics for pseudo-realistic content synthesis; 5) Ours-VD, an alternative variant of our method, featuring"},{"citing_arxiv_id":"2604.11089","ref_index":51,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Structured State-Space Regularization for Generation-Friendly Image Tokenization","primary_cat":"cs.CV","submitted_at":"2026-04-13T07:10:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Structured state-space regularization induces spectral structure in image tokenizer latent spaces via an SSM-derived objective, improving generative performance with minimal reconstruction loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10789","ref_index":43,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"ReplicateAnyScene: Zero-Shot Video-to-3D Composition via Textual-Visual-Spatial Alignment","primary_cat":"cs.CV","submitted_at":"2026-04-12T19:42:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReplicateAnyScene performs fully automated zero-shot video-to-compositional-3D reconstruction by cascading alignments of generic priors from vision foundation models across textual, visual, and spatial dimensions.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"We then independently condition a diverse suite of generation models on multiple individual input views for the final selection of high-quality 3D assets. Finally, professional modelers manually place these assets against reference scene meshes to ensure accurate spatial layouts. occlusion, we apply generative completion [19] and super-resolution models [43] to restore missing details. We then manually select the highest-quality 3D as- set from these diverse candidates. Finally, professional 3D modelers manually align and place these selected assets against the reference scene meshes. This meticulous manual placement ensures physically accurate spatial locations and structural consistency in the final compositional layout."},{"citing_arxiv_id":"2604.10578","ref_index":35,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Rein3D: Reinforced 3D Indoor Scene Generation with Panoramic Video Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-04-12T10:55:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Rein3D generates photorealistic, globally consistent 3D indoor scenes by using a restore-and-refine process where radial panoramic videos are restored via diffusion models and then used to update a 3D Gaussian field.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"globally consistent 3D scenes that remain stable under large viewpoint changes. fundamentally an ill-posed problem: Simple texts or image inputs fail to provide a comprehensive representation of the entire 3D space. Consequently, inferring massive amounts of missing information for unseen areas while maintaining ge- ometric consistency remains a significant challenge. Deep generative models, particularly diffusion models [13,17,34,35,37], ad- dress this by leveraging strong 2D visual priors. However, standard image-based methods [23,56,60-62] often suffer from accumulated geometric errors. While strategies like explicit constraints or multi-view synthesis [7,40,66] alleviate this issue, they remain computationally intensive and operationally cumber- some. In contrast, video diffusion methods [3,11,51], especially Video-to-Video"},{"citing_arxiv_id":"2604.09999","ref_index":19,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"GIF: A Conditional Multimodal Generative Framework for IR Drop Imaging in Chip Layouts","primary_cat":"cs.CV","submitted_at":"2026-04-11T03:00:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GIF fuses geometrical image features and logical graph topology in a conditional diffusion model to generate high-quality IR drop images for chip layouts, outperforming prior ML methods on CircuitNet-N28 with SSIM 0.78, Pearson 0.95, PSNR 21.77, and NMAE 0.026.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09213","ref_index":28,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"SHIFT: Steering Hidden Intermediates in Flow Transformers","primary_cat":"cs.CV","submitted_at":"2026-04-10T11:07:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SHIFT learns and applies steering vectors to selected layers and timesteps in DiT models to suppress concepts, shift styles, or bias objects while keeping image quality and prompt adherence intact.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09100","ref_index":44,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Physically Grounded 3D Generative Reconstruction under Hand Occlusion using Proprioception and Multi-Contact Touch","primary_cat":"cs.CV","submitted_at":"2026-04-10T08:32:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A conditional diffusion model using proprioception and multi-contact touch produces metric-scale, physically consistent 3D object reconstructions under hand occlusion.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"sequent works explored representations such as point clouds [36,37,67], voxel 4 Gabriele M. Caddeo , Pasquale Marra , and Lorenzo Natake grids [29,38], Triplanes [6,48,72], or Gaussians mixtures [73], improving fidelity but frequently at high computational cost. To address efficiency, several recent methods generate shapes in a compact latent space [9,43,44,59,74], and rectified- flow formulations [1,32,33] have further accelerated sampling [68]. However, all these methods assume the objects are fully visible, without considering occlu- sions, limiting their use in real-world scenario. In parallel, a growing body of work studies partial observability, reconstructing or generating 3D shape from incomplete inputs [12,16]."},{"citing_arxiv_id":"2604.08716","ref_index":26,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"What Matters in Virtual Try-Off? Dual-UNet Diffusion Model For Garment Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-04-09T19:09:27+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A Dual-UNet diffusion model for virtual garment reconstruction from clothed images sets new benchmarks on VITON-HD and DressCode by optimizing Stable Diffusion variants, mask conditioning, and auxiliary losses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08500","ref_index":32,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Novel View Synthesis as Video Completion","primary_cat":"cs.CV","submitted_at":"2026-04-09T17:44:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Video diffusion models can be adapted into permutation-invariant generators for sparse novel view synthesis by treating the problem as video completion and removing temporal order cues.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"contrast, we explore video diffusion models as scalable priors for sparse-view NVS, leveraging cross-frame consistency from web-scale video pretraining and directly repurposing pretrained video backbones for geometric reasoning without relying on large curated multi-view datasets. VideoDiffusionModelsand3D-AwareVideoGeneration.Modern video diffusion architectures [39,46] typically adopt latent diffusion frameworks [32] 4 Q. Wu et al. with spatio-temporal VAEs [4] and transformer-based denoisers [28], achieving strong temporal consistency and scalability through web-scale video pretrain- ing. Beyond generic video generation, several works incorporate camera control or3D-awareconditioningintovideomodels.Onelineofresearch[7,31,48,54]first reconstructs a scene and then applies video diffusion to inpaint missing regions"},{"citing_arxiv_id":"2604.08301","ref_index":28,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"GroundingAnomaly: Spatially-Grounded Diffusion for Few-Shot Anomaly Synthesis","primary_cat":"cs.CV","submitted_at":"2026-04-09T14:34:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GroundingAnomaly uses a Spatial Conditioning Module and Gated Self-Attention in a frozen diffusion U-Net to synthesize spatially accurate few-shot anomalies, reaching SOTA on MVTec AD and VisA for detection, segmentation, and instance detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08063","ref_index":37,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"EEG2Vision: A Multimodal EEG-Based Framework for 2D Visual Reconstruction in Cognitive Neuroscience","primary_cat":"cs.CV","submitted_at":"2026-04-09T10:25:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EEG2Vision reconstructs images from EEG using diffusion models plus LLM-guided boosting, with reconstruction quality holding up reasonably as electrode count drops from 128 to 24 channels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07329","ref_index":27,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Distilling Photon-Counting CT into Routine Chest CT through Clinically Validated Degradation Modeling","primary_cat":"cs.CV","submitted_at":"2026-04-08T17:47:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SUMI distills photon-counting CT quality into routine chest CT by learning to reverse clinically validated acquisition degradations, yielding 15-20% gains in image metrics, better radiologist utility, and up to 15% higher lesion detection sensitivity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06989","ref_index":25,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Generative Phomosaic with Structure-Aligned and Personalized Diffusion","primary_cat":"cs.CV","submitted_at":"2026-04-08T12:06:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper presents the first generative photomosaic framework that synthesizes tiles via structure-aligned diffusion models and few-shot personalization instead of color-based matching from large tile collections.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06161","ref_index":74,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DiffHDR: Re-Exposing LDR Videos with Video Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-04-07T17:56:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DiffHDR converts LDR videos to HDR by formulating the task as generative radiance inpainting in a video diffusion model's latent space, using Log-Gamma encoding and synthesized training data to achieve better fidelity and stability than prior methods.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Several methods explicitly incorporate inpainting modules to hallucinate missing details in saturated re- gions [23,60,111]. However, when using limited-capacity generative models, the synthesized content often lacks realism or fine details. 2.3 Generative HDR Advancesingenerativemodeling,includingGANs[4,9,10,22,40,48-50,79,83,106] and diffusion models [3,16,31,34,39,67,74,88-90,96,102,105,107,108,112,113], have shown strong priors for image and video generation. Some approaches learn themappingfromLDRimagestoHDRusingonlyLDRvideos,withoutrequiring HDR supervision [5]. Similarly, GlowGAN [85] enables GAN-based HDR image generation by learning from the distribution of LDR content. Diffusion models, in particular, have demonstrated strong capability in gen-"},{"citing_arxiv_id":"2604.04646","ref_index":30,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Training-Free Refinement of Flow Matching with Divergence-based Sampling","primary_cat":"cs.CV","submitted_at":"2026-04-06T12:54:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Flow Divergence Sampler refines flow matching by computing velocity field divergence to correct ambiguous intermediate states during inference, improving fidelity in text-to-image and inverse problem tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04608","ref_index":22,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Beyond Semantics: Uncovering the Physics of Fakes via Universal Physical Descriptors for Cross-Modal Synthetic Detection","primary_cat":"cs.CV","submitted_at":"2026-04-06T11:50:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Five universal physical descriptors including Laplacian variance, Sobel statistics, and residual noise variance, when integrated as text encodings with CLIP, achieve up to 99.8% accuracy detecting synthetic images across GAN and diffusion model datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03462","ref_index":25,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"SpectralSplat: Appearance-Disentangled Feed-Forward Gaussian Splatting for Driving Scenes","primary_cat":"cs.CV","submitted_at":"2026-04-03T21:12:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SpectralSplat disentangles appearance from geometry in feed-forward 3D Gaussian Splatting by factoring color into base and adapted streams conditioned on DINOv2 embeddings, trained on paired data from a hybrid relighting pipeline.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Thisphysics-basedreference ˆImv v guaranteesglobalilluminationconsistencyacross views but lacks photorealistic high-frequency details (e.g. specularities, sky tex- tures), so we use it as a structural guidance signal for the generative stage. Generative Refinement via IC-Light.We refineˆImv v with IC-Light [48], a re- lighting diffusion model adapted from Stable Diffusion [25]. While IC-Light pro- duces photorealistic lighting effects, applying it independently per view breaks multi-view consistency. Frequency-Aware Latent Guidance.To reconcile 3D consistency with per- ceptual quality, we intervene in the DDIM [31] sampling trajectory via spectral decoupling. We decompose any VAE latentzinto low-frequency (structural) and high-frequency (textural) components using a Gaussian low-pass operator"},{"citing_arxiv_id":"2604.06061","ref_index":35,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"PromptEvolver: Prompt Inversion through Evolutionary Optimization in Natural-Language Space","primary_cat":"cs.LG","submitted_at":"2026-04-03T17:00:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PromptEvolver recovers high-fidelity natural language prompts for given images by evolving them via genetic algorithm guided by a vision-language model, outperforming prior methods on benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"language model to guide the evolution process. Importantly, it works on black-box generation models by requiring only image outputs. Finally, we evaluate PromptEvolver across multiple prompt inversion benchmarks and show that it consistently outperforms competing methods. Keywords:Prompt inversion·Text to image generation 1 Introduction Text-to-image (T2I) diffusion models [21,35,48] have transformed visual con- tent creation, enabling users to generate photorealistic images from natural- language prompts. Yet the quality of the generated image depends critically on the prompt, when even small changes in wording can produce dramatically dif- ferent outputs. In practice, users engage in extensive trial-and-error, manually crafting, adjusting, and iterating on prompts to achieve a desired visual result."},{"citing_arxiv_id":"2604.03156","ref_index":35,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"CAMEO: A Conditional and Quality-Aware Multi-Agent Image Editing Orchestrator","primary_cat":"cs.CV","submitted_at":"2026-04-03T16:27:02+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Subsequent works further enhance controllability and semantic alignment, including Prompt-to-Prompt [11], DiffEdit [7], Imagic [18], Plug-and-Play Diffusion Features [43], and ControlNet [59]. More recent approaches explore richer instruction interfaces and multimodal reasoning, such as MGIE [9] and GenArtist [46], while subject-driven and compositional editing are studied in DreamBooth [35], Blended Diffusion [1], SDEdit [25], and image translation methods such as Detail Fusion GAN [ 20]. Commercial systems such as Qwen Image Edit Plus, FLUX 2 Pro, Seedream 4.5, and Nano Banana Pro further demonstrate strong progress in controllability and fidelity. At the same time, recent studies reveal that modern multimodal and editing systems remain vulnerable to robustness, safety, and misinformation-related issues, highlighting the need for stronger"},{"citing_arxiv_id":"2604.02867","ref_index":23,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"HairOrbit: Multi-view Aware 3D Hair Modeling from Single Portraits","primary_cat":"cs.CV","submitted_at":"2026-04-03T08:35:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HairOrbit leverages video generation priors and a neural orientation extractor to achieve state-of-the-art strand-level 3D hair reconstruction from single-view portraits in visible and invisible regions.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"minimizing the forgetting of its pre-learned real-world hair priors, we fine-tune the model with LoRA applied to the Q, K, V, O projections and the first and last linear layers (FFN.0 and FFN.2) of each transformer block. We prepare multi-view renderings from carefully selected typical 3D hairstyles, covering a wide range of variations in length, curliness, and partition. Following diffusion- based [23] video generation models, we optimize the LoRA parameters using a standard noise prediction loss. Given a clean latentx0 and a timestept∼ U(1, T), a noisy latentxt is obtained by the forward diffusion process: xt = √¯αt x0 + √ 1−¯αt ϵ,ϵ∼ N(0,I). (1) where¯αt denotes the cumulative noise schedule. The modelϵθ(xt, t,c)predicts the noiseϵconditioned onc, which in our case corresponds to the latent repre-"},{"citing_arxiv_id":"2603.19538","ref_index":37,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MoCA3D: Monocular 3D Bounding Box Prediction in the Image Plane","primary_cat":"cs.CV","submitted_at":"2026-03-20T00:33:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MoCA3D formulates monocular 3D box prediction as dense pixel-space tasks using corner heatmaps and depth maps, with a new PAG metric for image-plane evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.15525","ref_index":26,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Clinically Aware Synthetic Image Generation for Concept Coverage in Chest X-ray Models","primary_cat":"cs.CV","submitted_at":"2026-03-16T16:48:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CARPA generates anatomically faithful synthetic chest X-rays with controlled clinical concept insertions and deletions to expand training coverage and improve model precision, calibration, and reliability on real benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.08090","ref_index":51,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DSH-Bench: A Difficulty- and Scenario-Aware Benchmark with Hierarchical Subject Taxonomy for Subject-Driven Text-to-Image Generation","primary_cat":"cs.CV","submitted_at":"2026-03-09T08:30:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DSH-Bench is a benchmark for subject-driven T2I generation that uses hierarchical taxonomy sampling, difficulty/scenario classification, and a new SICS metric showing 9.4% higher human correlation than prior measures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.10764","ref_index":38,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Dual-End Consistency Model","primary_cat":"cs.CV","submitted_at":"2026-02-11T11:51:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DE-CM reaches state-of-the-art one-step FID of 1.70 on ImageNet 256x256 by decomposing PF-ODE trajectories into three critical sub-trajectories and using flow matching plus N2N mapping for stability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.12406","ref_index":24,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Generalized SAM: Efficient Fine-Tuning of SAM for Variable Input Image Sizes","primary_cat":"cs.CV","submitted_at":"2024-08-22T13:58:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"GSAM applies random cropping to enable variable input sizes for efficient SAM fine-tuning, claiming lower compute with comparable or higher accuracy on varied datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.07519","ref_index":16,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"InstantID: Zero-shot Identity-Preserving Generation in Seconds","primary_cat":"cs.CV","submitted_at":"2024-01-15T07:50:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InstantID enables zero-shot identity-preserving image generation from one facial image via a novel IdentityNet that combines strong semantic and weak spatial conditioning with text prompts in diffusion models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}