{"total":49,"items":[{"citing_arxiv_id":"2605.13581","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HIR-ALIGN: Enhancing Hyperspectral Image Restoration via Diffusion-Based Data Generation","primary_cat":"cs.CV","submitted_at":"2026-05-13T14:14:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HIR-ALIGN augments limited target data for hyperspectral restoration by creating proxy clean images, synthesizing aligned HSIs with blur-robust diffusion and warp-based transfer, then finetuning models to lower target-domain risk.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12967","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ImageAttributionBench: How Far Are We from Generalizable Attribution?","primary_cat":"cs.CV","submitted_at":"2026-05-13T04:01:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ImageAttributionBench is a benchmark dataset demonstrating that state-of-the-art image attribution methods lack robustness to image degradation and fail to generalize to semantically disjoint domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10198","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Empty SPACE: Cross-Attention Sparsity for Concept Erasure in Diffusion Models","primary_cat":"cs.LG","submitted_at":"2026-05-11T08:46:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SPACE induces sparsity in cross-attention parameters via closed-form iterative updates to erase target concepts more effectively than dense baselines in large diffusion models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09296","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Micro-Defects Expose Macro-Fakes: Detecting AI-Generated Images via Local Distributional Shifts","primary_cat":"cs.CV","submitted_at":"2026-05-10T03:44:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MDMF detects AI-generated images by learning patch-level forensic signatures and quantifying their distributional discrepancies with MMD, yielding larger separation than global methods when micro-defects are present.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09003","ref_index":34,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FlashClear: Ultra-Fast Image Content Removal via Efficient Step Distillation and Feature Caching","primary_cat":"cs.CV","submitted_at":"2026-05-09T15:39:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FlashClear delivers up to 122x faster object removal than prior diffusion models via adversarial step distillation and asymmetric attention caching while preserving visual quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06143","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AI-Generated Images: What Humans and Machines See When They Look at the Same Image","primary_cat":"cs.CV","submitted_at":"2026-05-07T12:40:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Researchers train AI detectors on a large photorealistic fake image dataset, apply 16 XAI methods, and use human survey feedback to assess alignment between machine explanations and human perception of AI-generated images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04590","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Diffusion to Rectified Flow: Rethinking Text-Based Segmentation","primary_cat":"cs.CV","submitted_at":"2026-05-06T07:40:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RLFSeg repurposes pretrained generative models via Rectified Flow for direct latent-space image-to-mask mapping in text-based segmentation, outperforming diffusion-based methods especially in zero-shot cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04366","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Conditional Flow-VAE for Safety-Critical Traffic Scenario Generation","primary_cat":"cs.RO","submitted_at":"2026-05-06T00:08:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A conditional flow matching model generates realistic safety-critical traffic scenarios by turning nominal scenes into dangerous rollouts using combined simulation and real data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04358","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Intermediate Representations are Strong AI-Generated Image Detectors","primary_cat":"cs.CV","submitted_at":"2026-05-05T23:26:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Intermediate layer embedding sensitivity to perturbations distinguishes AI-generated images from real ones, yielding higher AUROC on GenImage and Forensics Small benchmarks than prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03413","ref_index":93,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning to Theorize the World from Observation","primary_cat":"cs.LG","submitted_at":"2026-05-05T06:39:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NEO induces compositional latent programs as world theories from observations and executes them to enable explanation-driven generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02849","ref_index":59,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Active Sampling for Ultra-Low-Bit-Rate Video Compression via Conditional Controlled Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-04T17:25:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ActDiff-VC achieves up to 64.6% bitrate reduction at matched NIQE and improves perceptual metrics like KID and FID by using content-adaptive keyframe selection and budget-aware sparse trajectory selection to condition a diffusion decoder for ultra-low-bitrate video reconstruction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25358","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Benchmarking Layout-Guided Diffusion Models through Unified Semantic-Spatial Evaluation in Closed and Open Settings","primary_cat":"cs.CV","submitted_at":"2026-04-28T08:25:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces closed-set C-Bench and open-set O-Bench for layout-guided diffusion models, a unified semantic-spatial scoring protocol, and ranks six models after generating and evaluating 319,086 images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24885","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VibeToken: Scaling 1D Image Tokenizers and Autoregressive Models for Dynamic Resolution Generations","primary_cat":"cs.CV","submitted_at":"2026-04-27T18:08:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VibeToken enables autoregressive image generation at arbitrary resolutions using 64 tokens for 1024x1024 images with 3.94 gFID, constant 179G FLOPs, and better efficiency than diffusion or fixed AR baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20258","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rethinking Where to Edit: Task-Aware Localization for Instruction-Based Image Editing","primary_cat":"cs.CV","submitted_at":"2026-04-22T07:08:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Task-aware localization via attention cues and feature centroids from source/target streams in IIE models improves non-edit consistency while preserving instruction following.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16879","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Adaptive Forensic Feature Refinement via Intrinsic Importance Perception","primary_cat":"cs.CV","submitted_at":"2026-04-18T07:07:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"I2P adaptively selects the most discriminative layers from visual foundation models for synthetic image detection and constrains task updates to low-sensitivity parameter subspaces to improve specificity without harming generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13863","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PostureObjectstitch: Anomaly Image Generation Considering Assembly Relationships in Industrial Scenarios","primary_cat":"cs.CV","submitted_at":"2026-04-15T13:29:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PostureObjectStitch generates assembly-aware anomaly images by decoupling multi-view features into high-frequency, texture and RGB components, modulating them temporally in a diffusion model, and applying conditional loss plus geometric priors to preserve correct component relationships.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13841","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DiffMagicFace: Identity Consistent Facial Editing of Real Videos","primary_cat":"cs.CV","submitted_at":"2026-04-15T13:13:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DiffMagicFace uses concurrent fine-tuned text and image diffusion models plus a rendered multi-view dataset to achieve identity-consistent text-conditioned editing of real facial videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09850","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Training-Free Object-Background Compositional T2I via Dynamic Spatial Guidance and Multi-Path Pruning","primary_cat":"cs.CV","submitted_at":"2026-04-10T19:25:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A training-free method with time-dependent attention gating and trajectory pruning enhances object-background balance in diffusion-based image synthesis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08364","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MegaStyle: Constructing Diverse and Scalable Style Dataset via Consistent Text-to-Image Style Mapping","primary_cat":"cs.CV","submitted_at":"2026-04-09T15:29:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A scalable pipeline generates an intra-consistent, inter-diverse 1.4M style image dataset from text-to-image models and uses it to train a style encoder and generalizable style transfer model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09715","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MuPPet: Multi-person 2D-to-3D Pose Lifting","primary_cat":"cs.CV","submitted_at":"2026-04-08T12:29:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MuPPet introduces person encoding, permutation augmentation, and dynamic multi-person attention to outperform prior single- and multi-person 2D-to-3D pose lifting methods on group interaction datasets while improving occlusion robustness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05730","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Controllable Image Generation with Composed Parallel Token Prediction","primary_cat":"cs.LG","submitted_at":"2026-04-07T11:33:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A new formulation for composing discrete generative processes enables precise control over novel condition combinations in image generation, cutting error rates by 63% and speeding up inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04575","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Erasure or Erosion? Evaluating Compositional Degradation in Unlearned Text-To-Image Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-04-06T10:16:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Unlearning methods that strongly erase concepts from text-to-image diffusion models consistently degrade performance on attribute binding, spatial reasoning, and counting tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04172","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GENFIG1: Visual Summaries of Scholarly Work as a Challenge for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-05T16:30:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GENFIG1 is a new benchmark that tests whether vision-language models can create effective Figure 1 visuals capturing the central scientific idea from paper text.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.26357","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MPDiT: Multi-Patch Global-to-Local Transformer Architecture For Efficient Flow Matching and Diffusion Model","primary_cat":"cs.CV","submitted_at":"2026-03-27T12:30:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MPDiT uses a hierarchical multi-patch design in transformers to lower computation in diffusion models by handling coarse global features first then fine local details, plus faster-converging embeddings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.03233","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LTX-2: Efficient Joint Audio-Visual Foundation Model","primary_cat":"cs.CV","submitted_at":"2026-01-06T18:24:41+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LTX-2 generates high-quality synchronized audiovisual content from text prompts via an asymmetric 14B-video / 5B-audio dual-stream transformer with cross-attention and modality-aware guidance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.00103","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LTX-Video: Realtime Video Latent Diffusion","primary_cat":"cs.CV","submitted_at":"2024-12-30T19:00:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LTX-Video integrates Video-VAE and transformer for 1:192 latent compression and real-time video diffusion by moving patchifying to the VAE and letting the decoder finish denoising in pixel space.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.12528","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Show-o: One Single Transformer to Unify Multimodal Understanding and Generation","primary_cat":"cs.CV","submitted_at":"2024-08-22T16:32:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Show-o unifies autoregressive and discrete diffusion modeling inside one transformer to support multimodal understanding and generation tasks with competitive benchmark performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.06525","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Autoregressive Model Beats Diffusion: Llama for Scalable Image Generation","primary_cat":"cs.CV","submitted_at":"2024-06-10T17:59:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Scaled vanilla autoregressive models based on Llama achieve 2.18 FID on ImageNet 256x256 image generation, beating popular diffusion models without visual inductive biases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.02101","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CameraCtrl: Enabling Camera Control for Text-to-Video Generation","primary_cat":"cs.CV","submitted_at":"2024-04-02T16:52:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CameraCtrl enables accurate camera pose control in video diffusion models through a trained plug-and-play module and dataset choices emphasizing diverse camera trajectories with matching appearance.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"but holds significant value in terms of achieving desired results. To accomplish this, we address this problem by considering three key questions: (1) How can we effectively represent the camera condition to reflect the geometric movement in 3D space? (2) How can we seamlessly inject the camera condition into existing video generators without compromising frame quality and temporal consistency? (3) What type of training data should be utilized to ensure proper model training? This section is thus organized as follows: Sec. 3.1 presents a brief background discussion of video generation models; Sec. 3.2 introduces the camera representation used by CameraCtrl; Sec. 3.3 presents the camera model Φc for injecting camera representation into the video diffusion models."},{"citing_arxiv_id":"2403.05135","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ELLA: Equip Diffusion Models with LLM for Enhanced Semantic Alignment","primary_cat":"cs.CV","submitted_at":"2024-03-08T08:08:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ELLA introduces a timestep-aware semantic connector to link LLMs with diffusion models for improved dense prompt following, validated on a new 1K-prompt benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.04378","ref_index":73,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference","primary_cat":"cs.CV","submitted_at":"2023-10-06T17:11:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Latent Consistency Models enable high-fidelity text-to-image generation in 2-4 steps by directly predicting solutions to the probability flow ODE in latent space, distilled from pre-trained LDMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2308.06721","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"IP-Adapter: Text Compatible Image Prompt Adapter for Text-to-Image Diffusion Models","primary_cat":"cs.CV","submitted_at":"2023-08-13T08:34:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IP-Adapter adds effective image prompting to text-to-image diffusion models using a lightweight decoupled cross-attention adapter that works alongside text prompts and other controls.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2308.06571","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ModelScope Text-to-Video Technical Report","primary_cat":"cs.CV","submitted_at":"2023-08-12T13:53:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ModelScopeT2V is a 1.7-billion-parameter text-to-video model built on Stable Diffusion that adds temporal modeling and outperforms prior methods on three evaluation metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2307.04725","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning","primary_cat":"cs.CV","submitted_at":"2023-07-10T17:34:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A single motion module trained on videos adds temporally coherent animation to any personalized text-to-image model derived from the same base without additional tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2307.01952","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis","primary_cat":"cs.CV","submitted_at":"2023-07-04T23:04:57+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SDXL improves upon prior Stable Diffusion versions through a larger UNet backbone, dual text encoders, novel conditioning, and a refinement model, producing higher-fidelity images competitive with black-box state-of-the-art generators.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2303.01469","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Consistency Models","primary_cat":"cs.LG","submitted_at":"2023-03-02T18:30:16+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Consistency models achieve fast one-step generation with SOTA FID of 3.55 on CIFAR-10 and 6.20 on ImageNet 64x64 by directly mapping noise to data, outperforming prior distillation techniques.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2212.09748","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scalable Diffusion Models with Transformers","primary_cat":"cs.CV","submitted_at":"2022-12-19T18:59:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DiTs achieve SOTA FID of 2.27 on ImageNet 256x256 by scaling transformer-based latent diffusion models, with performance improving consistently as Gflops increase.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2210.08402","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LAION-5B: An open large-scale dataset for training next generation image-text models","primary_cat":"cs.CV","submitted_at":"2022-10-16T00:08:18+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LAION-5B is an openly released dataset of 5.85 billion CLIP-filtered image-text pairs that enables replication of foundational vision-language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2210.02303","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Imagen Video: High Definition Video Generation with Diffusion Models","primary_cat":"cs.CV","submitted_at":"2022-10-05T14:41:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Imagen Video generates high-definition text-conditional videos via a cascade of base and super-resolution diffusion models, achieving high fidelity and controllability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2209.14792","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Make-A-Video: Text-to-Video Generation without Text-Video Data","primary_cat":"cs.CV","submitted_at":"2022-09-29T13:59:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Make-A-Video achieves state-of-the-art text-to-video generation by decomposing temporal U-Net and attention structures to add space-time modeling to text-to-image models, trained without any paired text-video data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2209.14687","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Diffusion Posterior Sampling for General Noisy Inverse Problems","primary_cat":"stat.ML","submitted_at":"2022-09-29T11:12:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Diffusion models solve noisy (non)linear inverse problems via approximated posterior sampling that blends diffusion steps with manifold gradients without strict consistency projection.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"∥x0 − ˆx0∥dP (x0|xt) (48) (d) ≤ d√ 2πσ 2 e−1/2σ2 ∥∇xA(x)∥m1 (49) where dP (x0|xt) = p(x0|xt) dx0, (b) is the result of Lemma 3, (c) is from the intermediate value theorem, and (d) is from Proposition 2. B I NVERSE PROBLEM SETUP Super-resolution. The forward model for super-resolution is defined as y ∼ N (y|Lf x, σ2I), (Gaussian) (50) y ∼ P (y|Lf x; λ), (Poisson) (51) where Lf ∈ Rn×d represents the bicubic downsampling block Hankel matrix with the factor f, and P denotes the Poisson distribution with the parameter λ. Inpainting. For both box-type and random-type inpainting, the forward model reads y ∼ N (y|P x, σ2I), (Gaussian) (52) y ∼ P (y|P x; λ), (Poisson) (53) where P ∈ {0, 1}n×d is the masking matrix that consists of elementary unit vectors."},{"citing_arxiv_id":"2209.03003","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Flow Straight and Fast: Learning to Generate and Transfer Data with Rectified Flow","primary_cat":"cs.LG","submitted_at":"2022-09-07T08:59:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Rectified flow learns straight-path neural ODEs for distribution transport, yielding efficient generative models and domain transfers that work well even with a single simulation step.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2208.01626","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Prompt-to-Prompt Image Editing with Cross Attention Control","primary_cat":"cs.CV","submitted_at":"2022-08-02T17:55:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Cross-attention control in text-conditioned models enables localized and global image edits by editing only the input text prompt.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2208.01618","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion","primary_cat":"cs.CV","submitted_at":"2022-08-02T17:50:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Textual Inversion learns a single embedding vector from a few images to represent personal concepts inside the text embedding space of a frozen text-to-image model, enabling their composition in natural language prompts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2206.10789","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scaling Autoregressive Models for Content-Rich Text-to-Image Generation","primary_cat":"cs.CV","submitted_at":"2022-06-22T01:11:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Scaling an autoregressive Transformer to 20B parameters for text-to-image generation using image token sequences achieves new SOTA zero-shot FID of 7.23 and fine-tuned FID of 3.22 on MS-COCO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2205.11487","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding","primary_cat":"cs.CV","submitted_at":"2022-05-23T17:42:53+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Imagen achieves state-of-the-art photorealistic text-to-image generation by scaling a text-only pretrained T5 language model within a diffusion framework, reaching FID 7.27 on COCO without training on it.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2204.06125","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Hierarchical Text-Conditional Image Generation with CLIP Latents","primary_cat":"cs.CV","submitted_at":"2022-04-13T01:10:33+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A hierarchical prior-decoder model using CLIP latents generates more diverse text-conditional images than direct methods while preserving photorealism and caption fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2204.03458","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Video Diffusion Models","primary_cat":"cs.CV","submitted_at":"2022-04-07T14:08:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A diffusion model for video generation extends image architectures with joint image-video training and improved conditional sampling, delivering first large-scale text-to-video results and state-of-the-art performance on video prediction and unconditional generation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2112.10752","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"High-Resolution Image Synthesis with Latent Diffusion Models","primary_cat":"cs.CV","submitted_at":"2021-12-20T18:55:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Latent diffusion models achieve state-of-the-art inpainting and competitive results on unconditional generation, scene synthesis, and super-resolution by performing the diffusion process in the latent space of pretrained autoencoders with cross-attention conditioning, while cutting computational and","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}