{"total":20,"items":[{"citing_arxiv_id":"2605.13047","ref_index":62,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Revealing the Gap in Human and VLM Scene Perception through Counterfactual Semantic Saliency","primary_cat":"cs.CV","submitted_at":"2026-05-13T06:11:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLMs exhibit size, center, and saliency biases in scene understanding, relying less on people than humans do, with size bias as a key driver of divergence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12013","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"L2P: Unlocking Latent Potential for Pixel Generation","primary_cat":"cs.CV","submitted_at":"2026-05-12T12:01:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"L2P repurposes pre-trained LDMs for direct pixel generation via large-patch tokenization and shallow-layer training on synthetic data, matching source performance with 8-GPU training and enabling native 4K output.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10019","ref_index":80,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The two clocks and the innovation window: When and how generative models learn rules","primary_cat":"cs.LG","submitted_at":"2026-05-11T05:44:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Generative models learn rules before memorizing data, creating an innovation window whose width depends on dataset size and rule complexity, observed in both diffusion and autoregressive architectures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09313","ref_index":17,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Attention Sinks in Diffusion Transformers: A Causal Analysis","primary_cat":"cs.CV","submitted_at":"2026-05-10T04:14:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Suppressing attention sinks in diffusion transformers does not degrade CLIP-T alignment at moderate levels but induces sink-specific perceptual shifts six times larger than equal-budget random masking.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07915","ref_index":93,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"What Matters for Diffusion-Friendly Latent Manifold? Prior-Aligned Autoencoders for Latent Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-08T15:52:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Prior-Aligned AutoEncoders shape latent manifolds with spatial coherence, local continuity, and global semantics to improve latent diffusion, achieving SOTA gFID 1.03 on ImageNet 256x256 with up to 13x faster convergence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05646","ref_index":148,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MUSE: Resolving Manifold Misalignment in Visual Tokenization via Topological Orthogonality","primary_cat":"cs.CV","submitted_at":"2026-05-07T03:53:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MUSE decouples reconstruction and semantic learning in visual tokenization via topological orthogonality, yielding SOTA generation quality and improved semantic performance over its teacher model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05331","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ViTok-v2: Scaling Native Resolution Auto-Encoders to 5 Billion Parameters","primary_cat":"cs.CV","submitted_at":"2026-05-06T18:03:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ViTok-v2 is a 5B-parameter native-resolution image autoencoder using NaFlex and DINOv3 loss that matches or exceeds prior tokenizers at 256p and outperforms them at 512p and above while advancing the Pareto frontier in joint scaling with generators.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05204","ref_index":104,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"D-OPSD: On-Policy Self-Distillation for Continuously Tuning Step-Distilled Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-05-06T17:59:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"D-OPSD enables continuous supervised fine-tuning of few-step diffusion models via on-policy self-distillation where the model acts as both teacher (multimodal context) and student (text-only context) on its own roll-outs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02641","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mamoda2.5: Enhancing Unified Multimodal Model with DiT-MoE","primary_cat":"cs.CV","submitted_at":"2026-05-04T14:26:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Mamoda2.5 is a 25B-parameter DiT-MoE unified AR-Diffusion model that reaches top video generation and editing benchmarks with 4-step inference up to 95.9x faster than baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24351","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Diffusion Templates: A Unified Plugin Framework for Controllable Diffusion","primary_cat":"cs.LG","submitted_at":"2026-04-27T11:44:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Diffusion Templates is a unified plugin framework that allows injecting various controllable capabilities into diffusion models through a standardized interface.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22865","ref_index":65,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MeshLAM: Feed-Forward One-Shot Animatable Textured Mesh Avatar Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-04-23T09:01:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MeshLAM reconstructs high-fidelity animatable textured mesh head avatars from a single image via a feed-forward dual shape-texture architecture with iterative GRU decoding and reprojection-based guidance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21221","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Sparse Forcing: Native Trainable Sparse Attention for Real-time Autoregressive Diffusion Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-23T02:22:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Sparse Forcing adds a native trainable sparsity mechanism and PBSA kernel to autoregressive diffusion video models, yielding higher VBench scores and 1.1-1.27x speedups on 5s to 1min generations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18468","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Asset Harvester: Extracting 3D Assets from Autonomous Driving Logs for Simulation","primary_cat":"cs.CV","submitted_at":"2026-04-20T16:20:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Asset Harvester converts sparse in-the-wild object observations from AV driving logs into complete simulation-ready 3D assets via data curation, geometry-aware preprocessing, and a SparseViewDiT model that couples sparse-view multiview generation with 3D Gaussian lifting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18168","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Extending One-Step Image Generation from Class Labels to Text via Discriminative Text Representation","primary_cat":"cs.CV","submitted_at":"2026-04-20T12:28:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"By requiring and using highly discriminative LLM text features, the work enables the first effective one-step text-conditioned image generation with MeanFlow.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12322","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Self-Adversarial One Step Generation via Condition Shifting","primary_cat":"cs.CV","submitted_at":"2026-04-14T05:54:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"APEX derives self-adversarial gradients from condition-shifted velocity fields in flow models to achieve high-fidelity one-step generation, outperforming much larger models and multi-step teachers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07026","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Not all tokens contribute equally to diffusion learning","primary_cat":"cs.CV","submitted_at":"2026-04-08T12:45:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DARE mitigates neglect of important tokens in conditional diffusion models via distribution-rectified guidance and spatial attention alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04646","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Training-Free Refinement of Flow Matching with Divergence-based Sampling","primary_cat":"cs.CV","submitted_at":"2026-04-06T12:54:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Flow Divergence Sampler refines flow matching by computing velocity field divergence to correct ambiguous intermediate states during inference, improving fidelity in text-to-image and inverse problem tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06061","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PromptEvolver: Prompt Inversion through Evolutionary Optimization in Natural-Language Space","primary_cat":"cs.LG","submitted_at":"2026-04-03T17:00:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PromptEvolver recovers high-fidelity natural language prompts for given images by evolving them via genetic algorithm guided by a vision-language model, outperforming prior methods on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08586","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FluidFlow: a flow-matching generative model for fluid dynamics surrogates on unstructured meshes","primary_cat":"cs.LG","submitted_at":"2026-03-30T10:08:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FluidFlow uses conditional flow-matching with U-Net and DiT architectures to predict pressure and friction coefficients on airfoils and 3D aircraft meshes, outperforming MLP baselines with better generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.03233","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LTX-2: Efficient Joint Audio-Visual Foundation Model","primary_cat":"cs.CV","submitted_at":"2026-01-06T18:24:41+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LTX-2 generates high-quality synchronized audiovisual content from text prompts via an asymmetric 14B-video / 5B-audio dual-stream transformer with cross-attention and modality-aware guidance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}