{"total":19,"items":[{"citing_arxiv_id":"2605.09976","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OZ-TAL: Online Zero-Shot Temporal Action Localization","primary_cat":"cs.CV","submitted_at":"2026-05-11T04:34:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Defines OZ-TAL task and presents a training-free VLM-based method that outperforms prior approaches for online and offline zero-shot temporal action localization on THUMOS14 and ActivityNet-1.3.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07915","ref_index":99,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"What Matters for Diffusion-Friendly Latent Manifold? Prior-Aligned Autoencoders for Latent Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-08T15:52:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Prior-Aligned AutoEncoders shape latent manifolds with spatial coherence, local continuity, and global semantics to improve latent diffusion, achieving SOTA gFID 1.03 on ImageNet 256x256 with up to 13x faster convergence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04425","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Joint Semantic Token Selection and Prompt Optimization for Interpretable Prompt Learning","primary_cat":"cs.CV","submitted_at":"2026-05-06T02:38:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IPL alternates discrete semantic token selection using approximate submodular optimization with continuous prompt optimization to boost both interpretability and task performance in vision-language model adaptation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01048","ref_index":91,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Compared to What? Baselines and Metrics for Counterfactual Prompting","primary_cat":"cs.CL","submitted_at":"2026-05-01T19:23:33+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Counterfactual prompting effects on LLMs are often indistinguishable from those caused by meaning-preserving paraphrases, causing most previously reported demographic sensitivities to disappear under proper statistical comparison.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00809","ref_index":77,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Let ViT Speak: Generative Language-Image Pre-training","primary_cat":"cs.CV","submitted_at":"2026-05-01T17:51:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GenLIP pretrains ViTs to generate language tokens from visual tokens via autoregressive language modeling, matching strong baselines on multimodal tasks with less data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21786","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Codebooks to VLMs: Evaluating Automated Visual Discourse Analysis for Climate Change on Social Media","primary_cat":"cs.CV","submitted_at":"2026-04-23T15:44:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VLMs recover reliable population-level trends in climate change visual discourse on social media even when per-image accuracy is only moderate.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11095","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Bottleneck Tokens for Unified Multimodal Retrieval","primary_cat":"cs.LG","submitted_at":"2026-04-13T07:12:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Bottleneck Tokens paired with a masked generative objective achieve state-of-the-art unified multimodal retrieval performance among 2B-scale models on the MMEB-V2 benchmark with 78 datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08762","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"InstrAct: Towards Action-Centric Understanding in Instructional Videos","primary_cat":"cs.CV","submitted_at":"2026-04-09T20:51:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"InstrAction pretrains video foundation models using action-centric data filtering, hard negatives, an Action Perceiver module, DTW-Align, and Masked Action Modeling to reduce static bias and outperform prior models on a new InstrAct Bench for semantic, procedural, and retrieval tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08337","ref_index":64,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"InstAP: Instance-Aware Vision-Language Pre-Train for Spatial-Temporal Understanding","primary_cat":"cs.CV","submitted_at":"2026-04-09T15:10:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"InstAP introduces instance-aware pre-training with a new dual-granularity dataset InstVL that improves both fine-grained instance retrieval and global video understanding over standard VLP baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.18930","ref_index":194,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Hallucination of Multimodal Large Language Models: A Survey","primary_cat":"cs.CV","submitted_at":"2024-04-29T17:59:41+00:00","verdict":"ACCEPT","verdict_confidence":"UNKNOWN","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The survey organizes causes of hallucinations in MLLMs, reviews evaluation benchmarks and metrics, and outlines mitigation approaches plus open questions.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"sively generate dispreferred feedback data using AI models. The dispreferred data is generated by: 1) utilizing GPT-4V to introduce plausible hallucinations into the answer, and 2) provoking inherent hallucination by introducing noise into MLLMs. In the DPO optimization framework, the ground-truth multimodal instructions serves as the preferred answers. RLAIF-V [194] argues that the most existing RLAIF frameworks rely on expensive proprietary models, limiting the scalability. To bridge the gap, this work proposes a solution to utilize fully open-sourced MLLM to generate high-quality feedback. CLIP-DPO [126] takes one step further by getting rid of offline data collection. Instead, starting from the initial pool of supervised fine-tuning"},{"citing_arxiv_id":"2402.17177","ref_index":79,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Sora: A Review on Background, Technology, Limitations, and Opportunities of Large Vision Models","primary_cat":"cs.CV","submitted_at":"2024-02-27T03:30:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"The paper reviews the background, technology, applications, limitations, and future directions of OpenAI's Sora text-to-video generative model based on public information.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"these issues by re-captioning existing images with detailed, descriptive captions. The approach first trains an image captioner, which is a vision-language model, to generate precise and descriptive image captions. The resulting descriptive image captions by the captioner are then used to fine-tune text-to-image models. Specifically, DALL·E 3 follows contrastive captioners (CoCa) [79] to jointly train an image captioner with a CLIP [26] architecture and a language model objective. This image captioner incorporates an image en- coder a unimodal text encoder for extracting language information, and a multimodal text decoder. It first employs a contrastive loss between unimodal image and text embeddings, followed by a captioning loss for"},{"citing_arxiv_id":"2404.08471","ref_index":293,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Revisiting Feature Prediction for Learning Visual Representations from Video","primary_cat":"cs.CV","submitted_at":"2024-02-15T18:59:11+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"V-JEPA models trained only on feature prediction from 2 million public videos achieve 81.9% on Kinetics-400, 72.2% on Something-Something-v2, and 77.9% on ImageNet-1K using frozen ViT-H/16 backbones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2312.14238","ref_index":170,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks","primary_cat":"cs.CV","submitted_at":"2023-12-21T18:59:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"InternVL scales a vision model to 6B parameters and aligns it with LLMs using web data to achieve state-of-the-art results on 32 visual-linguistic benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.16588","ref_index":279,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Vision Transformers Need Registers","primary_cat":"cs.CV","submitted_at":"2023-09-28T16:45:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Adding register tokens to Vision Transformers eliminates high-norm background artifacts and raises state-of-the-art performance on dense visual prediction tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2302.12192","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Aligning Text-to-Image Models using Human Feedback","primary_cat":"cs.LG","submitted_at":"2023-02-23T17:34:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A three-stage fine-tuning process uses human ratings to train a reward model and then improves text-to-image alignment by maximizing reward-weighted likelihood.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2301.12597","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models","primary_cat":"cs.CV","submitted_at":"2023-01-30T00:56:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BLIP-2 bootstraps vision-language pre-training from frozen image encoders and LLMs via a lightweight two-stage Querying Transformer, delivering SOTA results with 54x fewer trainable parameters than Flamingo80B on zero-shot VQAv2.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2210.08402","ref_index":92,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LAION-5B: An open large-scale dataset for training next generation image-text models","primary_cat":"cs.CV","submitted_at":"2022-10-16T00:08:18+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LAION-5B is an openly released dataset of 5.85 billion CLIP-filtered image-text pairs that enables replication of foundational vision-language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2206.10789","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Scaling Autoregressive Models for Content-Rich Text-to-Image Generation","primary_cat":"cs.CV","submitted_at":"2022-06-22T01:11:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Scaling an autoregressive Transformer to 20B parameters for text-to-image generation using image token sequences achieves new SOTA zero-shot FID of 7.23 and fine-tuned FID of 3.22 on MS-COCO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2205.11487","ref_index":80,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding","primary_cat":"cs.CV","submitted_at":"2022-05-23T17:42:53+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Imagen achieves state-of-the-art photorealistic text-to-image generation by scaling a text-only pretrained T5 language model within a diffusion framework, reaching FID 7.27 on COCO without training on it.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}