{"total":13,"items":[{"citing_arxiv_id":"2605.12500","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SenseNova-U1: Unifying Multimodal Understanding and Generation with NEO-unify Architecture","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:59:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SenseNova-U1 presents native unified multimodal models that match top understanding VLMs while delivering strong performance in image generation, infographics, and interleaved tasks via the NEO-unify architecture.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05781","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Steering Visual Generation in Unified Multimodal Models with Understanding Supervision","primary_cat":"cs.CV","submitted_at":"2026-05-07T07:20:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Using understanding tasks as direct supervision during post-training improves image generation and editing in unified multimodal models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05646","ref_index":98,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MUSE: Resolving Manifold Misalignment in Visual Tokenization via Topological Orthogonality","primary_cat":"cs.CV","submitted_at":"2026-05-07T03:53:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MUSE decouples reconstruction and semantic learning in visual tokenization via topological orthogonality, yielding SOTA generation quality and improved semantic performance over its teacher model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25636","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Refinement via Regeneration: Enlarging Modification Space Boosts Image Refinement in Unified Multimodal Models","primary_cat":"cs.CV","submitted_at":"2026-04-28T13:36:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Refinement via Regeneration (RvR) reformulates image refinement in unified multimodal models as conditional regeneration using prompt and semantic tokens from the initial image, yielding higher alignment scores than editing-based methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21926","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Seeing Without Eyes: 4D Human-Scene Understanding from Wearable IMUs","primary_cat":"cs.CV","submitted_at":"2026-04-23T17:59:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IMU-to-4D uses wearable IMU data and repurposed LLMs to predict coherent 4D human motion plus coarse scene structure, outperforming cascaded state-of-the-art pipelines in temporal stability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07422","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Multimodal Large Language Models for Multi-Subject In-Context Image Generation","primary_cat":"cs.LG","submitted_at":"2026-04-08T15:37:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MUSIC is the first MLLM for multi-subject in-context image generation that uses an automatic data pipeline, vision chain-of-thought reasoning, and semantics-driven spatial layout planning to outperform prior methods on a new MSIC benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.15564","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Show-o2: Improved Native Unified Multimodal Models","primary_cat":"cs.CV","submitted_at":"2025-06-18T15:39:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Show-o2 unifies text, image, and video understanding and generation in a single autoregressive-plus-flow-matching model built on 3D causal VAE representations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.03147","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"UniWorld-V1: High-Resolution Semantic Encoders for Unified Visual Understanding and Generation","primary_cat":"cs.CV","submitted_at":"2025-06-03T17:59:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UniWorld-V1 shows that semantic features from large multimodal models enable unified visual understanding and generation, achieving strong results on perception and manipulation tasks with only 2.7 million training samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.14683","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Emerging Properties in Unified Multimodal Pretraining","primary_cat":"cs.CV","submitted_at":"2025-05-20T17:59:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BAGEL is a unified decoder-only model that develops emerging complex multimodal reasoning abilities after pretraining on large-scale interleaved data and outperforms prior open-source unified models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.09568","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"BLIP3-o: A Family of Fully Open Unified Multimodal Models-Architecture, Training and Dataset","primary_cat":"cs.CV","submitted_at":"2025-05-14T17:11:07+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BLIP3-o uses a diffusion transformer to generate CLIP image features and a sequential pretraining strategy to build open models that perform strongly on both image understanding and generation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.17811","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Janus-Pro: Unified Multimodal Understanding and Generation with Data and Model Scaling","primary_cat":"cs.AI","submitted_at":"2025-01-29T18:00:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Scaling data, model size, and training optimization on the Janus architecture yields better multimodal understanding and more stable, instruction-following text-to-image generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2409.18869","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Emu3: Next-Token Prediction is All You Need","primary_cat":"cs.CV","submitted_at":"2024-09-27T16:06:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Emu3 shows that next-token prediction on a unified discrete token space for text, images, and video lets a single transformer outperform task-specific models such as SDXL and LLaVA-1.6 in multimodal generation and perception.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.12528","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Show-o: One Single Transformer to Unify Multimodal Understanding and Generation","primary_cat":"cs.CV","submitted_at":"2024-08-22T16:32:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Show-o unifies autoregressive and discrete diffusion modeling inside one transformer to support multimodal understanding and generation tasks with competitive benchmark performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}