{"total":16,"items":[{"citing_arxiv_id":"2605.11367","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"3D-Belief: Embodied Belief Inference via Generative 3D World Modeling","primary_cat":"cs.CV","submitted_at":"2026-05-12T00:42:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"3D-Belief maintains and updates explicit 3D beliefs about partially observed environments to enable multi-hypothesis imagination and improved performance on embodied tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09644","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Attention Itself Could Retrieve.RetrieveVGGT: Training-Free Long Context Streaming 3D Reconstruction via Query-Key Similarity Retrieval","primary_cat":"cs.CV","submitted_at":"2026-05-10T16:41:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RetrieveVGGT enables constant-memory long-context streaming 3D reconstruction by retrieving relevant frames via query-key similarities in VGGT's first attention layer, outperforming StreamVGGT and others.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08371","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PaceVGGT: Pre-Alternating-Attention Token Pruning for Visual Geometry Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-08T18:27:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PaceVGGT reduces VGGT inference latency by up to 5.1x on ScanNet-50 via pre-AA token pruning with a distilled Token Scorer, per-frame keep budgets, adaptive merge/prune, and feature-guided restoration, while preserving reconstruction quality on ScanNet-50 and 7-Scenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06270","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Spark3R: Asymmetric Token Reduction Makes Fast Feed-Forward 3D Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-05-07T13:45:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Asymmetric token reduction, with distinct merging for queries and pruning for key-values plus layer-wise adaptation, delivers up to 28x speedup on 1000-frame 3D reconstruction inputs while preserving competitive quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21915","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Vista4D: Video Reshooting with 4D Point Clouds","primary_cat":"cs.CV","submitted_at":"2026-04-23T17:57:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Vista4D re-synthesizes dynamic videos from new viewpoints by grounding them in a 4D point cloud built with static segmentation and multiview training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18260","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Geometry-Guided 3D Visual Token Pruning for Video-Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-20T13:33:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Geo3DPruner uses geometry-aware global attention and two-stage voxel pruning to remove 90% of visual tokens from spatial videos while keeping over 90% of original performance on 3D scene benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15284","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GlobalSplat: Efficient Feed-Forward 3D Gaussian Splatting via Global Scene Tokens","primary_cat":"cs.CV","submitted_at":"2026-04-16T17:52:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GlobalSplat achieves competitive novel-view synthesis on RealEstate10K and ACID using only 16K Gaussians via global scene tokens and coarse-to-fine training, with a 4MB footprint and under 78ms inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15237","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"StreamCacheVGGT: Streaming Visual Geometry Transformers with Robust Scoring and Hybrid Cache Compression","primary_cat":"cs.CV","submitted_at":"2026-04-16T17:12:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StreamCacheVGGT improves streaming 3D geometry reconstruction accuracy and stability under fixed memory by using cross-layer token importance scoring and hybrid cache compression instead of pure eviction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14025","ref_index":103,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Feed-Forward 3D Scene Modeling: A Problem-Driven Perspective","primary_cat":"cs.CV","submitted_at":"2026-04-15T16:07:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper proposes a problem-driven taxonomy for feed-forward 3D scene modeling that groups methods by five core challenges: feature enhancement, geometry awareness, model efficiency, augmentation strategies, and temporal-aware modeling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10098","ref_index":195,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Attention Sink in Transformers: A Survey on Utilization, Interpretation, and Mitigation","primary_cat":"cs.LG","submitted_at":"2026-04-11T08:41:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The first survey on Attention Sink in Transformers structures the literature around fundamental utilization, mechanistic interpretation, and strategic mitigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08542","ref_index":101,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Scal3R: Scalable Test-Time Training for Large-Scale 3D Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-04-09T17:59:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Scal3R achieves better accuracy and consistency in large-scale 3D scene reconstruction by maintaining a compressed global context through test-time adaptation of lightweight neural networks on long video sequences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07350","ref_index":78,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Fast Spatial Memory with Elastic Test-Time Training","primary_cat":"cs.CV","submitted_at":"2026-04-08T17:59:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Elastic Test-Time Training stabilizes test-time updates via an elastic prior and moving-average anchor, enabling Fast Spatial Memory for scalable long-sequence 4D reconstruction with reduced memory use and fewer shortcuts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07279","ref_index":80,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mem3R: Streaming 3D Reconstruction with Hybrid Memory via Test-Time Training","primary_cat":"cs.CV","submitted_at":"2026-04-08T16:41:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Mem3R achieves better long-sequence 3D reconstruction by decoupling tracking and mapping with a hybrid memory of TTT-updated MLP and explicit tokens, reducing model size and trajectory errors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05351","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AnyImageNav: Any-View Geometry for Precise Last-Meter Image-Goal Navigation","primary_cat":"cs.RO","submitted_at":"2026-04-07T02:45:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AnyImageNav uses a semantic-to-geometric cascade with 3D multi-view foundation models to recover precise 6-DoF poses from goal images, achieving 0.27m position error and state-of-the-art success rates on Gibson and HM3D benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.00813","ref_index":92,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DVGT-2: Vision-Geometry-Action Model for Autonomous Driving at Scale","primary_cat":"cs.CV","submitted_at":"2026-04-01T12:21:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DVGT-2 is a streaming vision-geometry-action model that jointly reconstructs dense 3D geometry and plans trajectories online, achieving better reconstruction than prior batch methods while transferring directly to planning benchmarks without fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.10647","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Depth Anything 3: Recovering the Visual Space from Any Views","primary_cat":"cs.CV","submitted_at":"2025-11-13T18:59:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DA3 recovers consistent visual geometry from arbitrary views via a vanilla DINO transformer and depth-ray target, setting new SOTA on a visual geometry benchmark while outperforming DA2 on monocular depth.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}