{"total":24,"items":[{"citing_arxiv_id":"2605.26519","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"$R^3$: 3D Reconstruction via Relative Regression","primary_cat":"cs.CV","submitted_at":"2026-05-26T04:03:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"R³ uses relative regression with confidence-weighted constraints from an MLP to support long-context offline and streaming 3D reconstruction without global coordinate assumptions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26103","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Global Structure-from-Motion Meets Feedforward Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-05-25T17:58:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A new SfM pipeline combining classical and feedforward methods reports state-of-the-art results across multiple datasets and is released as open source.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23889","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HorizonStream: Long-Horizon Attention for Streaming 3D Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:50:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HorizonStream is a long-horizon Transformer that factorizes geometric evidence influence into channel-wise linear attention for long-range temporal propagation and local spatiotemporal attention for short-range matching, claiming stable generalization from 48-frame training to over 10,000-frame test","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21472","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Stream3D: Sequential Multi-View 3D Generation via Evidential Memory","primary_cat":"cs.CV","submitted_at":"2026-05-20T17:55:16+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19257","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PRISM-SLAM: Probabilistic Ray-Grounded Inference for Scale-aware Metric SLAM","primary_cat":"cs.RO","submitted_at":"2026-05-19T02:10:04+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17478","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Mamba-VGGT: Persistent Long-Sequence Video Geometry Grounded Transformer via External Sliding Window Mamba Memory","primary_cat":"cs.CV","submitted_at":"2026-05-17T14:34:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Mamba-VGGT introduces a Sliding Window Mamba memory module and Zero-Init Spatial Memory Injector to enable persistent long-range geometric reasoning in VGGT for extended video sequences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17327","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Efficient Feature-Free Initialization for Monocular Visual-Inertial Systems Using a Feed-Forward 3D Model","primary_cat":"cs.RO","submitted_at":"2026-05-17T08:35:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A feature-free monocular VINS initialization method that uses feed-forward 3D model point cloud predictions achieves over 90% success rate with under 1.2 seconds of data and performs robustly in degraded environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15186","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VGGT-Edit: Feed-forward Native 3D Scene Editing with Residual Field Prediction","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:59:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VGGT-Edit proposes a native 3D text-conditioned editing framework using depth-synchronized injection and residual field prediction, plus the DeltaScene dataset, outperforming 2D-lifting methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05749","ref_index":23,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Ray-Aware Pointer Memory with Adaptive Updates for Streaming 3D Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-05-07T06:42:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper proposes ray-aware pointer memory with adaptive retain-or-replace updates to improve long-term stability and pose accuracy in streaming 3D reconstruction.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"These tasks collectively assess the abil- ity of the system to maintainaccurate geometry and stable pose estimation over time. Datasets.For reconstruction evaluation, we use 7-Scenes and NRGBD. These datasets contain indoor scenes with challenging viewpoint changes and repeated structures. Depth estimation is evaluated on NYU-v2 [ 28], Sintel [ 4], Bonn [ 23], and KITTI [ 11]. Camera pose estimation is evaluated on ScanNet [7], Sintel [4], and TUM- dynamic [29]. These datasets cover bothstatic and dynamic scenes, allowing us to analyze the robustness of the proposed method across diverse environments. 4.2 Reconstruction Results Table 1 Quantitative comparison on the7-Scenes [27]andNRGBD [ 3] datasets for dense 3D reconstruction."},{"citing_arxiv_id":"2604.26567","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AirZoo: A Unified Large-Scale Dataset for Grounding Aerial Geometric 3D Vision","primary_cat":"cs.CV","submitted_at":"2026-04-29T11:52:13+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AirZoo is a new large-scale synthetic dataset for aerial 3D vision that improves state-of-the-art models on image retrieval, cross-view matching, and 3D reconstruction when used for fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26067","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RADIO-ViPE: Online Tightly Coupled Multi-Modal Fusion for Open-Vocabulary Semantic SLAM in Dynamic Environments","primary_cat":"cs.CV","submitted_at":"2026-04-28T19:09:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RADIO-ViPE performs online open-vocabulary semantic SLAM directly from monocular RGB video in dynamic environments by tightly coupling vision-language embeddings from foundation models with geometric factor-graph optimization using adaptive robust kernels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14795","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Keep It CALM: Toward Calibration-Free Kilometer-Level SLAM with Visual Geometry Foundation Models via an Assistant Eye","primary_cat":"cs.RO","submitted_at":"2026-04-16T08:58:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CAL2M achieves calibration-free kilometer-level SLAM by using an assistant eye for scale, epipolar-guided intrinsic correction, and anchor propagation for nonlinear sub-map alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10593","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MonoEM-GS: Monocular Expectation-Maximization Gaussian Splatting SLAM","primary_cat":"cs.RO","submitted_at":"2026-04-12T11:51:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MonoEM-GS stabilizes view-dependent geometry from foundation models inside a global Gaussian Splatting representation via EM and adds multi-modal features for in-place open-set segmentation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08542","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Scal3R: Scalable Test-Time Training for Large-Scale 3D Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-04-09T17:59:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Scal3R achieves better accuracy and consistency in large-scale 3D scene reconstruction by maintaining a compressed global context through test-time adaptation of lightweight neural networks on long video sequences.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"Best and second-best results are shown inboldand underlined . Method VKITTI2 KITTI Oxford Spires Resources 223-837 frames, 52-711 meters 271-4661 frames, 394-5067 meters 351-787 frames, 280-773 meters KITTI 03/04/10, avg. 758 frames RRE↓RTE↓ATE↓RRE↓RTE↓ATE↓RRE↓RTE↓ATE↓Memory↓Time↓FPS↑ MASt3R-SLAM [44] 15.81 70.48 78.33 22.42 67.72 191.71 59.67 29.82 29.22 6.74 99.30 7.37 VGGT-SLAM [41] 12.92 21.27 17.18 33.27 78.95 214.88 55.60 32.14 26.85 10.67 39.72 19.85 StreamVGGT [101] 13.47 58.07 68.97 24.06 84.46 226.15 71.28 37.14 34.35 6.66 32.61 23.14 STream3R [33] 13.46 76.06 70.87 24.06 81.63 227.77 71.29 36.65 34.65 4.70 111.23 8.19 CUT3R [80] 7.93 40.42 50.75 24.24 73.65 209.78 54.69 32.15 28.01 6.50 22.96 32.87 TTT3R [11] 5.88 16."},{"citing_arxiv_id":"2604.06830","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VGGT-SLAM++","primary_cat":"cs.CV","submitted_at":"2026-04-08T08:48:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"VGGT-SLAM++ improves on prior transformer SLAM by adding dense DEM submap graphs and high-cadence local optimization, achieving SOTA accuracy with reduced drift and bounded memory on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04667","ref_index":11,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ZeD-MAP: Bundle Adjustment Guided Zero-Shot Depth Maps for Real-Time Aerial Imaging","primary_cat":"cs.CV","submitted_at":"2026-04-06T13:21:17+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ZeD-MAP integrates incremental cluster-based bundle adjustment with zero-shot diffusion depth estimation to deliver metrically consistent real-time depth maps from high-resolution UAV imagery.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.20496","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Metric, inertially aligned monocular state estimation via kinetodynamic priors","primary_cat":"cs.RO","submitted_at":"2025-11-25T17:03:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The method combines a learned deformation model, continuous B-spline kinematics, and Newton's Second Law to enable accurate pose estimation and metric scale plus gravity recovery in monocular visual odometry on non-rigid platforms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.17207","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SING3R-SLAM: Submap-based Indoor Monocular Gaussian SLAM with 3D Reconstruction Priors","primary_cat":"cs.CV","submitted_at":"2025-11-21T12:40:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SING3R-SLAM adds submap-level global alignment and reconstruction priors to a Gaussian map to reduce drift and improve local geometry in monocular indoor SLAM.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.10647","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Depth Anything 3: Recovering the Visual Space from Any Views","primary_cat":"cs.CV","submitted_at":"2025-11-13T18:59:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DA3 recovers consistent visual geometry from arbitrary views via a vanilla DINO transformer and depth-ray target, setting new SOTA on a visual geometry benchmark while outperforming DA2 on monocular depth.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"This work laid the foundation for subsequent transformer-based methods aiming to unify multi-view geometry estimation at scale. Follow-up models extended this paradigm with multi-view inputs [10, 85, 94, 110], video input [19, 59, 94, 121], robust correspondence modeling [ 48], camera parameter injection [39, 43], large-scale SfM [18], SLAM applications [54], and view synthesis with 3D Gaussians [11, 13, 41, 79, 108, 122]. Among these, [91] push accuracy to a new level through large-scale training, a multi-stage architecture, and redundancy in design. In contrast, we focus on a minimal modeling strategy built around a single, simple transformer. Monocular depth estimation.Early monocular depth estimation methods relied on fully supervised learning"},{"citing_arxiv_id":"2509.26645","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TTT3R: 3D Reconstruction as Test-Time Training","primary_cat":"cs.CV","submitted_at":"2025-09-30T17:59:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TTT3R derives a closed-form learning rate from memory-observation alignment confidence to boost length generalization in RNN-based 3D reconstruction by 2x in global pose estimation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.10934","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ViPE: Video Pose Engine for 3D Geometric Perception","primary_cat":"cs.CV","submitted_at":"2025-08-12T18:39:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ViPE estimates camera intrinsics, motion, and dense near-metric depth from uncalibrated videos, outperforming baselines on TUM and KITTI while releasing annotations for 96M frames across real and generated videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.17596","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PRIX: Learning to Plan from Raw Pixels for End-to-End Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2025-07-23T15:28:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PRIX presents an efficient camera-only planner with a novel CaRT module that matches larger multimodal models on NavSim and nuScenes while reducing model size and inference time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.16443","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VGGT-Long: Chunk it, Loop it, Align it -- Pushing VGGT's Limits on Kilometer-scale Long RGB Sequences","primary_cat":"cs.CV","submitted_at":"2025-07-22T10:39:04+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"VGGT-Long extends VGGT with chunking, overlap alignment, and loop closure to produce consistent kilometer-scale 3D reconstructions from monocular RGB sequences without retraining or extra supervision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.07982","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Geometry Forcing: Marrying Video Diffusion and 3D Representation for Consistent World Modeling","primary_cat":"cs.CV","submitted_at":"2025-07-10T17:55:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Geometry Forcing aligns video diffusion representations with geometric foundation model features via angular cosine and scale regression objectives to improve 3D consistency in generated videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}