{"total":58,"items":[{"citing_arxiv_id":"2606.31467","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AeroVerse-SatAgent: UAV-Satellite Collaborative Spatial Reasoning Inspired by the Dual Visual Pathway Theory of Cognitive Neuroscience","primary_cat":"cs.CV","submitted_at":"2026-06-30T10:46:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SatAgent is a UAV-satellite collaborative spatial reasoning model using geometric 3D encoding, multi-view alignment, and a new 130K dataset that reports 25.91% and 11.69% gains over general and specialized baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31144","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Modular Vision-Language-Action Robotics Framework for Indoor Environments","primary_cat":"cs.RO","submitted_at":"2026-06-30T05:17:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Describes a modular VLA framework with semantic voxel mapping via OwlViT and VLM-based command classification and grounding for the CMU VLA Challenge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30367","ref_index":78,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FutureNav: Unified World-Action Modeling for Vision-and-Language Navigation","primary_cat":"cs.RO","submitted_at":"2026-06-29T14:33:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FutureNav proposes a 4B-scale VLM that jointly optimizes action prediction, inverse/forward dynamics, and future state generation for VLN and reports SOTA results on multiple benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29934","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RoamFlow: Reinforcement-Aligned One-Step Action MeanFlow Policy for Image-Goal Navigation","primary_cat":"cs.RO","submitted_at":"2026-06-29T08:10:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RoamFlow applies MeanFlow to predict average velocity fields for one-step action policies in image-goal navigation, trained via expert imitation followed by RL refinement.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29917","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Flying to Image-Specified Objects: 3D Quadrotor Navigation via Cross-Graph Memory and Viewpoint Planning","primary_cat":"cs.RO","submitted_at":"2026-06-29T07:54:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Proposes a hierarchical navigation framework with viewpoint-aware action nodes, cross-graph memory, and learning-based policy for quadrotor InstanceImageNav, claiming improvements over baselines in simulation and real-world validation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27745","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Panoramic Scene Analysis: A Survey from Distortion-Aware Engineering to Sphere-Native Foundation Modeling","primary_cat":"cs.CV","submitted_at":"2026-06-26T05:54:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Survey organizing panoramic scene analysis literature by architectural design and training paradigm, identifying the absence of methods achieving both strict spherical equivariance and full reuse of perspective-pretrained weights, plus five evaluation protocol gaps and a six-point roadmap.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01621","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Goal2Pixel: Grounding Goals to Pixels for Vision-Language Navigation","primary_cat":"cs.CV","submitted_at":"2026-06-01T03:12:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Goal2Pixel grounds VLN-CE goals to image pixels via VLM prediction plus keyframe memory, reaching 54.1% SR on R2R-CE Val-Unseen with 7.75 calls per episode versus 46.62 for action prediction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01313","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PSG-Nav: Probabilistic Scene Graph Navigation via Multiverse Decision Making","primary_cat":"cs.RO","submitted_at":"2026-05-31T16:00:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PSG-Nav introduces a probabilistic scene graph with multiverse sampling and an evidential calibrator to achieve new state-of-the-art success rates of 66.1%, 44.8%, and 67.9% on MP3D, HM3D, and HSSD open-vocabulary navigation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30639","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PInVerify: An Offline Embodied Benchmark for Active Instance Verification","primary_cat":"cs.CV","submitted_at":"2026-05-28T22:42:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PInVerify is a new offline embodied benchmark for active instance verification that supplies multi-view captures and 6-sector navigation topology, with MLLM baselines reaching 85.6% after fine-tuning but showing no reliable benefit from tested next-best-view strategies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30561","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VLM3: Vision Language Models Are Native 3D Learners","primary_cat":"cs.CV","submitted_at":"2026-05-28T20:48:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Standard VLMs achieve expert-level 3D performance on depth estimation, pose estimation, and object understanding via three simple techniques without architecture changes or regression losses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30161","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Far Looks Up: Probing Spatial Representation in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-28T16:18:01+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VLMs exhibit consistent vertical-distance entanglement in embeddings from perspective bias in natural images, producing accuracy gaps that a new synthetic benchmark SpatialTunnel exposes as model-intrinsic.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27367","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SpatialBench: Is Your Spatial Foundation Model an All-Round Player?","primary_cat":"cs.CV","submitted_at":"2026-05-26T17:59:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"SpatialBench evaluates 41 spatial foundation models across 6 paradigms and 5 task suites, finds they are not all-round players, and introduces the DA-Next-5M dataset plus DA-Next baseline model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23281","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DepthAgent: Towards Better Universal Depth Estimation via Sample-wise Expert Selection","primary_cat":"cs.CV","submitted_at":"2026-05-22T06:47:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A reinforcement-learned vision-language agent adaptively selects and fuses monocular depth experts per sample for better performance across camera geometries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23165","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Autonomous Frontier-Based Exploration with VLM Guidance","primary_cat":"cs.RO","submitted_at":"2026-05-22T02:33:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A VLM-based method for selecting exploration frontiers in robotics achieves up to 24% better map coverage than standard geometric heuristics in simulated indoor environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22036","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GA-VLN: Geometry-Aware BEV Representation for Efficient Vision-Language Navigation","primary_cat":"cs.CV","submitted_at":"2026-05-21T06:20:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GA-VLN builds a geometry-aware BEV representation from RGB-D inputs plus 3D foundation model features to deliver state-of-the-art vision-language navigation using only navigation data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21131","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniT: Unified Geometry Learning with Group Autoregressive Transformer","primary_cat":"cs.CV","submitted_at":"2026-05-20T13:04:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniT unifies online and offline 3D geometry perception via a Group Autoregressive Transformer that processes observation groups with anchor-free point map prediction and a scale-adaptive loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19797","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Depth2Pose: A Pose-Based Benchmark for Monocular Depth Estimation without Ground-Truth Depth","primary_cat":"cs.CV","submitted_at":"2026-05-19T12:59:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Depth2Pose is a new evaluation framework for monocular depth estimators that uses relative camera pose accuracy as a task-driven proxy and introduces the D2P dataset of challenging out-of-distribution scenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18729","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Robo-Cortex: A Self-Evolving Embodied Agent via Dual-Grain Cognitive Memory and Autonomous Knowledge Induction","primary_cat":"cs.RO","submitted_at":"2026-05-18T17:52:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Robo-Cortex proposes a self-evolving embodied navigation agent using dual-grain cognitive memory and autonomous knowledge induction from trajectories, reporting SPL gains on IGNav, AR, AEQA and preliminary real-robot tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17102","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VoxScene: Anchor-Conditioned Voxel Diffusion for Indoor Scene Arrangement","primary_cat":"cs.GR","submitted_at":"2026-05-16T18:10:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VoxScene is a new anchor-conditioned voxel diffusion model that synthesizes collision-free 3D indoor scene arrangements via discrete volumetric occupancies and uses the grids for asset retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16937","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DEVIS-GRPO: Unleashing GRPO on Dynamic Extreme View Synthesis","primary_cat":"cs.CV","submitted_at":"2026-05-16T11:14:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DEVIS-GRPO applies online policy gradients with an accumulative small-to-large view sampling strategy and multi-level rewards to improve trajectory-controlled extreme view video generation, reporting gains on Kubric-4D, iPhone, and DL3DV datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15876","ref_index":10,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unlocking Dense Metric Depth Estimation in VLMs","primary_cat":"cs.CV","submitted_at":"2026-05-15T11:54:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DepthVLM converts a standard VLM into a dense metric depth predictor by attaching a lightweight head and training under unified vision-text supervision, outperforming prior VLMs and some pure vision models on a new indoor-outdoor benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14135","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PanoPlane: Plane-Aware Panoramic Completion for Sparse-View Indoor 3D Gaussian Splatting","primary_cat":"cs.CV","submitted_at":"2026-05-13T21:39:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PanoPlane achieves up to 17.8% PSNR gains in sparse-view indoor novel view synthesis by using training-free plane-aware panoramic completion to supervise 3D Gaussian Splatting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13586","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HetScene: Heterogeneity-Aware Diffusion for Dense Indoor Scene Generation","primary_cat":"cs.CV","submitted_at":"2026-05-13T14:21:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HetScene proposes a two-stage heterogeneous diffusion framework that decomposes scenes into primary structural objects and secondary contextual objects to generate denser, more plausible indoor layouts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11762","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"NavOL: Navigation Policy with Online Imitation Learning","primary_cat":"cs.RO","submitted_at":"2026-05-12T08:33:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NavOL collects expert trajectory labels online from a global planner during policy rollouts in simulation to train a diffusion navigation policy, mitigating distribution shift and improving performance on visual navigation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09441","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Isolation: A Unified Benchmark for General-Purpose Navigation","primary_cat":"cs.RO","submitted_at":"2026-05-10T09:34:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniNavBench is a unified benchmark for general-purpose navigation featuring composite multi-skill instructions, support for humanoid, quadrupedal and wheeled robots, and 1779 human teleoperated trajectories across 170 environments.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"algorithmic needs, the system features a highly modular in- terface for customizing sensor count, position, and orientation (e.g., RGB-D cameras, LiDAR, panoramic vision systems). This is supported by a hybrid suite of 170 environments, blending 85 high-fidelity synthetic assets from GRScenes [27] with 85 photorealistic real-world scans from Matterport3D [5]. (3) Human Expert Demonstrations:We move beyond the limitations of mechanical shortest-path algorithms by curating a comprehensive dataset of 1,779 expert trajectories collected through human teleoperation. These demonstrations, which span an average length of 16.7 meters and a cumulative distance of 29.5 kilometers, encapsulate the subtle decision-"},{"citing_arxiv_id":"2605.09146","ref_index":101,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Thinking: Imagining in 360$^\\circ$ for Humanoid Visual Search","primary_cat":"cs.CV","submitted_at":"2026-05-09T20:10:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Imagining in 360° decouples visual search into a single-step probabilistic semantic layout predictor and an actor, removing the need for multi-turn CoT reasoning and trajectory annotations while improving efficiency in 360° environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07550","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mind the Gap: Geometrically Accurate Generative Reconstruction from Disjoint Views","primary_cat":"cs.CV","submitted_at":"2026-05-08T10:24:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"GLADOS reconstructs 3D geometry from disjoint views by generating intermediate perspectives, performing robust coarse alignment that tolerates generative inconsistencies, and iteratively expanding context for consistency.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"construct a unified command map, thereby eliminating the burden of forcing them to waste critical mission time on capturing overlapping terrain. Consumer-Level Capture and Virtual Tours.The creation of virtual real estate tours or digital twins often relies on casual users capturing images with their smartphones in highly unconstrained environments [23]. A typical user might photograph the front of a living room and then move on to the kitchen, completely omitting the connecting hallway or other transitional spaces. Traditional pho- togrammetry and structure-from-motion pipelines [24], as well as modern few-shot neural rendering methods [25], fail here due to a lack of dense visual overlap, treating the rooms as disconnected float-"},{"citing_arxiv_id":"2605.05960","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Plug-and-Play Label Map Diffusion for Universal Goal-Oriented Navigation","primary_cat":"cs.RO","submitted_at":"2026-05-07T10:06:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PLMD applies a denoising diffusion model to predict labels for unknown map regions, allowing goal localization in unexplored environments by substituting completed labels into existing navigation pipelines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04128","ref_index":14,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"JoyAI-Image: Awaking Spatial Intelligence in Unified Multimodal Understanding and Generation","primary_cat":"cs.GR","submitted_at":"2026-05-05T15:49:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"JoyAI-Image unifies visual understanding and generation via an MLLM-MMDiT architecture with spatial training signals to reach competitive benchmark performance and stronger spatial intelligence.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Measurement (SM), Spatial Relationship (SR), Camera Perception (CP), Multi-view Consistency (MC), and Scene-Aware Reasoning (SAR), as illustrated in Figure 6, across 19 diverse sub-tasks, establishing an extensible cornerstone for general-purpose spatial understanding. The data engine ingests a variety of sources, encompassing high-precision 3D indoor scans (e.g., ScanNet [26], Matterport3D [14], ARKitScenes [5], ScanNet++ [99], and Hypersim [67]) in addition to the aforementioned web-scale video sequences. To maintain a consistent geometric foundation, all ingested assets are normalized within a canonical coordinate system. The technical workflow of OpenSpatial begins with the acquisition of scene-level 3D oriented bounding boxes (OBBs), obtained either through manual curation or the 3D lifting procedure."},{"citing_arxiv_id":"2604.27620","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SpaAct: Spatially-Activated Transition Learning with Curriculum Adaptation for Vision-Language Navigation","primary_cat":"cs.CV","submitted_at":"2026-04-30T09:09:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SpaAct activates spatial awareness in VLMs using action retrospection, future frame prediction, and progressive curriculum learning to reach SOTA on VLN-CE benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"though modern VLMs may also be exposed to videos or multi-frame ∗Co-first authors. †Project leader. ‡Corresponding author. data during pre-training, their training tasks mainly strengthen semantic understanding that focuses on what is or happens in the image or video. However, in embodied navigation, the model needs to interact with 3D environments [9, 53]. Therefore, it's much more important to build dynamic spatial awareness that understands why the observation evolves over time and how the observation will change with the action executed. To help them build such awareness, we argue that it's required to equip VLMs with two complementary capabilities:backward action reasoningandforward transition prediction."},{"citing_arxiv_id":"2604.23629","ref_index":96,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Visual Synthesis to Interactive Worlds: Toward Production-Ready 3D Asset Generation","primary_cat":"cs.GR","submitted_at":"2026-04-26T09:44:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper surveys 3D asset generation methods and organizes them around the full production pipeline to assess which outputs meet engine-level requirements for interactive applications.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"M TxPBRUV/TopoPBRUV O ABO [90] '22 8K home goods M TxPBR/TopoPBR O OmniObj3D [91] '23 6K; 190 cat.MPCMV Tx/Par. O MVImgNet [92] '23 6.5M fr.; 238 cat.MVPC-/- O uCO3D [93] '25 170K seq.; 1K cat.MVGS Lg/- O Scenes- Real Indoor Scans ScanNet [94] '17 1.5K scans; 2.5M fr. RGB-DSmInst/- R ScanNet++ [95] '23 460 scenes laser+DSLRSmInstMat/TopoPBR R Matterport3D [96] '17 90 bldgs. panorama SmInst/Topo R HM3D [97] '21 1K buildings mesh SmNav/Nav R Scenes- Synthetic / Artist Structured3D [98] '20 3.5K houses render SmInstLy/TopoUV O Hypersim [99] '21 461 scenes; 77K img. renderSmInstMat/PBRTopo O 3D-FRONT [46] '21 18.8K rooms artist CADSmLy/TopoUV O Scenes- Procedural / Interactive ProcTHOR [14] '22 10K+ houses proceduralSmNavPh/NavPhysProc O"},{"citing_arxiv_id":"2604.22482","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Holo360D: A Large-Scale Real-World Dataset with Continuous Trajectories for Advancing Panoramic 3D Reconstruction and Beyond","primary_cat":"cs.CV","submitted_at":"2026-04-24T12:03:27+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17473","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Dual-Anchoring: Addressing State Drift in Vision-Language Navigation","primary_cat":"cs.CV","submitted_at":"2026-04-19T15:03:38+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"* indicates methods using the waypoint predictor from [19]. Thebestand second-best results are highlighted. Method Sensors R2R-CE RxR-CE Pano. Odo. Depth S-RGB NE↓OSR↑SR↑SPL↑NE↓SR↑SPL↑ HPN+DN* [23]✓ ✓ ✓6.31 40.0 36.0 34.0 - - - CMA* [18]✓ ✓ ✓6.20 52.0 41.0 36.0 8.76 26.5 22.1 Sim2Sim* [24]✓ ✓ ✓6.07 52.0 43.0 36.0 8.76 26.5 22.1 GridMM* [41]✓ ✓ ✓5.11 61.0 49.0 41.0 - - - DreamWalker* [39]✓ ✓ ✓5.53 59.0 49.0 44.0 - - - Reborn* [2]✓ ✓ ✓5.40 57.0 50.0 46.0 5.98 48.6 42.0 ETPNav* [1]✓ ✓ ✓4.71 65.0 57.0 49.0 5.64 54.7 44.8 HNR* [40]✓ ✓ ✓4.42 67.0 61.0 51.0 5.50 56.3 46.7 AG-CMTP [8]✓ ✓ ✓7.90 39.0 23.0 19.0 - - - R2R-CMTP [8]✓ ✓ ✓7.90 38.0 26.0 22.0 - - - Instruc-Nav [34]✓ ✓ ✓6.89 - 31.0 24.0 - - - LAW [38]✓ ✓ ✓6.83 44.0 35.0 31.0 10.90 8."},{"citing_arxiv_id":"2604.17407","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Think before Go: Hierarchical Reasoning for Image-goal Navigation","primary_cat":"cs.RO","submitted_at":"2026-04-19T12:30:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HRNav decomposes image-goal navigation into VLM-based short-horizon planning and RL-based execution with a wandering suppression penalty to improve performance in complex unseen settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14025","ref_index":274,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Feed-Forward 3D Scene Modeling: A Problem-Driven Perspective","primary_cat":"cs.CV","submitted_at":"2026-04-15T16:07:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper proposes a problem-driven taxonomy for feed-forward 3D scene modeling that groups methods by five core challenges: feature enhancement, geometry awareness, model efficiency, augmentation strategies, and temporal-aware modeling.","context_count":1,"top_context_role":"dataset","top_context_polarity":"background","context_text":"ABO [267] 50K/cubeR - GS-LRM, LVSM OmniObject3D [268] 6,000/cubeS AGG MeshFormer Objaverse [269] 818K/cubeS VGGT, LGM, LRM LRM WildRGBD [270] 23,049/cubeR VGGT, AnySplat - NYUv2 [271] 464/h⌢meR - CATSplat, Flash3D, WorldMirror TUM RGBD [272] 39/h⌢meR - FLARE, LoRA3D, VGGT-SLAM 7Scenes [273] 7/h⌢meR - Dust3R, VGGT, Fast3R ScanNet [22] 1,513/h⌢meR VGGT DepthSplat, Uni3R Matterport3D [274] 90/h⌢meR - Convolutional Occupancy Networks Replica [23] 18/h⌢meR VGGT, SAIL-Recon MeshSplat, LoRA3D Habitat [275] 211/h⌢meS Dust3R, MASt3R, VGGT - HyperSim [276] 461/h⌢meS VGGT, AnySplat - ARKitScenes [277] 1,661/h⌢meR Dust3R, MASt3R PreF3R ScanNet++ [278] 1,006/h⌢meR Dust3R, MASt3R PreF3R, Uni3R Hot3D [279] 1.5M/h⌢meR 4DGT 4DGT Static Scenes 3D [280] 41K/treeR Mono3R -"},{"citing_arxiv_id":"2604.12872","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OVAL: Open-Vocabulary Augmented Memory Model for Lifelong Object Goal Navigation","primary_cat":"cs.RO","submitted_at":"2026-04-14T15:22:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OVAL introduces an open-vocabulary memory model with structured descriptors and multi-value frontier scoring to enable efficient lifelong object goal navigation in unseen settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10789","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReplicateAnyScene: Zero-Shot Video-to-3D Composition via Textual-Visual-Spatial Alignment","primary_cat":"cs.CV","submitted_at":"2026-04-12T19:42:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReplicateAnyScene performs fully automated zero-shot video-to-compositional-3D reconstruction by cascading alignments of generic priors from vision foundation models across textual, visual, and spatial dimensions.","context_count":1,"top_context_role":"dataset","top_context_polarity":"background","context_text":"tion quality across diverse semantic, visual and geometric dimensions. Exten- sive experiments demonstrate that our method significantly outperforms existing baselines, setting a new standard for high-quality, fully automated compositional scene generation as shown in Fig. 1. 2 Related Works 2.1 3D Indoor Datasets Rapid advancements in 3D computer vision have driven the creation of numerous indoorscenedatasets[5,8,12,21,55,69].Typically,thesedatasetsprovidescanned meshes from real scenes along with semantic and instance segmentation. How- ever, their holistic representation inherently lacks compositional structure. This fundamental limitation hinders the further advancement of Embodied AI tasks that require object-level manipulation, such as robotic navigation [11,44] and"},{"citing_arxiv_id":"2604.08916","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MV3DIS: Multi-View Mask Matching via 3D Guides for Zero-Shot 3D Instance Segmentation","primary_cat":"cs.CV","submitted_at":"2026-04-10T03:26:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MV3DIS uses 3D-guided mask matching and depth consistency to produce more consistent multi-view 2D masks that refine into accurate zero-shot 3D instances.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07296","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OpenSpatial: A Principled Data Engine for Empowering Spatial Intelligence","primary_cat":"cs.CL","submitted_at":"2026-04-08T17:03:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OpenSpatial supplies a principled open-source data engine and 3-million-sample dataset that raises spatial-reasoning model performance by an average of 19 percent on benchmarks.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"consistency or measurement checks when permitted- encouraging the model to form a coherent 3D representation that generalizes across viewpoints. 3.2 Description of OpenSpatial-3M Dataset Data Source: Following VST [53], we leverage the meticulously annotated 3D bounding boxes from EmbodiedScan [44] as the foundational data for our pipeline, which aggregates scenes from ScanNet [15], Matterport3D [8], ARK- itScenes [4], and SUN-RGBD [38]. Notably, we excluded SUN-RGBD due to its relatively lower annotation fidelity. To further enhance environmental diversity, we incorporated pre-processed data from ScanNet++ [57] and Hypersim [35] as supplementary sources. Additionally, we collected a set of web data to fur- ther broaden the diversity of our data sources."},{"citing_arxiv_id":"2604.02389","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Audio Spatially-Guided Fusion for Audio-Visual Navigation","primary_cat":"cs.SD","submitted_at":"2026-04-02T07:15:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Audio Spatially-Guided Fusion improves generalization in audio-visual navigation on unheard sound sources by extracting spatial audio features and adaptively fusing them with visual data.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"ments to autonomously plan paths and reach sound source targets [13]. In recent years, existing studies have explored the audio-visual navigation task from different processing methods of application scenarios and observation informa- tion. SoundSpaces [6] builds the first simulation platform for audio-visual navigation tasks based on the Replica [7] and Matterport3D [14] real indoor scene datasets, providing a unified experimental environment for audio-visual navigation methods. LLA [15] adopts a phased perception and planning structure to enhance the stability during the policy learning process. A V-W AN [16] models observation information as sound and visual maps, achieving efficient navigation by de- composing long-term goals into short-term goals."},{"citing_arxiv_id":"2603.27105","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniDAC: Universal Metric Depth Estimation for Any Camera","primary_cat":"cs.CV","submitted_at":"2026-03-28T03:23:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UniDAC achieves universal metric depth estimation across camera types by decoupling relative depth prediction from spatially varying scale estimation using a depth-guided module and distortion-aware positional embedding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.26788","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReMemNav: A Rethinking and Memory-Augmented Framework for Zero-Shot Object Navigation","primary_cat":"cs.RO","submitted_at":"2026-03-25T09:07:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReMemNav improves zero-shot object navigation success and efficiency by integrating episodic memory and rethinking with VLMs, achieving SR/SPL gains of 1.7%/7.0% on HM3D v0.1, 18.2%/11.1% on HM3D v0.2, and 8.7%/7.9% on MP3D.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.20530","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Memory Over Maps: 3D Object Localization Without Reconstruction","primary_cat":"cs.RO","submitted_at":"2026-03-20T21:57:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A map-free localization method stores posed RGB-D keyframes, retrieves and re-ranks them with a VLM, then fuses sparse depth for on-demand 3D target estimates, matching reconstruction-based performance on navigation benchmarks with far lower build cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.05467","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MerNav: A Highly Generalizable Memory-Execute-Review Framework for Zero-Shot Object Goal Navigation","primary_cat":"cs.CV","submitted_at":"2026-02-05T09:15:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MerNav's Memory-Execute-Review framework improves success rates in zero-shot object goal navigation by 5-8% over baselines on four datasets while outperforming both training-free and supervised methods on key benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.07447","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PanoSAMic: Panoramic Image Segmentation from SAM Feature Encoding and Dual View Fusion","primary_cat":"cs.CV","submitted_at":"2026-01-12T11:39:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PanoSAMic modifies SAM with multi-stage feature encoding, spatio-modal fusion, spherical attention, and dual-view fusion to achieve SOTA panoramic semantic segmentation on public RGB and RGB-D datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.21714","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AstraNav-World: World Model for Foresight Control and Consistency","primary_cat":"cs.CV","submitted_at":"2025-12-25T15:31:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AstraNav-World unifies diffusion video generation and vision-language action planning in a single bidirectional model that improves trajectory accuracy, success rates, and zero-shot real-world adaptation in embodied navigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.17817","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Chorus: Multi-Teacher Pretraining for Holistic 3D Gaussian Scene Encoding","primary_cat":"cs.CV","submitted_at":"2025-12-19T17:22:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Chorus pretrains a shared 3D Gaussian scene encoder via multi-teacher distillation to capture holistic features from high-level semantics to fine-grained structure, with strong transfer on segmentation and point-cloud tasks using far fewer scenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.11232","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DoReMi: Bridging 3D Domains via Topology-Aware Domain-Representation Mixture of Experts","primary_cat":"cs.CV","submitted_at":"2025-11-14T12:32:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DoReMi uses self-supervised pre-training on topological and texture variations plus domain-aware experts with spatial-guided routing and entropy-controlled allocation to reach 80.1% mIoU on ScanNet and 77.2% mIoU on S3DIS.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.04320","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MacroNav: Multi-Task Context Representation Learning Enables Efficient Navigation in Unknown Environments","primary_cat":"cs.RO","submitted_at":"2025-11-06T12:47:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MacroNav learns multi-scale navigation-centric representations through multi-task self-supervised learning and combines them with graph-based reinforcement learning for efficient action selection, reporting gains in success rate and path efficiency over prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.20685","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"C-NAV: Towards Self-Evolving Continual Object Navigation in Open World","primary_cat":"cs.RO","submitted_at":"2025-10-23T15:57:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"C-Nav is a continual visual navigation framework with dual-path anti-forgetting via feature distillation and replay plus adaptive sampling that outperforms baselines on a new continual object navigation benchmark while using less memory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}