{"total":242,"items":[{"citing_arxiv_id":"2606.28266","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RSICCLLM: A Multimodal Large Language Model for Remote Sensing Image Change Captioning","primary_cat":"cs.CV","submitted_at":"2026-06-26T16:57:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RSICCLLM introduces a post-training framework with RSICI dataset, difference-aware supervised fine-tuning, and dual-negative preference optimization that claims to outperform much larger models on remote sensing image change captioning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28094","ref_index":14,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OSOR: One-Step Diffusion Inpainting for Effect-Aware Object Removal","primary_cat":"cs.CV","submitted_at":"2026-06-26T13:53:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OSOR is a one-step diffusion inpainting method using an occupancy-guided discriminator, alpha head, and semantic-anchored verification pipeline to achieve effect-aware object removal, outperforming multi-step baselines in quality at 4-30x speed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27862","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ScaLe-INR: Scale and Learn Implicit Neural Representations","primary_cat":"cs.CV","submitted_at":"2026-06-26T09:02:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ScaLe-INR is a multi-branch INR architecture that applies directional scaling per the Fourier inverse theorem and a directional edge guidance loss to disentangle scales and improve reconstruction fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27784","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Improving Adversarial Robustness via Activation Amplification and Attenuation","primary_cat":"cs.CV","submitted_at":"2026-06-26T07:13:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A3 is a learnable activation scaling module that trains on amplified adversarial signals via contrastive losses to improve robustness when the same parameters are used in attenuation mode.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27738","ref_index":3,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HandMade: Spatial Prompting for Generative 3D Creation with Part-Labeled VR Sketches","primary_cat":"cs.HC","submitted_at":"2026-06-26T05:37:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HandMade converts segmented VR strokes into multi-view part guidance and structured prompts so generative 3D models better preserve user-specified spatial scaffolds than text-only or sketch baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27187","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HarmVideoBench: Benchmarking Harmful Video Understanding in Large Multimodal Models","primary_cat":"cs.CV","submitted_at":"2026-06-25T15:50:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HarmVideoBench is a multi-layered benchmark for harmful video understanding in LVLMs with three hierarchical dimensions, and BCR is a method that raises average model performance from 61.7% to 84.4%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26529","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Inattentional Gap: Task-Conditioned Language and Vision Models Omit the Safety-Critical Signals They Can Otherwise Report","primary_cat":"cs.CL","submitted_at":"2026-06-25T02:09:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Task conditioning suppresses safety-critical signal reporting in language and vision models that unconstrained versions report at higher rates, creating an inattentional gap that decouples benchmark safety from real-world safety.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25437","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LinStereo: Linear-Complexity Global Attention for Multi-Scale Iterative Stereo Matching","primary_cat":"cs.CV","submitted_at":"2026-06-24T05:58:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LinStereo uses Position-Aware Linear Attention, Hierarchical Semantic Cost Volumes, and Depth Prior Initialization to enable global aggregation in iterative stereo matching at linear complexity, showing improved performance on standard and underwater benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24628","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ArtiTwinSplat: Interactable Digital Twin Reconstruction via Gaussian Splatting from RGB-D videos","primary_cat":"cs.RO","submitted_at":"2026-06-23T14:24:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ArtiTwinSplat creates interactable digital twins of articulated objects from RGB-D videos via Gaussian Splatting and automatic part and joint discovery.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24449","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SENTRY: SAM2-Enhanced Neighbor-Aware and Temporally Reasoned Memory for Visual Tracking","primary_cat":"cs.CV","submitted_at":"2026-06-23T11:35:15+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SENTRY is a plug-and-play module that replaces confidence-based memory writes with neighbor-aware cycle-consistent validation in SAM2 trackers, yielding new zero-shot SOTA results on LaSOT, GOT-10k and other benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24375","ref_index":9,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MATCH: Flow Matching for Multi-View Anomaly Detection","primary_cat":"cs.CV","submitted_at":"2026-06-23T10:07:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MATCH is the first flow matching method for multi-view anomaly detection, reporting SOTA results on Real-IAD and the first comprehensive evaluation on MANTA-Tiny while enabling real-time use by omitting the divergence term.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23964","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"3D Masked Autoencoders are Robust Learners of Volumetric and Multimodal Cellular Representations for Microscopy","primary_cat":"cs.LG","submitted_at":"2026-06-22T21:45:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"3D masked autoencoders with multimodal alignment to ESM2 outperform 2D variants on single-cell microscopy tasks, reaching ROC-AUC 0.865 on protein-protein interaction and state-of-the-art AUC_micro 0.952 on localization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23843","ref_index":35,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HANCLIP: A Family of Hyperbolic Angular Negation Vision Language Models","primary_cat":"cs.CV","submitted_at":"2026-06-22T18:25:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HANCLIP restructures VLM embeddings with hyperbolic space and angular negation objectives to raise negation sensitivity on NegBench while keeping standard retrieval and classification performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23669","ref_index":6,"ref_count":4,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GeoFidelity-Bench: Evaluating Segment-Level Geographic Fidelity in Text-to-Image Street-View Generation","primary_cat":"cs.CV","submitted_at":"2026-06-22T17:53:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GeoFidelity-Bench shows text-to-image models gain city-level plausibility from local names but achieve near-zero improvement in exact segment identity, with GPS coordinates adding no benefit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23514","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Arbor: Explicit Geometric Conditioning for Controllable 3D Asset Generation","primary_cat":"cs.CV","submitted_at":"2026-06-22T16:00:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Arbor attaches constraint mesh tokens to a frozen text-to-3D denoiser to enable controllable generation obeying hull, avoidance, and touch constraints.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23312","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Pixels to Concepts: Growing Rich 3D Semantic Scene Graph Forests utilizing Foundation Models","primary_cat":"cs.RO","submitted_at":"2026-06-22T13:26:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Uses VLMs to detect instance concepts and LLMs to infer abstract relationships, assembling them into 3D scene graph forests that are evaluated on uHumans2 and ScanNet and tested in open-vocabulary retrieval on a Spot robot.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22945","ref_index":28,"ref_count":4,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Controllable Texture Tiling with Transformed RoPE-Enhanced Diffusion Models","primary_cat":"cs.GR","submitted_at":"2026-06-22T07:24:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A Diffusion Transformer framework applies coordinate-transformed RoPE and disjoint attention masks to achieve controllable, high-fidelity texture tiling that preserves reference structure and scene lighting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22725","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Interpretable Uncertainty Routing Separating Emotion Ambiguity from Distribution Shift in Facial Expression Recognition","primary_cat":"cs.CV","submitted_at":"2026-06-21T23:54:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Uncertainty decomposition via deep ensembles separates annotator disagreement from distribution shift in FER, enabling a routing mechanism that retains 1.8x more ambiguous faces at matched OOD rejection compared to single-uncertainty baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22648","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Leveraging target dynamics for imaging in complex media","primary_cat":"physics.optics","submitted_at":"2026-06-21T19:31:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Target dynamics provide an intrinsic source of variation equivalent to controlled illumination changes, enabling scattering-compensated reconstruction of dynamic scenes with one acquisition per frame in holographic and fluorescence imaging.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22631","ref_index":14,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"4DVLT: Dynamic Scene Understanding with Worldline-Centered Vision-Language Tracking","primary_cat":"cs.CV","submitted_at":"2026-06-21T18:33:15+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The paper defines the 4DVLT task for worldline-centered 4D scene understanding, releases Instruct-4D with 129.4K QA pairs, and presents 4DTrack achieving 62.68 TGA_Top1, outperforming adapted baselines by 19.62 points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22481","ref_index":236,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Lighting-Consistent Object Transfer Across Radiance Fields","primary_cat":"cs.GR","submitted_at":"2026-06-21T12:50:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Diffusion-based per-view harmonization for lighting-consistent object transfer between 3DGS scenes, using heterogeneous training data and final 3D consolidation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21373","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FLM-Occ: Feed-forward Likelihood Maximization for Efficient Indoor Occupancy Prediction","primary_cat":"cs.CV","submitted_at":"2026-06-19T12:26:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FLM-Occ reformulates indoor occupancy prediction as feed-forward likelihood maximization over a mixture model with volume-normalized weights, achieving superior accuracy on Occ-ScanNet using only 32 superquadrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21309","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"WildBox: A Dataset and Benchmark for Aerial Monocular 3D Detection of African Savanna Wildlife","primary_cat":"cs.CV","submitted_at":"2026-06-19T10:45:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"WildBox provides over 237k 3D wildlife annotations from drone video and benchmarks reveal zero-shot 3D detection at 0 AP but fine-tuned performance of 8.68 AP-BEV and 13.17 AP3D, with depth estimation causing most errors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21174","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HERO: Hypothesis-Driven Evidence Retrieval from Omics for Multi-Task Breast Cancer Analysis","primary_cat":"cs.CV","submitted_at":"2026-06-19T07:28:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HERO maps DNA methylation and miRNA to a 16-dimensional intent vector for TF-IDF caption retrieval and cosine-gated repair in VLM-based multi-task breast cancer prediction, claiming SOTA on TCGA-BRCA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20527","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"StylisticBias: A Few Human Visual Cues Drive Most Social Biases in MLLMs","primary_cat":"cs.CL","submitted_at":"2026-06-18T17:39:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"StylisticBias benchmark shows 15 visual attributes explain nearly 80% of bias variation in six MLLMs by isolating single cues like age and fashion in generated images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20419","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Spectral Query-Key Product Weight Steering for Training-Free VLM Hallucination Mitigation","primary_cat":"cs.CV","submitted_at":"2026-06-18T16:03:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"QK Product Steering suppresses dominant singular modes in the per-head QK product of selected middle layers via a closed-form query-only update, yielding 4.0% average relative CHAIR_s reduction on three GQA VLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20312","ref_index":29,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reliability-Aware Prototype Calibration for Frozen Pose-Flow Video Anomaly Detection","primary_cat":"cs.CV","submitted_at":"2026-06-18T14:46:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RPC is a post-hoc calibration technique that augments flow-based anomaly scores with nearest-prototype deviation in the frozen latent space, gated by keypoint confidence, yielding consistent AUROC gains on video anomaly detection tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20295","ref_index":69,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Token-Operations-Oriented Inference Optimization Techniques for Large Models","primary_cat":"cs.SE","submitted_at":"2026-06-18T14:33:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The paper introduces a four-layer technical architecture for token-operations-oriented inference optimization in large models and reviews key technologies and industry status at each layer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23717","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SpaCE: Rethinking Spatial Capacity and Generalization in Multi-Frame Multimodal Large Language Models","primary_cat":"eess.IV","submitted_at":"2026-06-16T20:47:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SpaCE derives four theoretical results on spatial capacity, sample complexity, generalization, and bias-variance trade-offs for multi-frame MLLM reasoning, validated on MultiSPA, CA-VQA, and SpatialRGPT.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17874","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Revisiting Structural Dependency in Autoregressive Multi-Task Table Recognition via Order-Independent Cell-Level Representations","primary_cat":"cs.CV","submitted_at":"2026-06-16T12:45:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces a non-causal attention refinement module to remove order dependence from cell representations in autoregressive table recognition models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17871","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"StepGuard: Guarding Web Navigation via Single-Step Calibration","primary_cat":"cs.AI","submitted_at":"2026-06-16T12:42:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"StepGuard framework with DDPO and CANR claims SOTA navigation and answer accuracy on web benchmarks by switching policies and triggering reflection on low-confidence steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17713","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Heterogeneous SAR-optical fusion for near-real-time land use and land cover mapping under cloud contamination: A novel framework and global benchmark dataset","primary_cat":"cs.CV","submitted_at":"2026-06-16T09:25:10+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CloudLULC-Net is an end-to-end heterogeneous SAR-optical fusion network for LULC mapping under cloud contamination that achieves 86.60% OA, 83.29% F1, and 73.51% mIoU on a new global benchmark of 40,223 samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17446","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AnnotateAnything: Automatic Annotation of 3D Assets for Robot Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-16T03:00:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AnnotateAnything converts passive 3D assets into manipulation-ready assets by combining vision-language reasoning for semantics with parallel physics pipelines for executable action annotations such as grasps and articulations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17334","ref_index":55,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FATE: Pillar Encoding and Frequency-Aware Training for Event-Based Object Detection","primary_cat":"cs.CV","submitted_at":"2026-06-15T22:32:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FATE combines pillar encoding via orthogonal polynomial basis with frequency-aware training to enable event-based object detection at up to 200 Hz without internal temporal sub-binning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17200","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ACE-Ego-0: Unifying Egocentric Human and Robotic Data for VLA Pretraining","primary_cat":"cs.RO","submitted_at":"2026-06-15T18:40:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ACE-Ego-0 is a VLA pretraining framework that turns egocentric human videos into robot-format pseudo-actions via a video-to-action pipeline and trains jointly with robot data under a reliability-aware objective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13312","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MagPlus: Bridging Micro-to-Regular Facial Expressions through Learnable Magnification","primary_cat":"cs.CV","submitted_at":"2026-06-11T13:08:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MagPlus learns magnification of micro-expressions to leverage pretrained macro-expression models for micro-expression tasks, with DeMagPlus restoring original intensity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13191","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Geometry of Phase Transitions in Generative Dynamics via Projection Caustics","primary_cat":"cs.LG","submitted_at":"2026-06-11T10:59:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper links phase-transition behavior in continuous generative samplers to projection caustics in the data geometry and introduces the Critical Boundary Detector as a diagnostic tool.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12759","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Sparse2Act: Learning Action-Aligned Sparse 3D Representations for Cross-Domain Robot Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-10T23:56:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Sparse2Act pretrains sparse 3D encoders via masked action-alignment supervision, yielding reusable representations that reach 86.9% success on LIBERO-10 and enable cross-domain transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12207","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Intelligent Automation for Embodied Benchmark Construction: Pipelines, Embodiments, Simulators, and Trends","primary_cat":"cs.RO","submitted_at":"2026-06-10T15:25:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Automation in embodied benchmark construction shifts costs from acquisition toward validation, auditability, version control, and long-term governance instead of simply lowering total cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12153","ref_index":23,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TopoCap: Learning Topology-Agnostic Motion Priors for Monocular Video-to-Animation","primary_cat":"cs.CV","submitted_at":"2026-06-10T14:41:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A two-stage generative model (Graph CVAE + flow matching) learns topology-agnostic motion codes from a new 5k-topology dataset and retargets video motion to arbitrary unseen skeletons.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11841","ref_index":25,"ref_count":3,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Scene-Adaptive Nonlinear Tone Curves for Pseudo Ground-Truth Generation in Low-Light 3D Gaussian Splatting","primary_cat":"cs.CV","submitted_at":"2026-06-10T09:20:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Scene-adaptive nonlinear tone curves (ASE and AP3) with percentile normalisation and offset outperform linear gain for pseudo-GT generation in low-light 3DGS, delivering PSNR gains up to 4.34 dB on LOM and 3.25 dB on RealX3D across 21 scenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11782","ref_index":19,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Seeing What Matters: Perceptual Wrapper with Common Randomness for 3D Gaussian Splatting","primary_cat":"cs.CV","submitted_at":"2026-06-10T08:14:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A plug-and-play perceptual wrapper using common random noise and Wasserstein Distortion supervision improves texture quality and reduces model size in 3D Gaussian Splatting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10595","ref_index":100,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Data Heterogeneity to Convergence: A Data-Centric Review of Federated Learning","primary_cat":"cs.CR","submitted_at":"2026-06-09T09:00:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A data-centric survey of federated learning that ranks non-IID data traits by influence on convergence, links splitting protocols to real phenomena, and examines data-related defenses under clean and adversarial conditions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10468","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Geometric Coastline Localization using Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-06-09T06:37:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoastlineVLM-7B, a 7B VLM fine-tuned from LLaVA/GeoChat, jointly detects coastline presence, classifies proxies, and outputs polylines, reducing Hausdorff distance to 31.84 m and EMD to 17.32 m versus segmentation baselines on NZCCD.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10196","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Fisher-Guided Progressive Parameter Selection for Adaptive Fine-Tuning","primary_cat":"cs.CV","submitted_at":"2026-06-08T21:35:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FisherAdapTune uses temporal drift in Fisher geometry, measured by scale-invariant Jensen-Shannon distance, to progressively freeze stabilized parameter groups during fine-tuning, reporting gains on segmentation and zero-shot transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08744","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MB-Loc: Multi-planar Bird's-eye-view Localization in outdoor LiDAR scenes","primary_cat":"cs.CV","submitted_at":"2026-06-07T17:25:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MB-Loc projects LiDAR point clouds into multi-planar BEV images, applies 2D CNNs with a KL-regularized latent bottleneck and 3D augmentations, and reports real-time state-of-the-art localization accuracy on the NCLT dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08615","ref_index":6,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Harnessing Streaming Video in the Wild","primary_cat":"cs.CV","submitted_at":"2026-06-07T13:00:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Presents Streaming-Train-248K dataset, Streaming Harness system, and Streaming-Eval benchmark to enable VLMs for proactive, memory-equipped streaming video understanding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08169","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CLASP: Language-Driven Robot Skill Selection and Composition using Task-Parameterized Learning","primary_cat":"cs.RO","submitted_at":"2026-06-06T13:33:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CLASP combines TP-KMPs with VLMs for language-guided skill selection, covariance-weighted composition, and active learning requests, reporting 73.3-100% success on a 7-DoF manipulator.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08152","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Vision-Guided Dual-Arm Humanoid Robotic Disassembly of End-of-Life 18650 Lithium-ion Battery Packs","primary_cat":"cs.RO","submitted_at":"2026-06-06T13:07:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Vision-guided dual-arm robotic pipeline achieves 8/10 success disassembling 21-cell 18650 packs from arbitrary poses with 2.4 mm localization error and 6-minute cycle time using RGB-D sensing and general grippers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08002","ref_index":20,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Aqua Boundary-Saliency Attention Module for Lightweight Underwater Salient Instance Segmentation Detection Transformer","primary_cat":"cs.CV","submitted_at":"2026-06-06T06:43:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LUSIS-DETR with AquaBSAM reports leading performance on four underwater instance segmentation datasets and real-time FP16 inference on an NVIDIA T4 GPU.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}