{"total":87,"items":[{"citing_arxiv_id":"2605.23198","ref_index":59,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Label-Efficient Dataset Pruning via Semi-Supervised Pseudo-Labeling","primary_cat":"cs.LG","submitted_at":"2026-05-22T03:29:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SemiPrune uses a small labeled subset and semi-supervised pseudo-labeling to enable supervised dataset pruning methods, achieving state-of-the-art results on domain-specific, image-corrupted, and long-tailed datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23033","ref_index":64,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Uncovering the Latent Potential of Deep Intermediate Representations","primary_cat":"cs.LG","submitted_at":"2026-05-21T20:58:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces LOES, a constructive spectral method to select task-discriminative subspaces from intermediate layer embeddings, and GeoReg for enforcing simplicial class geometry during fine-tuning, with reported gains increasing with model depth across modalities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22894","ref_index":5,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"SCRIPT: Scalable Diffusion Policy with Multi-stage Training for Language-driven Physics-based Humanoid Control","primary_cat":"cs.GR","submitted_at":"2026-05-21T14:17:21+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21882","ref_index":51,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Thermo-VL: Extending Vision-Language Models to Thermal Infrared Perception","primary_cat":"cs.CV","submitted_at":"2026-05-21T01:43:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Thermo-VL augments a frozen Molmo-7B VLM with a trainable thermal encoder and prompt-conditioned dual-attention fusion to improve cross-spectrum visual reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21479","ref_index":111,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"WikiVQABench: A Knowledge-Grounded Visual Question Answering Benchmark from Wikipedia and Wikidata","primary_cat":"cs.CV","submitted_at":"2026-05-20T17:58:24+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"WikiVQABench is a human-curated collection of Wikipedia-based VQA items that require both visual evidence and external knowledge from Wikidata to answer correctly.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21123","ref_index":32,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Linear-DPO: Linear Direct Preference Optimization for Diffusion and Flow-Matching Generative Models","primary_cat":"cs.CV","submitted_at":"2026-05-20T12:54:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Linear-DPO replaces sigmoid utility with linear utility and adds EMA reference to improve preference alignment in diffusion and flow-matching text-to-image models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20551","ref_index":15,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Faster or Stronger: Towards Flexible Visual Place Recognition via Weighted Aggregation and Token Pruning","primary_cat":"cs.CV","submitted_at":"2026-05-19T23:01:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Proposes weighted aggregation of clusters and self-distillation-driven token pruning to improve both accuracy and efficiency in ViT-based visual place recognition.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20390","ref_index":23,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"STELLAR: Scaling 3D Perception Large Models for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-19T18:40:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"STELLAR trains up to 500M-parameter multi-modal models on 50M driving scenes and reports empirical scaling trends plus new state-of-the-art results on the Waymo Open Dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19366","ref_index":48,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Accurate, Efficient, and Explainable Deep Learning Approaches for Environmental Science Problems","primary_cat":"cs.LG","submitted_at":"2026-05-19T04:58:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The work introduces WaLeF/FIDLAr for flood forecasting, CoDiCast for probabilistic weather, and Hypercube-RAG for explainable environmental QA, claiming superior accuracy, efficiency, and interpretability over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19343","ref_index":101,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"What Makes a Representation Good for Single-Cell Perturbation Prediction?","primary_cat":"cs.LG","submitted_at":"2026-05-19T04:30:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PerturbedVAE disentangles perturbation-specific signals from invariant gene expression structure to recover causal representations and improve out-of-distribution prediction in single-cell perturbation modeling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19101","ref_index":9,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Heterogeneity-Aware Dataset Scheduling for Efficient Audio Large Language Model Training","primary_cat":"cs.SD","submitted_at":"2026-05-18T20:41:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GST uses gradient-based affinity metrics to form dataset groups and applies progressive scheduling, achieving 30-40% faster convergence than uniform mixture training on 14 AudioQA datasets while matching or exceeding performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19075","ref_index":51,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"CRAFT: Critic-Refined Adaptive Key-Frame Targeting for Multimodal Video Question Answering","primary_cat":"cs.CV","submitted_at":"2026-05-18T20:01:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CRAFT introduces a query-conditioned pipeline with dynamic keyframe selection, ASR, and a hybrid critic loop that achieves top scores on MAGMaR 2026 for grounded multi-video question answering.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18147","ref_index":187,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Foundation Models for Credit Risk Prediction: A Game Changer?","primary_cat":"cs.LG","submitted_at":"2026-05-18T09:52:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Tabular foundation models outperform standard methods in credit risk PD and LGD tasks, with larger gains on smaller datasets when used out-of-the-box.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18010","ref_index":84,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Functionalization via Structure Completion and Motion Rectification","primary_cat":"cs.CV","submitted_at":"2026-05-18T08:05:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Object functionalization is cast as neural graph completion over a functional graph of parts, contacts, and motions, followed by geometry realization that also rectifies erroneous motions, demonstrated on furniture with a new paired dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17990","ref_index":65,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Low Latency Gaze Tracking via Latent Optical Sensing","primary_cat":"cs.CV","submitted_at":"2026-05-18T07:46:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A hardware prototype performs gaze estimation by optically encoding task-relevant features with a microlens array and mask, captured on a 4x4 phototransistor array and decoded by a small neural network, reaching 3.4 ms latency with competitive accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17543","ref_index":3,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"HL-OutPaint: Coarse-to-Fine Video Outpainting for High-Resolution Long-Range Videos","primary_cat":"cs.CV","submitted_at":"2026-05-17T16:52:38+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17486","ref_index":12,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DyGRO-VLA: Cross-Task Scaling of Vision-Language-Action Models via Dynamic Grouped Residual Optimization","primary_cat":"cs.RO","submitted_at":"2026-05-17T14:55:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DyGRO-VLA is a two-stage optimization framework for cross-task scaling of Vision-Language-Action models via dynamic grouped residual optimization in RL.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"¯Qmin(s,a) = min j∈{1,...,K} ¯Q¯θj(s,a),(10) which yields the backup operatorB π ¯Qmin. h-step (chunk) Bellman regression.Each critic is trained by regressing to the shared target: LTD(θi) =E (st,at,r(h) t ,st+h)∼D h Qθi(st,a t)− B π ¯Qmin(st,a t) \u00012i .(11) The correspondingh-step Bellman backup is Bπ ¯Qmin(st,a t) =r (h) t +γ h Ea′∼π(·|st+h) \u0002 ¯Qmin(st+h,a ′) \u0003 ,(12) where theh-step return is r(h) t = h−1X i=0 γi rt+i.(13) Cal-QL calibration regularizer.To enable a smooth transition from offline data to online rollouts, we apply the Cal-QL calibration regularizer to each ensemble member, encouraging high value on policy actions while remaining anchored to dataset actions: LCalReg(θi) =E st∼D h Ea∼π(·|st) \u0002 max"},{"citing_arxiv_id":"2605.20237","ref_index":54,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"AnimeAdapter: Fine-grained and Consistent Zero-shot Anime Character Generation","primary_cat":"cs.CV","submitted_at":"2026-05-17T07:40:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"AnimeAdapter is a pretrained lightweight adapter for Stable Diffusion that uses semantic-selective local attention from CLIP and pose-aware conditioning to enable zero-shot fine-grained consistent anime character generation from a single reference image.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17228","ref_index":113,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Artificial Intolerance: Stigmatizing Language in Clinical Documentation Skews Large Language Model Decision-Making","primary_cat":"cs.CL","submitted_at":"2026-05-17T02:28:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Frontier LLMs exhibit bias from stigmatizing language in clinical vignettes across four conditions, skewing decisions toward less aggressive management, with limited mitigation from Chain-of-Thought or self-debiasing prompts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17118","ref_index":24,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Differentiable Optimization Layers for Guaranteed Fairness in Deep Learning","primary_cat":"cs.LG","submitted_at":"2026-05-16T18:49:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces a fairness layer for deep learning models that guarantees output parity and an online primal-dual algorithm for aggregate fairness guarantees in streaming predictions with small batch sizes.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"entropy loss with a squared pairwise group-gap penalty. Let si denote the demographic group label for observation i, let G denote the set of demographic groups appearing in the batch, and define Ig ={i:s i =g} . The augmented objective is then: Lpenalty =− 1 N NX i=1 [yi log(σ(zi)) + (1−y i) log(1−σ(z i))] +λ X g,h∈G g<h   1 |Ig| X i∈Ig zi − 1 |Ih| X i∈Ih zi   2 . (24) Here, zi is the raw logit output of the model for sample i, σ(·) denotes the sigmoid function, and λ >0 controls the strength of the fairness penalty. The penalty term is the sum of squared pairwise differences between average logits across demographic groups in the batch. As in the other experiments, the Penalty method selects λ from a candidate"},{"citing_arxiv_id":"2605.17031","ref_index":6,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"A Joint Synthetic Housing-Household Inventory","primary_cat":"cs.CY","submitted_at":"2026-05-16T14:58:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A framework integrates synthetic population generation from ACS PUMS, deep contrastive learning for housing-household compatibility, and hierarchical optimization to produce a joint inventory that matches block-group demographics and spatial patterns in coastal North Carolina.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16638","ref_index":23,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"TTE-Flash: Accelerating Reasoning-based Multimodal Representations via Think-Then-Embed Tokens","primary_cat":"cs.AI","submitted_at":"2026-05-15T21:10:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TTE-Flash trains latent think tokens with CoT generation loss and embedding tokens with contrastive loss to deliver high-performance multimodal representations without generating explicit reasoning at inference time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14422","ref_index":44,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"What if Tomorrow is the World Cup Final? Counterfactual Time Series Forecasting with Textual Conditions","primary_cat":"cs.LG","submitted_at":"2026-05-14T06:10:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces the task of counterfactual time series forecasting with textual conditions plus a text-attribution mechanism that improves accuracy by distinguishing mutable from immutable factors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14270","ref_index":45,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Diagnosing and Correcting Concept Omission in Multimodal Diffusion Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-14T02:14:09+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14091","ref_index":132,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Venus-DeFakerOne: Unified Fake Image Detection & Localization","primary_cat":"cs.CV","submitted_at":"2026-05-13T20:20:33+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14075","ref_index":12,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Rethinking Layer Relevance in Large Language Models Beyond Cosine Similarity","primary_cat":"cs.LG","submitted_at":"2026-05-13T19:51:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Cosine similarity poorly predicts performance degradation from layer removal in LLMs, making direct accuracy-drop ablation a more reliable relevance metric.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13838","ref_index":89,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"R-DMesh: Video-Guided 3D Animation via Rectified Dynamic Mesh Flow","primary_cat":"cs.CV","submitted_at":"2026-05-13T17:58:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"R-DMesh generates high-fidelity 4D meshes aligned to video by disentangling base mesh, motion, and a learned rectification jump offset inside a VAE, then using Triflow Attention and rectified-flow diffusion.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13943","ref_index":36,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"A Unified Geometric Framework for Weighted Contrastive Learning","primary_cat":"cs.LG","submitted_at":"2026-05-13T17:48:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Weighted InfoNCE objectives realize specific target geometries in embedding space, with SupCon producing size-dependent inter-class similarities under imbalance while Soft SupCon and certain continuous variants preserve regular simplex or unique optima.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13651","ref_index":12,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"NAACA: Training-Free NeuroAuditory Attentive Cognitive Architecture with Oscillatory Working Memory for Salience-Driven Attention Gating","primary_cat":"cs.SD","submitted_at":"2026-05-13T15:09:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NAACA uses a neuro-inspired oscillatory working memory to gate attention in audio language models, raising AudioQwen's average precision from 53.5% to 70.6% on XD-Violence while cutting unnecessary calls.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12939","ref_index":41,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DirectTryOn: One-Step Virtual Try-On via Straightened Conditional Transport","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:18:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DirectTryOn achieves state-of-the-art one-step virtual try-on performance by applying pure conditional transport, garment preservation loss, and self-consistency loss to straighten trajectories in pretrained generative models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16399","ref_index":46,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Stable and Near-Reversible Diffusion ODE Solvers for Image Editing","primary_cat":"cs.CV","submitted_at":"2026-05-12T18:34:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Near-reversible Runge-Kutta diffusion ODE solvers with vector-field smoothing improve stability and edit fidelity for large changes in text-guided image editing compared to exactly reversible alternatives.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12485","ref_index":54,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Letting the neural code speak: Automated characterization of monkey visual neurons through human language","primary_cat":"q-bio.NC","submitted_at":"2026-05-12T17:58:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Natural language descriptions generated via a closed-loop pipeline with digital twins capture the selectivity of most neurons in macaque V1 and V4, with synthesized images driving 96% of V4 neurons into the top or bottom 5% of natural-image response distributions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16384","ref_index":45,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Mutual Enhancement Between Global Tokens and Patch Tokens: From Theory to Practice","primary_cat":"cs.CV","submitted_at":"2026-05-11T10:51:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TaTok is a theoretically grounded adaptive tokenization method that uses global tokens and cumulative conditional entropy filtering to reduce redundancy while improving reconstruction quality over fixed-rate patch tokenization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10253","ref_index":41,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Knowledge Poisoning Attacks on Medical Multi-Modal Retrieval-Augmented Generation","primary_cat":"cs.CR","submitted_at":"2026-05-11T09:22:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"M³Att poisons medical multimodal RAG by pairing covert textual misinformation with query-agnostic visual perturbations that increase retrieval of the bad content, causing LLMs to generate clinically plausible but incorrect responses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10118","ref_index":21,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Plan in Sandbox, Navigate in Open Worlds: Learning Physics-Grounded Abstracted Experience for Embodied Navigation","primary_cat":"cs.RO","submitted_at":"2026-05-11T07:34:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SAGE trains agents in physics-grounded semantic abstractions via RL with asymmetric clipping, achieving 53.21% LLM-Match Success on A-EQA (+9.7% over baseline) and encouraging physical robot transfer.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"For augmented samples with positive advantage ( ˆAi,t >0 ), If ϵexp → ∞ is permitted, monotonic policy improvement cannot be guaranteed. Proof.When the advantage is positive, the objective seeks to increase ρi,t(θ)>1 . For Standard Samples (mi = 0), the upper bound is1 +ϵ std. For Augmented Samples (mi = 1), the upper bound is relaxed to1 +ϵ exp (whereϵ exp > ϵ std): LCLIP aug (θ) = min(ρi,t(θ) ˆAi,t,(1 +ϵ exp) ˆAi,t).(21) By relaxing the upper bound, we allow the policy to take larger gradient steps toward the retrieved trajectory. Consider the SAGE objective (Eq. 11) for an augmented sample with ˆAi,t >0. If we setϵ exp → ∞, the objective simplifies to: J(θ) =ρ i,t(θ) ˆAi,t −βD KL ,(22) 19 Plan in Sandbox, Navigate in Open Worlds: Learning Physics-Grounded Abstracted Experience for Embodied Navigation"},{"citing_arxiv_id":"2605.08574","ref_index":48,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Post-hoc Selective Classification for Reliable Synthetic Image Detection","primary_cat":"cs.CV","submitted_at":"2026-05-09T00:25:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReSIDe generalizes logit-based confidence scores to intermediate layers of synthetic image detectors and uses preference optimization to aggregate them, cutting area under the risk-coverage curve by up to 69.55% under covariate shifts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08421","ref_index":15,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Beyond Bag-of-Patches: Learning Global Layout via Textual Supervision for Late-Interaction Visual Document Retrieval","primary_cat":"cs.CV","submitted_at":"2026-05-08T19:28:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A text-supervised global layout embedding augments local patch representations in late-interaction VDR, yielding +2.4 nDCG@5 and +2.3 MAP@5 gains over ColPali/ColQwen baselines on ViDoRe-v2.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16355","ref_index":31,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Generative 3D Gaussians with Learned Density Control","primary_cat":"cs.GR","submitted_at":"2026-05-08T17:54:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeG models 3D Gaussians via learned octree density and uses VecSeq Sobol re-indexing to turn set generation into sequence modeling, claiming SOTA quality in single-image-to-3D.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07786","ref_index":10,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"APEX: Assumption-free Projection-based Embedding eXamination Metric for Image Quality Assessment","primary_cat":"cs.CV","submitted_at":"2026-05-08T14:21:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"APEX is an assumption-free image quality metric using Sliced Wasserstein Distance on CLIP and DINOv2 embeddings that claims superior robustness to degradations and cross-dataset stability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07381","ref_index":82,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Escaping the Diversity Trap in Robotic Manipulation via Anchor-Centric Adaptation","primary_cat":"cs.RO","submitted_at":"2026-05-08T07:35:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Anchor-Centric Adaptation escapes the diversity trap by prioritizing repeated demonstrations at core anchors over broad coverage, yielding higher success rates under fixed data budgets in robotic manipulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07140","ref_index":52,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Neurosymbolic Framework for Concept-Driven Logical Reasoning in Skeleton-Based Human Action Recognition","primary_cat":"cs.CV","submitted_at":"2026-05-08T02:20:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Neurosymbolic framework grounds skeleton motion in learnable pose and dynamics concepts then reasons over them with differentiable logic to recognize actions interpretably on NTU and NW-UCLA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06797","ref_index":2,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MIND: Monge Inception Distance for Generative Models Evaluation","primary_cat":"cs.LG","submitted_at":"2026-05-07T18:00:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MIND uses sliced Wasserstein distance on Inception features to evaluate generative models, matching FID performance with 10x fewer samples and 100x faster computation while being more robust to moment-matching attacks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05776","ref_index":95,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"HEDP: A Hybrid Energy-Distance Prompt-based Framework for Domain Incremental Learning","primary_cat":"cs.AI","submitted_at":"2026-05-07T07:09:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HEDP uses energy regularization inspired by Helmholtz free energy plus hybrid energy-distance weighting in prompts to improve domain selection and achieve a 2.57% accuracy gain on benchmarks like CORe50 while mitigating catastrophic forgetting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05756","ref_index":9,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MaMi-HOI: Harmonizing Global Kinematics and Local Geometry for Human-Object Interaction Generation","primary_cat":"cs.RO","submitted_at":"2026-05-07T06:52:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MaMi-HOI counters geometric forgetting in diffusion models via a Geometry-Aware Proximity Adapter for precise contacts and a Kinematic Harmony Adapter for natural whole-body postures in human-object interactions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05668","ref_index":2,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Large Vision-Language Models Get Lost in Attention","primary_cat":"cs.AI","submitted_at":"2026-05-07T04:45:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"In LVLMs, attention can be replaced by random Gaussian weights with little or no performance loss, indicating that current models get lost in attention rather than efficiently using visual context.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05646","ref_index":12,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MUSE: Resolving Manifold Misalignment in Visual Tokenization via Topological Orthogonality","primary_cat":"cs.CV","submitted_at":"2026-05-07T03:53:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MUSE decouples reconstruction and semantic learning in visual tokenization via topological orthogonality, yielding SOTA generation quality and improved semantic performance over its teacher model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05573","ref_index":14,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"AstroAlertBench: Evaluating the Accuracy, Reasoning, and Honesty of Multimodal LLMs in Astronomical Classification","primary_cat":"astro-ph.IM","submitted_at":"2026-05-07T01:36:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AstroAlertBench evaluates multimodal LLMs on astronomical classification accuracy, reasoning, and honesty using real ZTF alerts, revealing that high accuracy often diverges from self-assessed reasoning quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04777","ref_index":67,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Bridging Perception and Action: A Lightweight Multimodal Meta-Planner Framework for Robust Earth Observation Agents","primary_cat":"cs.MA","submitted_at":"2026-05-06T11:30:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The LMMP framework improves tool-calling accuracy and task success rates for Earth observation agents by grounding plans in multimodal features and remote sensing expert knowledge via a two-stage training process.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04653","ref_index":38,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Threshold-Guided Optimization for Visual Generative Models","primary_cat":"cs.LG","submitted_at":"2026-05-06T08:59:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A threshold-guided alignment method lets visual generative models be optimized directly from scalar human ratings instead of requiring paired preference data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04309","ref_index":28,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Interpreting V1 Population Activity via Image-Neural Latent Representation Alignment","primary_cat":"cs.NE","submitted_at":"2026-05-05T21:15:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DINA is a dual-tower contrastive model that aligns images with mouse V1 neural activity to enable decoding and shows that low-level visual structure, not semantics or fine details, primarily supports the alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}