{"total":38,"items":[{"citing_arxiv_id":"2605.11563","ref_index":70,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TCP-SSM: Efficient Vision State Space Models with Token-Conditioned Poles","primary_cat":"cs.CV","submitted_at":"2026-05-12T05:49:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TCP-SSM conditions stable poles on visual tokens to explicitly control memory decay and oscillation in SSMs, cutting computation up to 44% while matching or exceeding accuracy on classification, segmentation, and detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11526","ref_index":76,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Efficient and provably convergent end-to-end training of deep neural networks with linear constraints","primary_cat":"math.OC","submitted_at":"2026-05-12T04:51:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"An efficiently computable HS-Jacobian acts as a conservative mapping for projections onto polyhedral sets, supporting provably convergent Adam-based end-to-end training of linearly constrained deep neural networks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11383","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HamBR: Active Decision Boundary Restoration Based on Hamiltonian Dynamics for Learning with Noisy Labels","primary_cat":"cs.CV","submitted_at":"2026-05-12T01:14:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HamBR uses Spherical HMC to probe ambiguous regions and synthesize virtual outliers with energy-based repulsion to restore decision boundaries degraded by noisy labels, achieving SOTA on CIFAR and real-world benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11231","ref_index":56,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LiBaGS: Lightweight Boundary Gap Synthesis for Targeted Synthetic Data Selection","primary_cat":"cs.LG","submitted_at":"2026-05-11T20:46:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LiBaGS scores and selects synthetic data near decision boundaries using proximity, uncertainty, density, and validity, with boundary-gap allocation and marginal stopping to improve training accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08839","ref_index":73,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Cross-Sample Relational Fusion: Unifying Domain Generalization and Class-Incremental Learning","primary_cat":"cs.CV","submitted_at":"2026-05-09T09:49:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CORF unifies domain generalization and class-incremental learning via selective sample refinement with spatial maps and confidence weighting plus cascaded relational distillation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08663","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CAST: Channel-Aware Spatial Transfer Learning with Pseudo-Image Radar for Sign Language Recognition","primary_cat":"cs.CV","submitted_at":"2026-05-09T04:02:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CAST achieves 80.5% Top-1 accuracy on radar-only sign language recognition by fusing physics-aware CVD and RTM representations through channel-aware spatial attention and asymmetric cross-attention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08448","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LLM-guided Semi-Supervised Approaches for Social Media Crisis Data Classification","primary_cat":"cs.AI","submitted_at":"2026-05-08T20:15:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LG-CoTrain, an LLM-guided co-training method, outperforms classical semi-supervised baselines for crisis tweet classification in low-resource settings with 5-25 labeled examples per class.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07816","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ICDAR 2026 Competition on Writer Identification and Pen Classification from Hand-Drawn Circles","primary_cat":"cs.CV","submitted_at":"2026-05-08T14:48:57+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A new dataset of hand-drawn circles from 66 writers and 8 pens yields competition results of 64.8% top-1 accuracy for open-set writer identification and 92.7% for pen classification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06809","ref_index":61,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LookWhen? Fast Video Recognition by Learning When, Where, and What to Compute","primary_cat":"cs.CV","submitted_at":"2026-05-07T18:08:31+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LookWhen factorizes video recognition into learning when, where, and what to compute via uniqueness-based token selection and dual-teacher distillation, achieving better accuracy-FLOPs trade-offs than baselines on multiple datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06522","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Agentic AIs Are the Missing Paradigm for Out-of-Distribution Generalization in Foundation Models","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:29:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Agentic AI systems are required to overcome the parameter coverage ceiling that prevents foundation models from handling certain out-of-distribution cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06043","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Domain Generalization through Spatial Relation Induction over Visual Primitives","primary_cat":"cs.CV","submitted_at":"2026-05-07T11:31:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PARSE improves domain generalization accuracy by factoring recognition into visual primitives and their spatial relational compositions learned end-to-end with differentiable predicates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05627","ref_index":81,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Leveraging Image Generators to Address Training Data Scarcity: The Gen4Regen Dataset for Forest Regeneration Mapping","primary_cat":"cs.CV","submitted_at":"2026-05-07T03:28:56+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mixing real UAV imagery with 2101 AI-generated image-mask pairs improves semantic segmentation F1 scores for fine-grained forest species by over 15 percentage points overall and up to 30 points for rare classes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04445","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LEGO: LoRA-Enabled Generator-Oriented Framework for Synthetic Image Detection","primary_cat":"cs.CV","submitted_at":"2026-05-06T03:21:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LEGO uses multiple generator-specific LoRA modules modulated by an MLP and fused with attention to detect synthetic images, achieving better performance than prior methods while using under 10% of the training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02094","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SignMAE: Segmentation-Driven Self-Supervised Learning for Sign Language Recognition","primary_cat":"cs.CV","submitted_at":"2026-05-03T23:25:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SignMAE uses segmentation-driven masking in a mask-and-reconstruct self-supervised task to learn fine-grained sign representations, achieving state-of-the-art accuracy on WLASL, NMFs-CSL, and Slovo with fewer frames and modalities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27903","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HiMix: Hierarchical Artifact-aware Mixup for Generalized Synthetic Image Detection","primary_cat":"cs.CV","submitted_at":"2026-04-30T14:12:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HiMix combines mixup augmentation to create transitional real-fake samples with hierarchical global-local artifact feature fusion to achieve better generalization in detecting AI-generated images from unseen generators.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26301","ref_index":78,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Cheeger--Hodge Contrastive Learning for Structurally Robust Graph Representation Learning","primary_cat":"cs.LG","submitted_at":"2026-04-29T05:04:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CHCL aligns a Cheeger-Hodge joint signature across graph augmentations to produce embeddings that remain stable under local structural changes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21311","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"an interpretable vision transformer framework for automated brain tumor classification","primary_cat":"cs.CV","submitted_at":"2026-04-23T06:07:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Vision Transformer with CLAHE preprocessing, two-stage fine-tuning, MixUp/CutMix, EMA, TTA, and attention rollout achieves 99.29% accuracy and 99.25% macro F1 on four-class brain tumor MRI classification from 7023 scans.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21153","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Image-Based Malware Type Classification on MalNet-Image Tiny: Effects of Multi-Scale Fusion, Transfer Learning, Data Augmentation, and Schedule-Free Optimization","primary_cat":"cs.CR","submitted_at":"2026-04-22T23:45:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Pretraining plus Mixup/TrivialAugment and a feature pyramid network lift macro-F1 from 0.65 to 0.69 on 43-class malware image classification while cutting training epochs from 96 to 10.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06678","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Wasserstein GAN-based climate scenario generator for risk management and insurance: the case of soil subsidence","primary_cat":"cs.LG","submitted_at":"2026-04-22T08:30:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A conditional Wasserstein GAN generates plausible future SWI drought trajectories for French insurance risk management under climate change.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17914","ref_index":69,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond Binary Contrast: Modeling Continuous Skeleton Action Spaces with Transitional Anchors","primary_cat":"cs.CV","submitted_at":"2026-04-20T07:47:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TranCLR models continuous skeleton action spaces with transitional anchors and multi-level manifold calibration, yielding smoother and more accurate representations than binary contrastive methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17219","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PAC-Bayes Bounds for Gibbs Posteriors via Singular Learning Theory","primary_cat":"stat.ML","submitted_at":"2026-04-19T03:00:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PAC-Bayes bounds for Gibbs posteriors are obtained via singular learning theory, producing explicit and tighter posterior-averaged risk bounds that adapt to data structure in overparameterized models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12941","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Direct Discrepancy Replay: Distribution-Discrepancy Condensation and Manifold-Consistent Replay for Continual Face Forgery Detection","primary_cat":"cs.CV","submitted_at":"2026-04-14T16:35:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A replay method for continual face forgery detection condenses real-fake distribution discrepancies into compact maps and synthesizes compatible samples from current real faces to reduce forgetting under tight memory budgets without storing historical images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10754","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Human Gaze-based Dual Teacher Guidance Learning for Semi-Supervised Medical Image Segmentation","primary_cat":"eess.IV","submitted_at":"2026-04-12T17:51:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HG-DTGL integrates human gaze as an extra teacher in mean-teacher learning via GazeMix, MGP module and Gaze Loss, reporting superior segmentation across ten organs on multiple modalities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10707","ref_index":79,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Investigating Bias and Fairness in Appearance-based Gaze Estimation","primary_cat":"cs.CV","submitted_at":"2026-04-12T16:04:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"First large-scale fairness audit of gaze estimators reveals sizable accuracy disparities by ethnicity and gender, with existing mitigation methods providing only marginal fairness gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07962","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Is your algorithm unlearning or untraining?","primary_cat":"cs.LG","submitted_at":"2026-04-09T08:24:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Machine unlearning conflates reversing the influence of specific training examples (untraining) with removing the full underlying distribution or behavior (unlearning).","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07763","ref_index":64,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond Surface Artifacts: Capturing Shared Latent Forgery Knowledge Across Modalities","primary_cat":"cs.CV","submitted_at":"2026-04-09T03:35:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces MAF framework and DeepModal-Bench to capture universal cross-modal forgery traces for better generalization in multimodal deepfake detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05077","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Feature-Aware Anisotropic Local Differential Privacy for Utility-Preserving Graph Representation Learning in Metal Additive Manufacturing","primary_cat":"cs.LG","submitted_at":"2026-04-06T18:29:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FI-LDP-HGAT applies feature-importance-aware anisotropic local differential privacy to a hierarchical graph attention network, recovering 81.5% utility at epsilon=4 and 0.762 defect recall at epsilon=2 on a DED porosity dataset while outperforming standard LDP and DP-SGD baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04012","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OASIC: Occlusion-Agnostic and Severity-Informed Classification","primary_cat":"cs.CV","submitted_at":"2026-04-05T08:02:29+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OASIC uses anomaly-based masking and severity estimation to select occlusion-matched models, improving AUC on occluded images by up to 23.7 points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03993","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Can LLMs Learn to Reason Robustly under Noisy Supervision?","primary_cat":"cs.LG","submitted_at":"2026-04-05T06:30:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Online Label Refinement lets LLMs learn robust reasoning from noisy supervision by correcting labels when majority answers show rising rollout success and stable history, delivering 3-4% gains on math and reasoning benchmarks even at high noise levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03803","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"R\\'enyi Attention Entropy for Patch Pruning","primary_cat":"cs.CV","submitted_at":"2026-04-04T17:10:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Rényi entropy of attention maps serves as a tunable criterion for pruning redundant patches in vision transformers, reducing compute with preserved accuracy on image recognition.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03203","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PR3DICTR: A modular AI framework for medical 3D image-based detection and outcome prediction","primary_cat":"cs.CV","submitted_at":"2026-04-03T17:25:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"PR3DICTR is a new open-access modular framework for 3D medical image classification and outcome prediction that works with as little as two lines of code.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03110","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Aspect Knowledge Distillation for Language Model with Low-rank Factorization","primary_cat":"cs.CL","submitted_at":"2026-04-03T15:35:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MaKD distills pre-trained language models by deeply mimicking self-attention and feed-forward modules across aspects using low-rank factorization, matching strong baselines at the same parameter budget and extending to auto-regressive models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02564","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Invariance is Not Enough for Biomedical Domain Generalization and How to Fix It","primary_cat":"eess.IV","submitted_at":"2026-04-02T22:29:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MaskGen improves domain generalization for biomedical image segmentation by using source intensities plus domain-stable foundation model representations with minimal added complexity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.12524","ref_index":66,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"YOLOv12: Attention-Centric Real-Time Object Detectors","primary_cat":"cs.CV","submitted_at":"2025-02-18T04:20:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"YOLOv12 is a new attention-based real-time object detector that reports higher accuracy than YOLOv10, YOLOv11, and RT-DETR variants at comparable or better speed and efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.07815","ref_index":99,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Chronos: Learning the Language of Time Series","primary_cat":"cs.LG","submitted_at":"2024-03-12T16:53:54+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Chronos pretrains transformer models on tokenized time series to deliver strong zero-shot forecasting across diverse domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.08471","ref_index":87,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Revisiting Feature Prediction for Learning Visual Representations from Video","primary_cat":"cs.CV","submitted_at":"2024-02-15T18:59:11+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"V-JEPA models trained only on feature prediction from 2 million public videos achieve 81.9% on Kinetics-400, 72.2% on Something-Something-v2, and 77.9% on ImageNet-1K using frozen ViT-H/16 backbones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2006.07397","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The DeepFake Detection Challenge (DFDC) Dataset","primary_cat":"cs.CV","submitted_at":"2020-06-12T18:15:55+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The DFDC dataset is the largest public collection of face-swapped videos and supports detectors that generalize to in-the-wild deepfakes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2004.10934","ref_index":92,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"YOLOv4: Optimal Speed and Accuracy of Object Detection","primary_cat":"cs.CV","submitted_at":"2020-04-23T02:10:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"YOLOv4 achieves 43.5% AP (65.7% AP50) on MS COCO at ~65 FPS on Tesla V100 by integrating WRC, CSP, CmBN, SAT, Mish activation, Mosaic augmentation, DropBlock, and CIoU loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}