{"total":165,"items":[{"citing_arxiv_id":"2605.13638","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CO-MAP: A Reinforcement Learning Approach to the Qubit Allocation Problem","primary_cat":"quant-ph","submitted_at":"2026-05-13T15:04:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reinforcement learning policy for qubit mapping reduces SWAP overhead by 65-85% versus standard quantum compilers on MQTBench and Queko benchmark circuits.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12492","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pion: A Spectrum-Preserving Optimizer via Orthogonal Equivalence Transformation","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:59:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Pion is an optimizer that preserves the singular values of weight matrices in LLM training by applying orthogonal equivalence transformations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12391","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Trajectory-Agnostic Asteroid Detection in TESS with Deep Learning","primary_cat":"astro-ph.EP","submitted_at":"2026-05-12T16:56:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A W-Net deep learning model detects asteroids in TESS data independently of trajectory by rotating training image cubes and using adaptive normalization for data scaling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12365","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"QAP-Router: Tackling Qubit Routing as Dynamic Quadratic Assignment with Reinforcement Learning","primary_cat":"quant-ph","submitted_at":"2026-05-12T16:34:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QAP-Router models qubit routing as dynamic QAP and applies RL with a solution-aware Transformer to cut CNOT counts by 12-30% versus industry compilers on real circuit benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11131","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"USEMA: a Scalable Efficient Mamba Like Attention for Medical Image Segmentation","primary_cat":"cs.CV","submitted_at":"2026-05-11T18:40:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"USEMA is a hybrid UNet architecture merging CNNs with scalable Mamba-like attention (SEMA) that achieves better efficiency than transformers and superior segmentation accuracy than pure CNN or Mamba models across medical imaging modalities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10933","ref_index":192,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DECO: Sparse Mixture-of-Experts with Dense-Comparable Performance on End-Side Devices","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:58:28+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DECO matches dense model performance at 20% expert activation via ReLU-based routing with learnable scaling and the NormSiLU activation, plus a 3x real-hardware speedup.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10905","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TLX: Hardware-Native, Evolvable MIMW GPU Compiler for Large-scale Production Environments","primary_cat":"cs.AR","submitted_at":"2026-05-11T17:46:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TLX is a Triton extension that exposes multi-warp, asynchronous, and cluster-level controls for modern GPUs, delivering competitive performance with low programmer effort and production deployment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10886","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LoKA: Low-precision Kernel Applications for Recommendation Models At Scale","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:32:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LoKA enables practical FP8 use in numerically sensitive large recommendation models via profiling, model adaptations, and runtime kernel orchestration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10640","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Understanding Continual Factual Knowledge Acquisition of Language Models: From Theory to Algorithm","primary_cat":"cs.CL","submitted_at":"2026-05-11T14:28:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Theoretical analysis of continual factual knowledge acquisition shows data replay stabilizes pretrained knowledge by shifting convergence dynamics while regularization only slows forgetting, leading to the STOC method for attention-based replay selection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10504","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning Less Is More: Premature Upper-Layer Attention Specialization Hurts Language Model Pretraining","primary_cat":"cs.CL","submitted_at":"2026-05-11T13:01:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Temporarily reducing the learning rate on upper-layer query and key projections during early GPT pretraining prevents premature attention specialization and improves model performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10484","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OpenSGA: Efficient 3D Scene Graph Alignment in the Open World","primary_cat":"cs.CV","submitted_at":"2026-05-11T12:44:18+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OpenSGA fuses vision-language, textual, and geometric features via a distance-gated attention encoder and minimum-cost-flow allocator to outperform prior methods on both frame-to-scan and subscan-to-subscan 3D scene graph alignment, backed by a new 700k-sample ScanNet-SG dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10260","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Meta-Black-Box Optimization Can Do Search Guidance for Expensive Constrained Multi-Objective Optimization","primary_cat":"cs.NE","submitted_at":"2026-05-11T09:25:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MetaSG-SAEA is a bi-level meta-BBO framework that uses a meta-policy for search guidance via the MM-CCI constraint abstraction and diffusion-based population initialization to outperform baselines on expensive constrained multi-objective optimization problems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09687","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Spatial-Frequency Gated Swin Transformer for Remote Sensing Single-Image Super-Resolution","primary_cat":"cs.CV","submitted_at":"2026-05-10T18:18:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SFG-SwinSR improves PSNR to 45.19 dB and SSIM to 0.9852 on SpaceNet by adding a depthwise-blur plus gated spatial branch inside each Swin2SR feed-forward network.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09204","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LBI: Parallel Scan Backpropagation via Latent Bounded Interfaces","primary_cat":"cs.LG","submitted_at":"2026-05-09T22:46:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LBI enables tractable parallel backpropagation by reducing inter-region adjoint computation to low-dimensional r x r Jacobians while preserving exact gradients under a bounded-interface model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09165","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sparse Layers are Critical to Scaling Looped Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-09T20:58:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Looped MoE models scale better than standard transformers because different experts activate on each loop pass, recovering expressivity without extra parameters, and support superior early exits.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09112","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Contextual Plackett-Luce: An Efficient Neural Model for Probabilistic Sequence Selection under Ambiguity","primary_cat":"cs.LG","submitted_at":"2026-05-09T18:52:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Contextual Plackett-Luce extends the classical Plackett-Luce model with context-dependent Ising parameterization to enable efficient parallel scoring followed by incremental autoregressive selection for ambiguous sequence tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08878","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Do Aligned LLMs Remain Jailbreakable: Refusal-Escape Directions, Operator-Level Sources, and Safety-Utility Trade-off","primary_cat":"cs.CR","submitted_at":"2026-05-09T10:57:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Aligned LLMs exhibit Refusal-Escape Directions (RED) that enable refusal-to-answer transitions via input perturbations; these directions decompose exactly into operator-level sources, creating an inherent safety-utility trade-off when trying to eliminate them.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07940","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Delta-Adapter: Scalable Exemplar-Based Image Editing with Single-Pair Supervision","primary_cat":"cs.CV","submitted_at":"2026-05-08T16:09:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Delta-Adapter extracts a semantic delta from a single image pair via a pre-trained vision encoder and injects it through a Perceiver adapter to enable scalable single-pair supervised editing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07779","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Neural network quantum states in the grand canonical ensemble","primary_cat":"quant-ph","submitted_at":"2026-05-08T14:18:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new neural quantum state ansatz for bosons in the grand canonical ensemble achieves competitive variational energies in 1D and 2D systems and provides access to one-body reduced density matrices.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07772","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Training-Induced Escape from Token Clustering in a Mean-Field Formulation of Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-08T14:12:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Training a mean-field Transformer under L2 regularization induces an escape from attention-driven token clustering in later layers after initial clustering.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07375","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"QuadNorm: Resolution-Robust Normalization for Neural Operators","primary_cat":"cs.LG","submitted_at":"2026-05-08T07:30:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QuadNorm uses quadrature-based moments instead of uniform averaging in normalization layers, achieving O(h²) consistency across resolutions and better cross-resolution transfer in neural operators.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07167","ref_index":142,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GPROF-IR: An Improved Single-Channel Infrared Precipitation Retrieval for Merged Satellite Precipitation Products","primary_cat":"physics.ao-ph","submitted_at":"2026-05-08T03:02:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GPROF-IR is a CNN-based retrieval that uses temporal context in geostationary IR observations to produce precipitation estimates with lower error than prior IR methods and climatological consistency with PMW retrievals for integration into IMERG V08.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07113","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Solving Max-Cut to Global Optimality via Feasibility-Preserving Graph Neural Networks","primary_cat":"cs.LG","submitted_at":"2026-05-08T01:41:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A Max-Cut-specific graph neural network predicts primal- and dual-feasible SDP solutions in linearithmic time, cutting bounding costs in exact branch-and-bound by up to 10.6 times versus a commercial SDP solver while training without any solved SDP labels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07097","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Every Feedforward Neural Network Definable in an o-Minimal Structure Has Finite Sample Complexity","primary_cat":"stat.ML","submitted_at":"2026-05-08T01:26:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Every fixed finite feedforward neural network definable in an o-minimal structure has finite sample complexity in the agnostic PAC setting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07063","ref_index":118,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dr. Post-Training: A Data Regularization Perspective on LLM Post-Training","primary_cat":"cs.LG","submitted_at":"2026-05-08T00:16:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dr. Post-Training reframes general data as a data-induced regularizer for LLM post-training updates, yielding a family of methods that outperform data-selection baselines on SFT, RLHF, and RLVR tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06611","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Structural Origin of Attention Sink: Variance Discrepancy, Super Neurons, and Dimension Disparity","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:28:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Attention sinks arise from variance discrepancy in self-attention value aggregation, amplified by super neurons and first-token dimension disparity, and can be mitigated by head-wise RMSNorm to accelerate pre-training convergence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06384","ref_index":38,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MinMax Recurrent Neural Cascades","primary_cat":"cs.LG","submitted_at":"2026-05-07T15:01:36+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"MinMax RNCs are recurrent neural models using min-max recurrence that achieve full regular-language expressivity, logarithmic parallel evaluation, uniformly bounded states, and constant state gradients independent of time distance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06104","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Autoregressive RTG: Conditioning via Injection Outside Sequential Modeling in Decision Transformer","primary_cat":"cs.LG","submitted_at":"2026-05-07T12:20:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Injecting RTG into states outside the autoregressive sequence yields shorter, more efficient Decision Transformers that outperform the original on offline RL tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06095","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Metonymy in vision models undermines attention-based interpretability","primary_cat":"cs.CV","submitted_at":"2026-05-07T12:14:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pretrained vision transformers exhibit strong intra-object leakage where each part representation encodes information from the entire object, undermining the faithfulness of attention-based part-centric interpretability methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06729","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The E$\\Delta$-MHC-Geo Transformer: Adaptive Geodesic Operations with Guaranteed Orthogonality","primary_cat":"cs.LG","submitted_at":"2026-05-07T11:37:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The EΔ-MHC-Geo Transformer achieves input-adaptive unconditionally orthogonal residual connections via a Cayley-based rotation that works for all parameters, combined with a learned hybrid gate for reflections.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05863","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SOPE: Stabilizing Off-Policy Evaluation for Online RL with Prior Data","primary_cat":"cs.LG","submitted_at":"2026-05-07T08:32:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SOPE uses an actor-aligned OPE signal on a held-out validation split to dynamically stop offline stabilization phases in online RL, improving performance up to 45.6% and cutting TFLOPs up to 22x on 25 Minari tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05294","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Competing nonlinearities, criticality, and order-to-chaos transition in deep networks","primary_cat":"cond-mat.dis-nn","submitted_at":"2026-05-06T18:00:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A statistical mixture of Tanh and Swish activations with critical mixing fraction p_c induces a continuous phase transition to scale-invariant signal propagation in deep networks while preserving smoothness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05113","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How Long Does Infinite Width Last? Signal Propagation in Long-Range Linear Recurrences","primary_cat":"cs.LG","submitted_at":"2026-05-06T16:44:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"In linear recurrent models, infinite-width signal propagation remains accurate only for depths t much smaller than sqrt(width n), with a critical regime at t ~ c sqrt(n) where finite-width effects emerge and dominate for larger t.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04952","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Adaptive Inverted-Index Routing for Granular Mixtures-of-Experts","primary_cat":"cs.LG","submitted_at":"2026-05-06T14:15:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AIR-MoE introduces a two-stage inverted-index routing method based on vector quantization that approximates optimal expert selection for granular MoE models at lower cost and with empirical performance gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04769","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Lightweight Cross-Spectral Face Recognition via Contrastive Alignment and Distillation","primary_cat":"cs.CV","submitted_at":"2026-05-06T11:16:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A lightweight hybrid CNN-Transformer framework for heterogeneous face recognition achieves competitive performance on cross-spectral benchmarks and standard RGB tasks using contrastive alignment and distillation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04744","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MixINN: Accelerating Plant Breeding by Combining Mixed Models and Deep Learning for Interaction Prediction","primary_cat":"cs.LG","submitted_at":"2026-05-06T10:48:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MixINN combines mixed models and deep learning to predict genotype-environment interactions in corn trials, yielding 5.8-7.2% higher average yields when selecting top-performing genotypes compared to standard methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04581","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GTF: Omnidirectional EPI Transformer for Light Field Super-Resolution","primary_cat":"cs.CV","submitted_at":"2026-05-06T07:31:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GTF is an omnidirectional EPI Transformer for light field super-resolution that models horizontal, vertical, 45-degree and 135-degree epipolar geometries, reaching 32.78 dB on benchmarks and top ranks in the NTIRE 2026 challenge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04418","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Demystifying Manifold Constraints in LLM Pre-training","primary_cat":"cs.LG","submitted_at":"2026-05-06T02:22:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Manifold constraints via the new MACRO optimizer independently bound activation scales and enforce rotational equilibrium in LLM pre-training, subsuming RMS normalization and decoupled weight decay while delivering competitive performance with convergence guarantees.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04368","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Extending Differential Temporal Difference Methods for Episodic Problems","primary_cat":"cs.LG","submitted_at":"2026-05-06T00:10:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A generalization of differential TD extends it to episodic settings while preserving policy ordering, inheriting linear TD guarantees, and improving sample efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04279","ref_index":2,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Gradient Flow Structure and Quantitative Dynamics of Multi-Head Self-Attention","primary_cat":"cs.LG","submitted_at":"2026-05-05T20:31:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Multi-head self-attention dynamics admit a non-decreasing energy functional under suitable score-matrix conditions, with closed-form clustering thresholds and monotonic entropy production in simplified regimes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03929","ref_index":43,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PHALAR: Phasors for Learned Musical Audio Representations","primary_cat":"cs.SD","submitted_at":"2026-05-05T16:19:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PHALAR achieves up to 70% relative accuracy gain in stem retrieval with under half the parameters and 7x faster training by using phasor-based equivariant representations, setting new SOTA on multiple datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03769","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Nora: Normalized Orthogonal Row Alignment for Scalable Matrix Optimizer","primary_cat":"cs.LG","submitted_at":"2026-05-05T14:00:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Nora is a matrix optimizer that stabilizes weight norms and angular velocities through row-wise momentum projection onto the orthogonal complement of the weights while approximating structured preconditioning with O(mn) complexity and proven scalability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03389","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Near-field Wideband Multi-User Localization using NFMR-Net","primary_cat":"eess.SP","submitted_at":"2026-05-05T05:50:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"NFMR-Net refines coarse LoS-based range and angle estimates from ZC-pilot channel matrices and 2D MUSIC to outperform standard 2D MUSIC in wideband near-field multi-user localization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08171","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Communication Dynamics Neural Networks: FFT-Diagonalized Layers for Improved Hessian Conditioning at Reduced Parameter Count","primary_cat":"cs.LG","submitted_at":"2026-05-04T23:43:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CDLinear layers achieve population Hessian condition number exactly 1 under pre-whitening, deliver 3.8x parameter reduction versus dense layers at 0.65% accuracy cost, and show 310x better empirical conditioning on an MLP.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03086","ref_index":72,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"iGENE: A Differentiable Flux-Tube Gyrokinetic Code in TensorFlow","primary_cat":"physics.plasm-ph","submitted_at":"2026-05-04T19:00:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A fully differentiable TensorFlow gyrokinetic code allows approximate gradients of nonlinear turbulence quantities to be used for outer-loop tasks such as profile prediction despite stochasticity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03058","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Neuron-Anchored Rule Extraction for Large Language Models via Contrastive Hierarchical Ablation","primary_cat":"cs.LG","submitted_at":"2026-05-04T18:27:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MechaRule localizes agonist neurons in LLMs via contrastive hierarchical ablation to ground rule extraction in circuitry, recalling 96.8% of high-effect neurons and reducing task performance when suppressed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02772","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Linearizing Vision Transformer with Test-Time Training","primary_cat":"cs.CV","submitted_at":"2026-05-04T16:16:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Using Test-Time Training's structural match to Softmax attention plus key normalization and locality modules allows inheriting pretrained weights and fine-tuning Stable Diffusion 3.5 in one hour to match quality while speeding inference 1.32-1.47x.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02657","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CARD: Coarse-to-fine Autoregressive Modeling with Radix-based Decomposition for Transferable Free Energy Estimation","primary_cat":"cs.LG","submitted_at":"2026-05-04T14:38:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CARD uses radix decomposition to enable autoregressive modeling of molecular coordinates as a zero-free-energy reference distribution, delivering classical accuracy for absolute free energy on unseen systems at ~40x speedup.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02098","ref_index":70,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Spherical to Gaussian: A Comparative Analysis of Point Cloud Cropping Strategies in Large-Scale 3D Environments","primary_cat":"cs.CV","submitted_at":"2026-05-03T23:36:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Gaussian and linear cropping strategies for large point clouds improve 3D neural network performance over spherical crops, especially in outdoor scenes, and achieve new state-of-the-art results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02075","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Graph Transformers and Stabilized Reinforcement Learning for Large-Scale Dynamic Routing Modulation and Spectrum Allocation in Elastic Optical Networks","primary_cat":"cs.NI","submitted_at":"2026-05-03T22:26:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Graph transformer RL for dynamic RMSA supports up to 13% more traffic than benchmarks on networks up to 143 nodes and 362 links.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}