{"total":158,"items":[{"citing_arxiv_id":"2605.13734","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"KVServe: Service-Aware KV Cache Compression for Communication-Efficient Disaggregated LLM Serving","primary_cat":"cs.DC","submitted_at":"2026-05-13T16:12:33+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"KVServe delivers up to 9.13x job completion time speedup and 32.8x time-to-first-token reduction by making KV cache compression service-aware and adaptive in disaggregated LLM serving.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13537","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Temper and Tilt Lead to SLOP: Reward Hacking Mitigation with Inference-Time Alignment","primary_cat":"cs.LG","submitted_at":"2026-05-13T13:47:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Temperature adjustment on the reference model generalizes inference-time alignment to SLOP ensembles of reward models, with a calibration algorithm that improves robustness to reward hacking while preserving alignment performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13429","ref_index":82,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TokAlign++: Advancing Vocabulary Adaptation via Better Token Alignment","primary_cat":"cs.CL","submitted_at":"2026-05-13T12:23:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TokAlign++ learns token alignments between LLM vocabularies from monolingual representations to enable faster adaptation, better text compression, and effective token-level distillation across 15 languages with minimal steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12968","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Controlling Logical Collapse in LLMs via Algebraic Ontology Projection over F2","primary_cat":"cs.LG","submitted_at":"2026-05-13T04:01:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Projecting LLM hidden states onto F2 algebra with 42 pairs yields 93% zero-shot accuracy on logical relations and identifies prompt-preventable late-layer collapse.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12667","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ODRPO: Ordinal Decompositions of Discrete Rewards for Robust Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-12T19:17:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ODRPO decomposes discrete rewards into ordinal binary indicators to compute independent advantages and reduce noise corruption in RLAIF policy optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12335","ref_index":65,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EHR-RAGp: Retrieval-Augmented 
Prototype-Guided Foundation Model for Electronic Health Records","primary_cat":"cs.IR","submitted_at":"2026-05-12T16:17:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EHR-RAGp is a retrieval-augmented EHR foundation model that employs prototype-guided retrieval to dynamically integrate relevant historical patient context, outperforming prior models on clinical prediction tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12332","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Automated Air Traffic Safety Assessment Around Non-Towered Airports Using Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-12T16:15:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Large language models achieve macro F1 scores above 0.85 on binary nominal-versus-danger classification from CTAF radio transcripts and METAR weather data using a new synthetic dataset with a 12-category hazard taxonomy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11906","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"YFPO: A Preliminary Study of Yoked Feature Preference Optimization with Neuron-Guided Rewards for Mathematical Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-12T10:18:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"YFPO augments standard preference optimization with neuron-level activation margins from math-related features to improve LLM reasoning on math tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11775","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Entropy Polarity in Reinforcement Fine-Tuning: Direction, Asymmetry, and Control","primary_cat":"cs.LG","submitted_at":"2026-05-12T08:47:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Entropy polarity from a first-order entropy change approximation enables Polarity-Aware Policy Optimization (PAPO) that preserves complementary polarity branches and outperforms baselines on math and agentic RL fine-tuning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11774","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Token to Token Pair: Efficient Prompt Compression for Large Language Models in Clinical Prediction","primary_cat":"cs.CL","submitted_at":"2026-05-12T08:46:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MedTPE compresses EHR token sequences by up to 31% via merging common medical token pairs, reducing LLM inference latency 34-63% while maintaining or improving performance on mortality and phenotyping tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11712","ref_index":76,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Toward Stable Value Alignment: Introducing Independent Modules for 
Consistent Value Guidance","primary_cat":"cs.AI","submitted_at":"2026-05-12T08:02:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SVGT adds independent value modules and Bridge Tokens to LLMs to maintain consistent value guidance, cutting harmful outputs by over 70% in tests while preserving fluency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11453","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Predictive Maps of Multi-Agent Reasoning: A Successor-Representation Spectrum for LLM Communication Topologies","primary_cat":"cs.MA","submitted_at":"2026-05-12T03:11:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Spectral features of the successor representation matrix for multi-agent LLM communication topologies predict robustness to perturbations, consensus formation, and error accumulation, with an extension to account for bias drift.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11416","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Freeze Deep, Train Shallow: Interpretable Layer Allocation for Continued Pre-Training","primary_cat":"cs.CL","submitted_at":"2026-05-12T02:11:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Freezing deep layers and training shallow layers during continued pre-training of LLMs outperforms full fine-tuning and the opposite allocation on C-Eval and CMMLU, guided by a new layer-sensitivity diagnostic.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11303","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Predicting Psychological Well-Being from Spontaneous Speech using LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-11T22:46:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLMs achieve Spearman correlations up to 0.8 for zero-shot Ryff PWB prediction from spontaneous speech, with added statistical and linguistic explainability analyses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10817","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CLEF: EEG Foundation Model for Learning Clinical Semantics","primary_cat":"cs.AI","submitted_at":"2026-05-11T16:34:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CLEF, a long-context EEG foundation model using 3D multitaper spectrograms and contrastive alignment with reports and EHR, beats prior models on 229 of 234 clinical tasks and raises mean AUROC from 0.65 to 0.74.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10639","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Navigating the Sea of LLM Evaluation: Investigating Bias in Toxicity 
Benchmarks","primary_cat":"cs.AI","submitted_at":"2026-05-11T14:27:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Toxicity benchmarks for LLMs produce inconsistent results when task type, input domain, or model changes, revealing intrinsic evaluation biases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10499","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Privacy-preserving Chunk Scheduling in a BitTorrent Implementation of Federated Learning","primary_cat":"cs.DC","submitted_at":"2026-05-11T12:57:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FLTorrent achieves within-round source unlinkability in decentralized federated learning via a BitTorrent warm-up with pre-round obfuscation, randomized lags, and coordination-only non-owner-first scheduling, reaching 92% of bandwidth-optimal efficiency and near-random attribution success.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10050","ref_index":58,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EchoPrune: Interpreting Redundancy as Temporal Echoes for Efficient VideoLLMs","primary_cat":"cs.CV","submitted_at":"2026-05-11T06:23:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EchoPrune prunes video tokens via query relevance and temporal reconstruction error to let VideoLLMs handle up to 20x more frames under fixed budget with reported gains in accuracy and speed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10043","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Personalizing LLMs with Binary Feedback: A Preference-Corrected Optimization Framework","primary_cat":"cs.CL","submitted_at":"2026-05-11T06:12:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"C-BPO personalizes LLMs via preference-calibrated binary signals and PU learning theory to isolate inter-user differences from shared task knowledge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09997","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GraphInstruct: A Progressive Benchmark for Diagnosing Capability Gaps in LLM Graph Generation","primary_cat":"cs.SI","submitted_at":"2026-05-11T05:19:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GraphInstruct is a progressive benchmark with six complexity levels for LLM graph generation that identifies multi-constraint composition as the hardest point and shows a verification-guided iterative framework outperforms standard prompting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09463","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Position Bias: Shifting Context Compression from Position-Driven to 
Semantic-Driven","primary_cat":"cs.CL","submitted_at":"2026-05-10T10:27:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SeCo performs semantic-driven context compression for LLMs by anchoring on query-relevant semantic centers and applying consistency-weighted token merging, yielding better downstream performance, lower latency, and stronger out-of-domain robustness than position-based methods across 14 benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09042","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Evaluating Pragmatic Reasoning in Large Language Models: Evidence from Scalar Diversity","primary_cat":"cs.CL","submitted_at":"2026-05-09T16:28:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Pragmatic reasoning in LLMs varies substantially by evaluation method and model family, with scalar diversity patterns appearing only in certain conditions rather than reflecting stable competence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09005","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Backdoor-Based Ownership Verification for Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-05-09T15:44:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GuardVLA embeds a stealthy backdoor watermark in VLAs via secret messages in visual data and uses a swap-and-detect mechanism for post-release ownership verification that preserves task performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08423","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Queryable LoRA: Instruction-Regularized Routing Over Shared Low-Rank Update Atoms","primary_cat":"cs.LG","submitted_at":"2026-05-08T19:32:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Queryable LoRA adds dynamic routing over shared low-rank atoms with attention and language-instruction regularization to make parameter-efficient fine-tuning more adaptive across inputs and layers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10971","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Steering Without Breaking: Mechanistically Informed Interventions for Discrete Diffusion Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-08T18:52:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Adaptive scheduling of interventions in discrete diffusion language models, timed to attribute-specific commitment schedules discovered with sparse autoencoders, delivers precise multi-attribute steering up to 93% strength while preserving generation quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07977","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Play Enhancement via 
Advantage-Weighted Refinement in Online Federated LLM Fine-Tuning with Real-Time Feedback","primary_cat":"cs.LG","submitted_at":"2026-05-08T16:35:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SPEAR enables online federated LLM fine-tuning by using feedback-guided self-play to create contrastive pairs trained with maximum likelihood on correct completions and confidence-weighted unlikelihood on incorrect ones, outperforming baselines without ground-truth contexts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07883","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond \"I cannot fulfill this request\": Alleviating Rigid Rejection in LLMs via Label Enhancement","primary_cat":"cs.CL","submitted_at":"2026-05-08T15:33:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LANCE applies variational inference for label enhancement across multiple rejection categories, supplying gradients to a refinement model that produces safe, non-rigid responses from LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07783","ref_index":97,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Chain-based Distillation for Effective Initialization of Variable-Sized Small Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-08T14:21:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Chain-based Distillation constructs a sequence of anchor models to enable efficient initialization of variable-sized SLMs through interpolation, with bridge distillation for cross-architecture transfer, yielding better performance than scratch training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07443","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RcLLM: Accelerating Generative Recommendation via Beyond-Prefix KV Caching","primary_cat":"cs.DC","submitted_at":"2026-05-08T08:47:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RcLLM accelerates generative recommendation inference by 1.31x-9.51x in TTFT through beyond-prefix KV caching, replicated user caches, sharded item caches, affinity scheduling, and selective attention with negligible accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07403","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Boosting Automatic Java-to-Cangjie Translation with Multi-Stage LLM Training and Error Repair","primary_cat":"cs.SE","submitted_at":"2026-05-08T07:58:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Multi-stage LLM training plus compiler-guided error repair boosts functional equivalence in Java-to-Cangjie translation by 6.06% over prior methods despite scarce parallel 
data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07342","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mage: Multi-Axis Evaluation of LLM-Generated Executable Game Scenes Beyond Compile-Pass Rate","primary_cat":"cs.LG","submitted_at":"2026-05-08T06:46:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mage shows compile-pass rate is anti-correlated with functional correctness in LLM game scene generation; direct NL-to-C# yields 43% runtime but F1~0.12 structure, while IR conditioning recovers structure (F1 up to 1.0) but halves runtime, with granularity levels statistically equivalent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07338","ref_index":98,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ShellfishNet: A Domain-Specific Benchmark for Visual Recognition of Marine Molluscs","primary_cat":"cs.CV","submitted_at":"2026-05-08T06:42:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ShellfishNet is a new benchmark of 8,691 images across 32 mollusc taxa for evaluating vision models on real-world underwater ecological monitoring tasks including robustness to degradation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07331","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Importance Sampling in LLM Policy Optimization: A Cumulative Token Perspective","primary_cat":"cs.LG","submitted_at":"2026-05-08T06:35:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The cumulative token IS ratio gives unbiased prefix correction and lower variance than full-sequence ratios for token-level gradients in LLM policy optimization, enabling CTPO to outperform GRPO and GSPO baselines on mathematical reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07271","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Understanding Performance Collapse in Layer-Pruned Large Language Models via Decision Representation Transitions","primary_cat":"cs.CL","submitted_at":"2026-05-08T05:35:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Performance collapse in layer-pruned LLMs stems from disrupting the Silent Phase of decision-making, which blocks the transition to correct predictions, while the later Decisive Phase is robust to pruning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07193","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Coupling Models for One-Step Discrete Generation","primary_cat":"cs.LG","submitted_at":"2026-05-08T03:40:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Coupling Models enable single-step discrete sequence generation via learned couplings to Gaussian latents and outperform prior one-step baselines on text perplexity, biological FBD, and image FID 
metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06523","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Implicit Reward Overfitting and the Low-rank Dynamics in RLVR","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:30:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RLVR exhibits implicit reward overfitting to training data and optimizes heavy-tailed singular spectra with rank-1 focus on reasoning capability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06374","ref_index":53,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ResiHP: Taming LLM Training Failures with Dynamic Hybrid Parallelism","primary_cat":"cs.DC","submitted_at":"2026-05-07T14:52:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ResiHP introduces a workload-aware failure detector and dynamic scheduler for hybrid-parallel LLM training that achieves 1.04-4.39x higher throughput than prior resilient systems under failures on a 256-GPU cluster.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06230","ref_index":70,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Safactory: A Scalable Agentic Infrastructure for Training Trustworthy Autonomous Intelligence","primary_cat":"cs.AI","submitted_at":"2026-05-07T13:21:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Safactory integrates three platforms for simulation, data management, and agent evolution to create a unified pipeline for training trustworthy autonomous AI.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06733","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Factor Aggregation: Gauge-Aware Low-Rank Server Representations for Federated LoRA","primary_cat":"cs.LG","submitted_at":"2026-05-07T12:57:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GLoRA replaces raw factor averaging with gauge-aware aggregation in a consensus subspace estimated from client projectors, enabling consistent low-rank federated LoRA under heterogeneity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06078","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Milestone-Guided Policy Learning for Long-Horizon Language Agents","primary_cat":"cs.CL","submitted_at":"2026-05-07T12:00:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BEACON uses milestone partitioning, temporal reward shaping, and dual-scale advantage estimation to nearly double success rates on long-horizon ALFWorld tasks while raising effective sample use from 23.7% to 82%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05893","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Logic-Regularized Verifier Elicits 
Reasoning from LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-07T09:03:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LOVER creates an unsupervised logic-regularized verifier that reaches 95% of supervised verifier performance on reasoning tasks across 10 datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05699","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Quantization Is Free: An int4 KV Cache That Outruns fp16 on Apple Silicon","primary_cat":"cs.PF","submitted_at":"2026-05-07T05:44:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A single fused int4 KV cache kernel on Apple Silicon outperforms fp16 in latency with 3x memory compression and near-zero quality loss on tested models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04992","ref_index":141,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"You Snooze, You Lose: Automatic Safety Alignment Restoration through Neural Weight Translation","primary_cat":"cs.CR","submitted_at":"2026-05-06T14:52:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NeWTral is a non-linear weight translation framework using MoE routing that reduces average attack success rate from 70% to 13% on unsafe domain adapters across Llama, Mistral, Qwen, and Gemma models up to 72B while retaining 90% knowledge fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04913","ref_index":28,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Local Learning: A Cheaper and Faster Recipe for LLM Post-Training","primary_cat":"cs.CL","submitted_at":"2026-05-06T13:41:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LoPT achieves competitive task performance in LLM post-training by limiting task gradients to the upper model half and training the lower half with local feature reconstruction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05267","ref_index":140,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bridging Generation and Training: A Systematic Review of Quality Issues in LLMs for Code","primary_cat":"cs.SE","submitted_at":"2026-05-06T09:38:31+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A review of 114 studies creates taxonomies for code and data quality issues, formalizes 18 propagation mechanisms from training data defects to LLM-generated code defects, and synthesizes detection and mitigation techniques.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Survey[51], Uncovering Pretraining Code in LLMs[64] Robustness Less Is More[146], MultiCodeIF[25] Maintainability Less Is More[146], Quality In, Quality Out[45], SwallowCode[28], Seed-Coder[110], Synthetic Data Generation[86], Cracks in The Stack[48], DataRecipe[55], MultiCodeIF[25] UnderstandabilityLess Is More[146], Seed-Coder[110], Benchmark 
Builders[140], Synthetic Data Generation[86], MG-Verilog[150], DataRecipe[55], MultiCodeIF[25] Efficiency Quality In, Quality Out[45], SwallowCode[28], MultiCodeIF[25] Non-Code Attribute Quality Issues Compliance and Security Risks (Textual) DESEC[89], CodeMI[124], CodeCipher[68], Datasets for Large Language Models[75], AiXcoder-7B[52] Distribution Imbalance Issues"},{"citing_arxiv_id":"2605.04495","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CAR: Query-Guided Confidence-Aware Reranking for Retrieval-Augmented Generation","primary_cat":"cs.CL","submitted_at":"2026-05-06T04:51:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAR reranks documents in RAG by promoting those that increase generator confidence (via answer consistency sampling) and demoting those that decrease it, yielding NDCG@5 gains on BEIR datasets that correlate with F1 improvements.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03927","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StateVLM: A State-Aware Vision-Language Model for Robotic Affordance Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-05T16:19:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StateVLM uses an Auxiliary Regression Loss on box decoder outputs to boost VLMs' accuracy on object and state localization for robotic affordance reasoning, with gains of 1.6% on RefCOCO variants and 5.2% on the new OSAR benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03562","ref_index":2,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HeadQ: Model-Visible Distortion and Score-Space Correction for KV-Cache Quantization","primary_cat":"cs.LG","submitted_at":"2026-05-05T09:34:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HeadQ reduces 84-94% of excess perplexity in 2-bit key quantization by adding low-rank logit corrections in a calibration-learned query basis, with further gains from an A^2-weighted value policy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08175","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"KARMA-MV: A Benchmark for Causal Question Answering on Music Videos","primary_cat":"cs.CV","submitted_at":"2026-05-05T06:48:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"KARMA-MV is a new benchmark showing that causal knowledge graphs improve VLMs on causal audio-visual reasoning in music videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02363","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When Correct Isn't Usable: Improving Structured Output Reliability in Small Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-04T09:07:44+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AloLab, an iterative meta-agent prompt optimizer, raises structured output accuracy for 7-9B 
models from 0% to 84-87% on GSM8K while preserving near-native inference speed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}
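A minimal Python sketch for consuming one page of this payload, assuming it has been saved verbatim as `response.json` (the filename is illustrative; the field names are taken directly from the payload above):

```python
import json

# Load one page of the citation-report payload (filename is an assumption).
with open("response.json") as f:
    page = json.load(f)

# The payload is paginated: "items" holds up to "limit" entries starting at
# "offset", out of "total" matching citing papers overall.
print(f"showing {len(page['items'])} of {page['total']} citing papers "
      f"(offset {page['offset']}, limit {page['limit']})")

# Surface the few entries that carry an explicit verdict; most are UNVERDICTED.
for item in page["items"]:
    if item["verdict"] != "UNVERDICTED":
        print(f"{item['verdict']:<12} {item['citing_arxiv_id']}  {item['paper_title']}")
```

Since `total` is 158 and this page returns 50 items at offset 0, a client would presumably re-issue the query with `offset` advanced by `limit` to fetch the remaining 108 matches; the query interface itself is not shown here.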