{"total":116,"items":[{"citing_arxiv_id":"2605.13779","ref_index":17,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MinT: Managed Infrastructure for Training and Serving Millions of LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-13T16:59:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MinT enables efficient management of million-scale LoRA-adapted LLM policies over shared 1T-parameter base models by moving only small adapters through training and serving pipelines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13247","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EMO: Frustratingly Easy Progressive Training of Extendable MoE","primary_cat":"cs.LG","submitted_at":"2026-05-13T09:31:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"EMO progressively expands the expert pool in MoE models using scaling-law-derived token budgets per stage, matching fixed-expert performance while cutting wall-clock time and GPU cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13137","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LeanSearch v2: Global Premise Retrieval for Lean 4 Theorem Proving","primary_cat":"cs.IR","submitted_at":"2026-05-13T08:04:57+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LeanSearch v2 recovers 46.1% of ground-truth premise groups on research-level Mathlib theorems and raises fixed-loop proof success from 4% to 20% via embedding-reranker plus iterative sketch-retrieve-reflect retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13050","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Context Training with Active Information Seeking","primary_cat":"cs.CL","submitted_at":"2026-05-13T06:15:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Adding active search tools to LLM context optimization works only when combined with a multi-candidate search-based training procedure that prunes contexts, delivering gains across low-resource translation, health, and reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13037","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MAP: A Map-then-Act Paradigm for Long-Horizon Interactive Agent Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-13T05:46:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAP improves LLM agent reasoning by constructing a structured cognitive map of the environment before task execution, yielding performance gains on benchmarks like ARC-AGI-3 and superior training data via the new MAP-2K dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12500","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SenseNova-U1: Unifying Multimodal 
Understanding and Generation with NEO-unify Architecture","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:59:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SenseNova-U1 presents native unified multimodal models that match top understanding VLMs while delivering strong performance in image generation, infographics, and interleaved tasks via the NEO-unify architecture.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"C-eval: A multi-level multi-discipline chinese evaluation suite for foundation models. Advances in Neural Information Processing Systems, 36:62991-63010, 2023. [56] Aaron Hurst, Adam Lerer, Adam P Goucher, Adam Perelman, Aditya Ramesh, Aidan Clark, AJ Ostrow, Akila Welihinda, Alan Hayes, Alec Radford, et al. Gpt-4o system card. arXiv preprint arXiv:2410.21276, 2024. [57] Kimi Team. Kimi K2: Open agentic intelligence. arXiv preprint arXiv:2507.20534, 2025. [58] Diederik P Kingma and Max Welling. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114, 2013. [59] Kuaishou Kolors Team. Kolors 2.0, 2025. URL https://kolors.kuaishou.com/. [60] Black Forest Labs. Flux, 2024. URL https://github.com/black-forest-labs/flux."},{"citing_arxiv_id":"2605.12492","ref_index":74,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pion: A Spectrum-Preserving Optimizer via Orthogonal Equivalence Transformation","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:59:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Pion is an optimizer that preserves the singular values of weight matrices in LLM training by applying orthogonal equivalence transformations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11854","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Distilled Trajectory-Aware Boltzmann Modeling: Bridging the Training-Inference Discrepancy in Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-12T09:39:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TABOM models inference unmasking preferences as a Boltzmann distribution over predictive entropies and derives a ranking loss to align DLM training with observed trajectories, yielding gains in new domains and reduced catastrophic forgetting versus standard SFT.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11723","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CaC: Advancing Video Reward Models via Hierarchical Spatiotemporal Concentrating","primary_cat":"cs.CV","submitted_at":"2026-05-12T08:08:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CaC is a hierarchical spatiotemporal concentrating reward model for video anomalies that reports 25.7% accuracy gains on fine-grained benchmarks and 11.7% anomaly reduction in generated videos via a new dataset and GRPO training with temporal/spatial IoU 
rewards.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11505","ref_index":13,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Selective Off-Policy Reference Tuning with Plan Guidance","primary_cat":"cs.AI","submitted_at":"2026-05-12T04:25:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SORT turns all-wrong prompts into selective learning signals by weighting tokens more predictable under plan guidance from reference solutions, improving over GRPO on reasoning benchmarks especially for weaker models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11277","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sieve: Dynamic Expert-Aware PIM Acceleration for Evolving Mixture-of-Experts Models","primary_cat":"cs.AR","submitted_at":"2026-05-11T22:00:39+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Sieve dynamically schedules MoE experts across GPU and PIM hardware to handle bimodal token distributions, achieving 1.3x to 1.6x gains in throughput and interactivity over static prior PIM systems on three large models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10810","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Likelihood scoring for continuations of mathematical text: a self-supervised benchmark with tests for shortcut vulnerabilities","primary_cat":"cs.LG","submitted_at":"2026-05-11T16:32:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A new benchmark uses separate predictor and scorer LLMs to test whether forecast strings improve likelihood of hidden mathematical equation continuations, with controls that detect priming shortcuts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10787","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ComplexMCP: Evaluation of LLM Agents in Dynamic, Interdependent, and Large-Scale Tool Sandbox","primary_cat":"cs.AI","submitted_at":"2026-05-11T16:20:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ComplexMCP benchmark shows current LLM agents achieve at most 60% success on interdependent tool tasks versus 90% for humans, due to tool retrieval saturation, over-confidence, and strategic defeatism.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10468","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Can Muon Fine-tune Adam-Pretrained Models?","primary_cat":"cs.LG","submitted_at":"2026-05-11T12:34:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Constraining fine-tuning updates with LoRA mitigates performance degradation when switching from Adam to Muon on pretrained 
models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09808","ref_index":90,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Quantifying the Utility of User Simulators for Building Collaborative LLM Assistants","primary_cat":"cs.CL","submitted_at":"2026-05-10T23:06:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Fine-tuned simulators grounded in real human data produce LLM assistants that win more often against real users than those trained against role-playing simulators.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09614","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reflection Anchors for Propagation-Aware Visual Retention in Long-Chain Multimodal Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-10T15:53:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RAPO uses an information-theoretic lower bound on visual gain to select high-entropy reflection anchors and optimizes a chain-masked KL surrogate, delivering gains over baselines on reasoning benchmarks across LVLM backbones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09463","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Position Bias: Shifting Context Compression from Position-Driven to Semantic-Driven","primary_cat":"cs.CL","submitted_at":"2026-05-10T10:27:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SeCo performs semantic-driven context compression for LLMs by anchoring on query-relevant semantic centers and applying consistency-weighted token merging, yielding better downstream performance, lower latency, and stronger out-of-domain robustness than position-based methods across 14 benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09146","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Thinking: Imagining in 360$^\\circ$ for Humanoid Visual Search","primary_cat":"cs.CV","submitted_at":"2026-05-09T20:10:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Imagining in 360° decouples visual search into a single-step probabilistic semantic layout predictor and an actor, removing the need for multi-turn CoT reasoning and trajectory annotations while improving efficiency in 360° environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08678","ref_index":96,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MLS-Bench: A Holistic and Rigorous Assessment of AI Systems on Building Better AI","primary_cat":"cs.LG","submitted_at":"2026-05-09T04:29:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MLS-Bench shows that current AI agents fall short of reliably inventing generalizable ML methods, with engineering tuning easier than genuine 
invention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08666","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Cancellation Hypothesis in Critic-Free RL: From Outcome Rewards to Token Credits","primary_cat":"cs.LG","submitted_at":"2026-05-09T04:07:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The cancellation hypothesis shows how rollout-level rewards produce token-level credit assignment in critic-free RL through cancellation of opposing signals on shared tokens, with empirical support and batching interventions that enhance performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08639","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReLibra: Routing-Replay-Guided Load Balancing for MoE Training in Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-09T03:18:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"ReLibra uses pre-known token-to-expert routing from RL rollouts to perform inter-batch expert reordering and intra-batch replication, delivering up to 1.6x higher throughput than Megatron-LM and 1.2x over oracle-equipped EPLB while staying within 6-10% of an ideal balanced baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08560","ref_index":88,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ZAYA1-VL-8B Technical Report","primary_cat":"cs.CV","submitted_at":"2026-05-08T23:41:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ZAYA1-VL-8B is a new MoE vision-language model with vision-specific LoRA adapters and bidirectional image attention that reports competitive performance against several 3B-4B models on image, reasoning, and counting benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08498","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MathConstraint: Automated Generation of Verified Combinatorial Reasoning Instances for LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-08T21:28:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"MathConstraint generates scalable, automatically verifiable combinatorial problems where LLMs achieve 18.5-66.9% accuracy without tools but roughly double that with solver access.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08455","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CUDABeaver: Benchmarking LLM-Based Automated CUDA Debugging","primary_cat":"cs.LG","submitted_at":"2026-05-08T20:24:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CUDABeaver shows LLM CUDA debuggers often degenerate code for test-passing at the cost of speed, with protocol-aware metrics shifting success rates by up to 40 percentage 
points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08382","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SecureForge: Finding and Preventing Vulnerabilities in LLM-Generated Code via Prompt Optimization","primary_cat":"cs.CR","submitted_at":"2026-05-08T18:40:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SecureForge audits LLM code for vulnerabilities, builds a synthetic prompt corpus via Markovian sampling, and optimizes system prompts to cut security issues by up to 48% while preserving unit test performance, with zero-shot transfer to real prompts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07815","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OrScale: Orthogonalised Optimization with Layer-Wise Trust-Ratio Scaling","primary_cat":"cs.LG","submitted_at":"2026-05-08T14:47:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OrScale adds a Frobenius-norm trust-ratio layer-wise scaler to Muon’s orthogonalized updates, with per-layer calibration for language models, yielding higher CIFAR-10 accuracy and better language-model pre-training loss than Muon+Moonlight and AdamW.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08310","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WebTrap: Stealthy Mid-Task Hijacking of Browser Agents During Navigation","primary_cat":"cs.CR","submitted_at":"2026-05-08T14:06:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WebTrap uses multi-step instruction fusion and context-grounded generation to stealthily hijack browser agents mid-navigation while preserving original task success.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08283","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HTPO: Towards Exploration-Exploitation Balanced Policy Optimization via Hierarchical Token-level Objective Control","primary_cat":"cs.LG","submitted_at":"2026-05-08T07:38:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HTPO introduces hierarchical token-level objective control in RLVR to balance exploration and exploitation by grouping tokens according to difficulty, correctness, and entropy, yielding up to 8.6% gains on AIME benchmarks over DAPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07250","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hard to Read, Easy to Jailbreak: How Visual Degradation Bypasses MLLM Safety Alignment","primary_cat":"cs.CV","submitted_at":"2026-05-08T05:19:23+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Degraded image resolution in MLLMs bypasses safety alignments via cognitive overload, raising jailbreak rates across 
perturbations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06665","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniPool: A Globally Shared Expert Pool for Mixture-of-Experts","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A shared global expert pool in MoE improves validation loss over per-layer experts and allows sublinear expert-parameter growth with depth.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06615","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When and Why SignSGD Outperforms SGD: A Theoretical Study Based on $\\ell_1$-norm Lower Bounds","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:32:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"SignSGD provably beats SGD by a factor of d under sparse noise via matched ℓ1-norm upper and lower bounds, with an equivalent result for Muon on matrices, and this predicts faster GPT-2 pretraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06523","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Implicit Reward Overfitting and the Low-rank Dynamics in RLVR","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:30:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RLVR exhibits implicit reward overfitting to training data and optimizes heavy-tailed singular spectra with rank-1 focus on reasoning capability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06501","ref_index":76,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cubit: Token Mixer with Kernel Ridge Regression","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:18:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Cubit replaces Transformer attention with Kernel Ridge Regression token mixing and shows potential gains on longer sequences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08247","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLM Translation of Compiler Intermediate Representation","primary_cat":"cs.PL","submitted_at":"2026-05-07T13:22:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"IRIS-14B is the first LLM trained explicitly for GIMPLE-to-LLVM IR translation and outperforms much larger models by up to 44 percentage points on real-world C code.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06230","ref_index":45,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Safactory: A Scalable Agentic Infrastructure for Training Trustworthy Autonomous 
Intelligence","primary_cat":"cs.AI","submitted_at":"2026-05-07T13:21:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Safactory integrates three platforms for simulation, data management, and agent evolution to create a unified pipeline for training trustworthy autonomous AI.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05846","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LoopTrap: Termination Poisoning Attacks on LLM Agents","primary_cat":"cs.CR","submitted_at":"2026-05-07T08:21:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LoopTrap is an automated red-teaming framework that crafts termination-poisoning prompts to amplify LLM agent steps by 3.57x on average (up to 25x) across 8 agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05365","ref_index":136,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ZAYA1-8B Technical Report","primary_cat":"cs.AI","submitted_at":"2026-05-06T18:44:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ZAYA1-8B is a reasoning MoE model with 700M active parameters that matches larger models on math and coding benchmarks and reaches 91.9% on AIME'25 via Markovian RSA test-time compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05049","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Piper: Efficient Large-Scale MoE Training via Resource Modeling and Pipelined Hybrid Parallelism","primary_cat":"cs.DC","submitted_at":"2026-05-06T15:47:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Piper introduces resource modeling and pipelined hybrid parallelism for MoE training, delivering 2-3.5X higher MFU than prior frameworks and 1.2-9X better all-to-all bandwidth.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05267","ref_index":118,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bridging Generation and Training: A Systematic Review of Quality Issues in LLMs for Code","primary_cat":"cs.SE","submitted_at":"2026-05-06T09:38:31+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A review of 114 studies creates taxonomies for code and data quality issues, formalizes 18 propagation mechanisms from training data defects to LLM-generated code defects, and synthesizes detection and mitigation techniques.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"harmful, biased, or illegal content, which may lead to legal lawsuits, ethical disputes, or privacy breaches. These issues can be further categorized into specific types: Manuscript submitted to ACM 14 K. He, X. Zhang, P. Cai, M. Liu, Y. Wang, C. Wang, K. Huang, B. Chen, X. Peng, and Z. 
Zheng Generated Code Quality Issues Correctness LLMs Meet Library Evolution[125], Copilot Evaluation[88], HalluCode[70], CodeHalu[118], Mercury[22], SStuBs[50], Package Hallucinations[113], HallTrigger[103], The Counterfeit Conundrum[32], Bugs in LLM-Generated Code[115], GitHub Copilot, Amazon CodeWhisperer, ChatGPT[142], ChatGPT Code Quality[77], CloudAPIBench[49], CodeMirage[3], AutoAPIEval[136], From Effectiveness to Efficiency[53], Software Librarian[57], Codequal Analyzer[105], Artificial-Intelligence Generated Code Considered"},{"citing_arxiv_id":"2605.02821","ref_index":7,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Is the Same Model Not the Same Service? A Measurement Study of Hosted Open-Weight LLM APIs","primary_cat":"cs.PF","submitted_at":"2026-05-04T16:59:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Hosted open-weight LLM APIs function as time-varying heterogeneous services rather than fixed model artifacts, with demand concentrated, supply-use mismatches, and task-specific routing yielding major cost and throughput gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02572","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"On Training Large Language Models for Long-Horizon Tasks: An Empirical Study of Horizon Length","primary_cat":"cs.AI","submitted_at":"2026-05-04T13:25:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Longer action horizons bottleneck LLM agent training through instability, but training with reduced horizons stabilizes learning and enables better generalization to longer horizons.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01761","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"TrajShield: Trajectory-Level Safety Mediation for Defending Text-to-Video Models Against Jailbreak Attacks","primary_cat":"cs.CV","submitted_at":"2026-05-03T07:49:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TrajShield is a training-free defense that reduces jailbreak success rates by 52.44% on average in text-to-video models by localizing and neutralizing risks through trajectory simulation and causal intervention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01333","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OralMLLM-Bench: Evaluating Cognitive Capabilities of Multimodal Large Language Models in Dental Practice","primary_cat":"cs.CL","submitted_at":"2026-05-02T09:08:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OralMLLM-Bench reveals performance gaps between multimodal large language models and clinicians on cognitive tasks for dental radiographic analysis across periapical, panoramic, and cephalometric images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00342","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Making Every Verified Token Count: Adaptive Verification for MoE 
Speculative Decoding","primary_cat":"cs.CL","submitted_at":"2026-05-01T01:52:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EVICT adaptively truncates draft trees in MoE speculative decoding by combining drafter signals with profiled costs to retain only cost-effective prefixes, delivering up to 2.35x speedup over autoregressive decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00254","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Network Topologies for Cost-Effective Mixture-of-Experts LLM Serving","primary_cat":"cs.NI","submitted_at":"2026-04-30T21:35:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Switchless topologies such as 3D full-mesh are 20.6-56.2% more cost-effective than scale-up networks for MoE LLM serving, with current link bandwidths over-provisioned by up to 27%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27928","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Training-Free Tunnel Defect Inspection and Engineering Interpretation via Visual Recalibration and Entity Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-04-30T14:31:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TunnelMIND recalibrates language-guided defect proposals via dense visual consistency and reconstructs them into structured defect entities with attributes for severity grading and retrieval-grounded engineering reports, reporting F1 scores of 0.68, 0.78, and 0.72 on visible, GPR, and road defect tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26103","ref_index":56,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AMMA: A Multi-Chiplet Memory-Centric Architecture for Low-Latency 1M Context Attention Serving","primary_cat":"cs.AR","submitted_at":"2026-04-28T20:36:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AMMA is a memory-centric multi-chiplet architecture using HBM-PNM cubes, custom logic dies, hybrid parallelism, and reordered collectives that delivers 15.5X lower attention latency and 6.9X lower energy than NVIDIA H100 for 1M context serving.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25847","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Soliloquy to Agora: Memory-Enhanced LLM Agents with Decentralized Debate for Optimization Modeling","primary_cat":"math.OC","submitted_at":"2026-04-28T16:53:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Agora-Opt uses decentralized debate among LLM agent teams plus a read-write memory bank to produce more accurate optimization models from text than prior LLM methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24583","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Improving Vision-language 
Models with Perception-centric Process Reward Models","primary_cat":"cs.CV","submitted_at":"2026-04-27T15:08:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Perceval is a perception-centric PRM that detects token-level perceptual errors in VLMs, supporting token-advantage RL training and iterative test-time scaling for improved reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24126","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Psychologically-Grounded Graph Modeling for Interpretable Depression Detection","primary_cat":"cs.CL","submitted_at":"2026-04-27T07:27:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PsyGAT structures conversations as dynamic temporal graphs with Psychological Expression Units and persona augmentation to reach state-of-the-art Macro F1 scores of 89.99 and 71.37 on DAIC-WoZ and E-DAIC while adding causal interpretability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}