{"total":21,"items":[{"citing_arxiv_id":"2605.13369","ref_index":37,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Query-Conditioned Test-Time Self-Training for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-13T11:27:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QueST lets LLMs create query-conditioned problem-solution pairs at inference time and use them for parameter-efficient self-training, outperforming prior test-time baselines on math and science benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12110","ref_index":6,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"AB-Sparse: Sparse Attention with Adaptive Block Size for Accurate and Efficient Long-Context Inference","primary_cat":"cs.DC","submitted_at":"2026-05-12T13:23:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AB-Sparse adaptively allocates per-head block sizes for sparse attention, adds lossless centroid quantization and custom variable-block GPU kernels, and reports up to 5.43% accuracy gain over fixed-block baselines with no throughput loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11665","ref_index":22,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Nautilus: From One Prompt to Plug-and-Play Robot Learning","primary_cat":"cs.RO","submitted_at":"2026-05-12T07:26:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"NAUTILUS is a prompt-driven harness that automates plug-and-play adapters, typed contracts, and validation for policies, benchmarks, and robots in learning 
research.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10930","ref_index":49,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Evaluating the False Trust engendered by LLM Explanations","primary_cat":"cs.HC","submitted_at":"2026-05-11T17:58:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A user study finds that LLM reasoning traces and post-hoc explanations create false trust by increasing acceptance of incorrect answers, whereas contrastive dual explanations improve users' ability to detect errors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09879","ref_index":38,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"M2A: Synergizing Mathematical and Agentic Reasoning in Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-11T02:05:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"M2A uses null-space model merging to combine mathematical and agentic reasoning in LLMs, raising SWE-Bench Verified performance from 44.0% to 51.2% on Qwen3-8B without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09806","ref_index":1,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"LEAD: Length-Efficient Adaptive and Dynamic Reasoning for Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-10T23:05:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LEAD uses online adaptive mechanisms including Potential-Scaled Instability and symmetric efficiency rewards based on correct rollouts to achieve higher accuracy-efficiency scores 
with substantially shorter reasoning outputs than base models on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09693","ref_index":16,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Do multimodal models imagine electric sheep?","primary_cat":"cs.CV","submitted_at":"2026-05-10T18:25:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fine-tuning VLMs to output action sequences for puzzles causes emergent internal visual representations that improve performance when integrated into reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09505","ref_index":52,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"EpiGraph: Building Generalists for Evidence-Intensive Epilepsy Reasoning in the Wild","primary_cat":"cs.AI","submitted_at":"2026-05-10T12:27:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EpiGraph creates a heterogeneous epilepsy knowledge graph that boosts LLM performance on clinical reasoning tasks by 30-41% in pharmacogenomics when used with Graph-RAG.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09271","ref_index":45,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Shaping Schema via Language Representation as the Next Frontier for LLM Intelligence Expanding","primary_cat":"cs.AI","submitted_at":"2026-05-10T02:42:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Advanced language representations shape LLMs' schemas to improve knowledge activation and 
problem-solving.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Specifically, numerous studies have further unlocked the potential of LLMs by implicitly or explicitly providing or modifying schemas within them [42, 24]. First of all, different content of inputs can activate distinct schemas in LLMs. For instance, in-context [43] information modulates embeddings and attention weights across layers [44], while chain-of-thought (CoT) [45] prompting elicits reasoning capabilities, even when invalid reasoning is provided [46]. Secondly, different languages also represent different reasoning schemas. Wang et al. [42] found that the model placed more attention on causes when given Chinese prompts, while it was more balanced in terms of cause and effect when given English prompts. Furthermore, both explicit and implicit schemas serve as vital"},{"citing_arxiv_id":"2605.09269","ref_index":40,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DeltaRubric: Generative Multimodal Reward Modeling via Joint Planning and Verification","primary_cat":"cs.CL","submitted_at":"2026-05-10T02:32:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeltaRubric decomposes multimodal preference evaluation into self-generated planning and verification steps within a single model, producing large accuracy improvements on VL-RewardBench via multi-role reinforcement learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10990","ref_index":29,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Skill Drift Is Contract Violation: Proactive Maintenance for LLM Agent Skill 
Libraries","primary_cat":"cs.SE","submitted_at":"2026-05-09T11:41:53+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SkillGuard extracts executable environment contracts from LLM skill documents to detect only relevant drifts, reporting zero false positives on 599 cases, 100% precision in known-drift tests, and raising one-round repair success from 10% to 78%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08715","ref_index":54,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"AgentForesight: Online Auditing for Early Failure Prediction in Multi-Agent Systems","primary_cat":"cs.CL","submitted_at":"2026-05-09T05:55:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AgentForesight trains a 7B model to perform online auditing of multi-agent LLM trajectories, detecting early decisive errors and outperforming larger models on custom and external benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08704","ref_index":41,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"AgentPSO: Evolving Agent Reasoning Skill via Multi-agent Particle Swarm Optimization","primary_cat":"cs.AI","submitted_at":"2026-05-09T05:38:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AgentPSO evolves reusable multi-agent reasoning skills via PSO-inspired natural-language updates, outperforming static agents and test-time multi-agent baselines on math and general reasoning tasks with cross-benchmark 
transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07830","ref_index":26,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"CyBiasBench: Benchmarking Bias in LLM Agents for Cyber-Attack Scenarios","primary_cat":"cs.CR","submitted_at":"2026-05-08T14:57:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLM agents exhibit persistent attack-selection biases as fixed traits independent of success rates, with a bias momentum effect that resists steering and yields no performance gain.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07316","ref_index":1,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Implicit Compression Regularization: Concise Reasoning via Internal Shorter Distributions in RL Post-Training","primary_cat":"cs.AI","submitted_at":"2026-05-08T06:25:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ICR creates a virtual shorter distribution from shortest correct on-policy responses to regularize RL post-training toward concise yet accurate reasoning, improving the accuracy-length Pareto frontier on math and knowledge benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07129","ref_index":36,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"RRCM: Ranking-Driven Retrieval over Collaborative and Meta Memories for LLM Recommendation","primary_cat":"cs.IR","submitted_at":"2026-05-08T02:07:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RRCM trains an LLM to dynamically retrieve from collaborative and meta memories using group relative 
policy optimization driven by final top-k recommendation quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07106","ref_index":1,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Retrieve, Integrate, and Synthesize: Spatial-Semantic Grounded Latent Visual Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-08T01:33:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RIS improves MLLM latent visual reasoning by retrieving spatial-semantic evidence, integrating it via attention bottlenecks, and synthesizing it with language transition tokens, yielding gains on V*, HRBench, MMVP, and BLINK benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02674","ref_index":64,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Do Agent Societies Develop Intellectual Elites? 
The Hidden Power Laws of Collective Cognition in LLM Multi-Agent Systems","primary_cat":"cs.MA","submitted_at":"2026-04-03T03:08:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM agent societies develop power-law coordination cascades and intellectual elites through an integration bottleneck that grows with system size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.21776","ref_index":34,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Video-R1: Reinforcing Video Reasoning in MLLMs","primary_cat":"cs.CV","submitted_at":"2025-03-27T17:59:51+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Video-R1 uses temporal-aware RL and mixed datasets to boost video reasoning in MLLMs, with a 7B model reaching 37.1% on VSI-Bench and surpassing GPT-4o.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.09992","ref_index":97,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Large Language Diffusion Models","primary_cat":"cs.CL","submitted_at":"2025-02-14T08:23:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"LLaDA is a scalable diffusion-based language model that matches autoregressive LLMs like LLaMA3 8B on tasks and surpasses GPT-4o on reversal poem completion.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.05366","ref_index":59,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Search-o1: Agentic Search-Enhanced Large Reasoning 
Models","primary_cat":"cs.AI","submitted_at":"2025-01-09T16:48:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Search-o1 integrates agentic retrieval-augmented generation and a Reason-in-Documents module into large reasoning models to dynamically supply missing knowledge and improve performance on complex science, math, coding, and QA tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}