{"total":22,"items":[{"citing_arxiv_id":"2605.22536","ref_index":39,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"SpaceDG: Benchmarking Spatial Intelligence under Visual Degradation","primary_cat":"cs.CV","submitted_at":"2026-05-21T14:25:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SpaceDG is the first large-scale benchmark dataset (~1M QA pairs) simulating nine visual degradations in 3DGS-rendered scenes to measure and improve spatial intelligence robustness in MLLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20948","ref_index":36,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Memory Grafting: Scaling Language Model Pre-training via Offline Conditional Memory","primary_cat":"cs.CL","submitted_at":"2026-05-20T09:35:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Memory Grafting improves language-model benchmarks by grafting offline hidden-state memory from a larger model into a recipient model using n-gram lookups and lightweight adapters, outperforming MoE and vanilla Engram baselines at 0.92B and 2.8B scales.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20176","ref_index":23,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"ClinSeekAgent: Automating Multimodal Evidence Seeking for Agentic Clinical Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-19T17:58:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ClinSeekAgent automates active multimodal evidence seeking for clinical reasoning, improving LLM performance on raw EHR and CXR tasks while enabling distillation into smaller models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19390","ref_index":33,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"LMM-Track4D: Eliciting 4D Dynamic Reasoning in LMMs via Trajectory-Grounded Dialogue","primary_cat":"cs.CV","submitted_at":"2026-05-19T05:35:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LMM-Track4D formulates a trajectory-grounded dialogue task, releases Track4D-Bench with 526 samples, and proposes RTGE encoding, TRK state token, and OSK-RA decoder to elicit better 4D spatiotemporal reasoning in LMMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19242","ref_index":60,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"PhyWorld: Physics-Faithful World Model for Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-19T01:28:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PhyWorld improves temporal consistency and physical plausibility in video world models via flow matching fine-tuning followed by DPO on physics preference pairs, with reported gains on VBench and a custom physical-faithfulness benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17610","ref_index":64,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"SafeLens: Deliberate and Efficient Video Guardrails with Fast-and-Slow Screening","primary_cat":"cs.CV","submitted_at":"2026-05-17T19:10:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SafeLens presents a fast-and-slow video guardrail framework that filters the SafeWatch dataset to 2.4% and adds Chain-of-Thought traces to achieve state-of-the-art moderation performance at reduced inference cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17172","ref_index":73,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"OpenJarvis: Personal AI, On Personal Devices","primary_cat":"cs.LG","submitted_at":"2026-05-16T22:00:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OpenJarvis decomposes personal AI into Intelligence, Engine, Agents, Tools & Memory, and Learning primitives and applies LLM-guided spec search to produce on-device configurations that reach within 3.2 pp of cloud baselines on average across eight tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13062","ref_index":31,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Edit-Compass & EditReward-Compass: A Unified Benchmark for Image Editing and Reward Modeling","primary_cat":"cs.CV","submitted_at":"2026-05-13T06:33:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Edit-Compass and EditReward-Compass are new unified benchmarks for fine-grained image editing evaluation and realistic reward modeling in reinforcement learning optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11730","ref_index":23,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Persona-Conditioned Adversarial Prompting: Multi-Identity Red-Teaming for Adversarial Discovery and Mitigation","primary_cat":"cs.LG","submitted_at":"2026-05-12T08:12:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PCAP conditions adversarial searches on multiple attacker personas to discover more diverse and transferable jailbreaks, yielding richer safety fine-tuning datasets that boost model robustness on GPT-OSS 120B.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"prompt, target goal, and strategy tags, metadata automatically captured during generation. This rich annotation provides training signals that generalize beyond individual prompt tokens, enabling recognition of attack patterns. Fine-tuning Setup.We fine-tune ALoRA adapters [ 17] on three model families: Granite 3.3 8B, Llama 3.1 8B (bothInstruct), and Qwen 3.5 9B [ 23]. Training uses rejection labels with metadata (e.g.,I cannot reply because it asks about gi, using Σi), helping models recognize attack strategies rather than memorizing specific prompts. We evaluate on 800 prompts (400 benign from Dolly [4], 7 (a) Average prompt similarity across prompts for the same goal. Model Similarity within Goal Cos Sim Self-BLEU"},{"citing_arxiv_id":"2605.11558","ref_index":53,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"A Composite Activation Function for Learning Stable Binary Representations","primary_cat":"cs.LG","submitted_at":"2026-05-12T05:41:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HTAF is a sigmoid-tanh composite that approximates the Heaviside function to allow stable gradient training of binary activation networks, yielding ICBMs with stable discretization and competitive performance on image tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"arXiv preprint arXiv:2304.06129, 2023. 12 [51] Philipp Petersen and Felix V oigtlaender. Optimal approximation of piecewise smooth functions using deep ReLU neural networks.Neural Networks, 108:296-330, 2018. [52] Haotong Qin, Ruihao Gong, Xianglong Liu, Xiao Bai, Jingkuan Song, and Nicu Sebe. Binary neural networks: A survey.Pattern Recognition, 105:107281, 2020. [53] Qwen Team. Qwen3.5: Towards native multimodal agents, February 2026. [54] Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. Learning transferable visual models from natural language supervision. InInternational conference on machine learning, pages 8748-8763."},{"citing_arxiv_id":"2605.10782","ref_index":21,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"TrajPrism: A Multi-Task Benchmark for Language-Grounded Urban Trajectory Understanding","primary_cat":"cs.AI","submitted_at":"2026-05-11T16:17:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TrajPrism introduces a multi-task benchmark with 300K real-world urban trajectories and 2.1M language-grounded task instances across three cities, plus proof-of-concept models showing large gaps versus geometry-only baselines.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"hallucination verification, lexical diversity enforcement, and punctuation sanitization. Drawing on the LLM-as-a-judge paradigm [33], we adopt a cascaded judging strategy that combines scalable LLM scoring, cross-model validation, and human verification in the second phase, to ensure annotation quality is both measurable and reproducible across all three cities. A Qwen-based judge [21] first scores all generated data and selects the top 100K trajectories per city, a sample of 2K trajectories is then independently re-evaluated by GPT-4.19 and Gemini 2.5 Flash [6] to validate scoring consistency, and 100 randomly sampled Porto trajectories are assessed by human annotators to establish a quality baseline. All LLM and human judges share the same evaluation rubric, enabling"},{"citing_arxiv_id":"2605.10332","ref_index":24,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"EmbodiSkill: Skill-Aware Reflection for Self-Evolving Embodied Agents","primary_cat":"cs.AI","submitted_at":"2026-05-11T10:33:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EmbodiSkill uses skill-aware reflection on execution trajectories to update skills in embodied agents, achieving 93.28% success on ALFWorld with a frozen Qwen3.5-27B model, outperforming direct GPT-5.2 use by 31.58%.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"G-Memory 74.62 62.50 77.4282.6185.71 83.33 52.94 LangMem 62.69 62.50 61.29 52.17 76.19 72.22 52.94 EmbodiSkill (Ours) EmbodiSkill w/ GPT-5.293.28 95.83 96.7773.9195.24 100.00 100.00 EmbodiSkill w/ Gemini-3-flash 87.31 95.8393.55 69.5795.2483.33 82.35 Models.For ALFWorld, we instantiate the executor 𝜋𝜃 with Qwen2.5-14B-Instruct [23] and Qwen3.5- 27B [24]. For EmbodiedBench-Habitat and EmbodiedBench-Navigation, Qwen3-VL-8B-Instruct and Qwen3-VL-32B-Instruct as executors [25]. The skill evolution model𝐹 is instantiated with GPT-5.2 [26] or Gemini-3-flash [27]. All executor parameters are kept fixed during skill evolution, so performance gains come from the evolving skill rather than model parameter updates."},{"citing_arxiv_id":"2605.09918","ref_index":27,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"NaiAD: Initiate Data-Driven Research for LLM Advertising","primary_cat":"cs.LG","submitted_at":"2026-05-11T03:11:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NaiAD is a new dataset and framework for LLM-native advertising that uses decoupled generation and calibrated scoring to identify four semantic strategies for balancing user and commercial utilities.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"BannerBench: Benchmarking vision language models for multi-ad selection with human preferences. In Christos Christodoulopoulos, Tanmoy Chakraborty, Carolyn Rose, and Violet Peng, editors,Findings of the Association for Computational Linguistics: EMNLP 2025, pages 24145-24159, Suzhou, China, November 2025. Association for Computational Linguistics. [27] Qwen Team. Qwen3.5: Towards native multimodal agents, February 2026. [28] Qwen Team. Qwen3.6-Plus: Towards real world agents, April 2026. [29] Nils Reimers and Iryna Gurevych. Sentence-BERT: Sentence embeddings using Siamese BERT- networks. In Kentaro Inui, Jing Jiang, Vincent Ng, and Xiaojun Wan, editors,Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th"},{"citing_arxiv_id":"2605.08575","ref_index":39,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Uncovering Intra-expert Activation Sparsity for Efficient Mixture-of-Expert Model Execution","primary_cat":"cs.LG","submitted_at":"2026-05-09T00:34:55+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pre-trained MoE models exhibit up to 90% intra-expert activation sparsity that enables up to 2.5x faster MoE layer execution when exploited in the vLLM inference system.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"levels of intra-expert activation sparsity, by sorting each expert's post-activation outputs and zeroing out the lowest-scoring neurons. We do not modify any other part of the model architecture or update the model parameters in any way. We test eight MoE models as shown in Table 1 against five benchmarks, namely ARC-Challenge [7], ARC-Easy [7], HellaSwag [49], Winogrande [39], and TruthfulQA-mc2 [24], to cover a wide variety of models and benchmarks. We use lm-eval-harness [ 11] with vLLM [19] backend, and evaluate both with and without applying intra-expert sparsity to the shared experts for models with shared experts. We define the maximum sparsity at which the model retains 95% of its baseline average benchmark score as the representativesparsity cutoffvalue of the model."},{"citing_arxiv_id":"2605.06527","ref_index":33,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"STALE: Can LLM Agents Know When Their Memories Are No Longer Valid?","primary_cat":"cs.CL","submitted_at":"2026-05-07T16:31:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM agents struggle to detect and act on implicit memory conflicts, with top models scoring 55.2% on the new STALE benchmark of 400 scenarios; CUPMem prototype strengthens state-aware revision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06219","ref_index":33,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Joint Consistency: A Unified Test-Time Aggregation Framework via Energy Minimization","primary_cat":"cs.AI","submitted_at":"2026-05-07T13:17:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Joint Consistency casts test-time aggregation as Ising-type energy minimization with pairwise LLM-judge interactions, subsuming voting methods and outperforming baselines across reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06170","ref_index":32,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DynT2I-Eval: A Dynamic Evaluation Framework for Text-to-Image Models","primary_cat":"cs.CV","submitted_at":"2026-05-07T12:53:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DynT2I-Eval creates fresh prompts via dimension decomposition and dynamic sampling to evaluate text-to-image models on text alignment, quality, and aesthetics while maintaining a stable leaderboard.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25256","ref_index":29,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"AutoResearchBench: Benchmarking AI Agents on Complex Scientific Literature Discovery","primary_cat":"cs.AI","submitted_at":"2026-04-28T06:05:17+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"AutoResearchBench is a new benchmark showing top AI agents achieve under 10% success on complex scientific literature discovery tasks that demand deep comprehension and open-ended search.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13531","ref_index":39,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"RiskWebWorld: A Realistic Interactive Benchmark for GUI Agents in E-commerce Risk Management","primary_cat":"cs.AI","submitted_at":"2026-04-15T06:27:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RiskWebWorld is the first realistic interactive benchmark for GUI agents in e-commerce risk management, revealing a large gap between generalist and specialized models plus RL gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08644","ref_index":37,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"EXAONE 4.5 Technical Report","primary_cat":"cs.CL","submitted_at":"2026-04-09T17:51:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"EXAONE 4.5 is a new open-weight multimodal model that matches general benchmarks and outperforms similar-scale models on document understanding and Korean contextual reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07725","ref_index":38,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Squeeze Evolve: Unified Multi-Model Orchestration for Verifier-Free Evolution","primary_cat":"cs.AI","submitted_at":"2026-04-09T02:14:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Squeeze Evolve is a multi-model orchestration framework that improves efficiency and performance in verifier-free evolutionary inference, cutting costs up to 3x while matching verifier-based methods on several benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03044","ref_index":6,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"JoyAI-LLM Flash: Advancing Mid-Scale LLMs with Token Efficiency","primary_cat":"cs.CL","submitted_at":"2026-04-03T13:52:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"JoyAI-LLM Flash delivers a 48B MoE LLM with 2.7B active parameters per token via FiberPO RL and dense multi-token prediction, released with checkpoints on Hugging Face.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"JoyAI-LLM Flash achieves 1.45× and 1.07× speedups over the pure attention-based models GLM-4.7-Flash [4] and Qwen3-30B-A3B [5], respectively. In terms of multi-token prediction (MTP) efficiency, defined as the inference speedup of the MTP model over its non-MTP counterpart, JoyAI-LLM Flash achieves a 1.87× speedup, surpassing the hybrid-attention models Qwen3.5-35B-A3B [6] (1.61×) and Step-3.5-Flash [7] (1.39×). We open-source both the base and chat model weights in multiple quantization formats. The base model of JoyAI-LLM Flash was pretrained on an extensive text-only corpus of over 20 trillion tokens, employing a Warmup-Constant-Cosine-Decay learning rate schedule. To maximize token utilization and incrementally"}],"limit":50,"offset":0}