{"total":28,"items":[{"citing_arxiv_id":"2605.12087","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Intermediate Artifacts as First-Class Citizens: A Data Model for Durable Intermediate Artifacts in Agentic Systems","primary_cat":"cs.AI","submitted_at":"2026-05-12T13:09:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A systems-level data model for preserving typed, addressable, versioned, and dependency-aware intermediate artifacts in agentic AI systems to improve long-term inspectability and maintainability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11746","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Reasoning Traces Become Performative: Step-Level Evidence that Chain-of-Thought Is an Imperfect Oversight Channel","primary_cat":"cs.AI","submitted_at":"2026-05-12T08:24:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoT traces align with internal answer commitment in only 61.9% of steps on average, dominated by confabulated continuations after commitment has stabilized.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10410","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Equilibrium Residuals Expose Three Regimes of Matrix-Game Strategic Reasoning in Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-11T11:49:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLMs rely on semantic cues for matrix-game equilibria but can acquire approximate computation via residual training on small instances, with a Lipschitz proof enabling transfer to larger anonymous games.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06819","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Theory of Online Learning with Autoregressive Chain-of-Thought Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-07T18:21:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Online mistake bounds for autoregressive output learning can grow logarithmically with generation horizon M under end-to-end feedback but become independent of M with chain-of-thought trajectory access.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06365","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Agent Loops to Deterministic Graphs: Execution Lineage for Reproducible AI-Native Work","primary_cat":"cs.AI","submitted_at":"2026-05-07T14:39:37+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Execution lineage models AI-native work as a DAG of computations with explicit dependencies, achieving perfect state preservation in controlled update tasks where loop-based agents introduce churn and contamination.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04243","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Temporal Reasoning Is Not the Bottleneck: A Probabilistic Inconsistency Framework for Neuro-Symbolic QA","primary_cat":"cs.AI","submitted_at":"2026-05-05T19:30:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Temporal reasoning is not the core bottleneck for LLMs on time-based QA; the real issue is unstructured text-to-event mapping, addressed by a neuro-symbolic system with PIS that reaches 100% accuracy on benchmarks when representations are correct.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25166","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Training Transformers as a Universal Computer","primary_cat":"cs.AI","submitted_at":"2026-04-28T03:15:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A transformer trained on random meaningless MicroPy programs generalizes to execute diverse human-written programs, providing empirical evidence it can act as a universal computer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24645","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"K-MetBench: A Multi-Dimensional Benchmark for Fine-Grained Evaluation of Expert Reasoning, Locality, and Multimodality in Meteorology","primary_cat":"cs.CL","submitted_at":"2026-04-27T16:13:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"K-MetBench shows LLMs have large gaps in interpreting meteorology diagrams and Korean-specific context, with smaller local models beating much larger global ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17884","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SPREG: Structured Plan Repair with Entropy-Guided Test-Time Intervention for Large Language Model Reasoning","primary_cat":"cs.AI","submitted_at":"2026-04-20T06:55:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"SPREG detects logical failures in LLM long-chain reasoning through real-time entropy spikes and performs structured plan repairs using historical distributions, reporting a 20% absolute accuracy gain on AIME25.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17433","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Self-Consistency from Only Two Samples: CoT-PoT Ensembling for Efficient LLM Reasoning","primary_cat":"cs.CL","submitted_at":"2026-04-19T13:26:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CoT-PoT ensembling achieves self-consistency accuracy in LLMs with only two samples for 78.6% of tasks, reducing computation by 9.3x compared to standard methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12913","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CoDe-R: Refining Decompiler Output with LLMs via Rationale Guidance and Adaptive Inference","primary_cat":"cs.SE","submitted_at":"2026-04-14T15:58:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoDe-R refines LLM decompiler output via rationale-guided semantic injection and dynamic fallback inference, making a 1.3B model the first to exceed 50% average re-executability on HumanEval-Decompile.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.29069","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Mirage of Long-Range Dependency, with an Application to Integer Multiplication","primary_cat":"cs.LG","submitted_at":"2026-03-30T23:15:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Long-range dependency in integer multiplication is a mirage from 1D representation; a 2D grid reduces it to local 3x3 operations, letting a 321-parameter neural cellular automaton generalize perfectly to inputs 683 times longer than training while Transformers fail.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.27343","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WMF-AM: Probing LLM Working Memory via Depth-Parameterized Cumulative State Tracking","primary_cat":"cs.AI","submitted_at":"2026-03-28T17:25:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"WMF-AM is a depth-parameterized benchmark that measures LLMs' cumulative state tracking ability without scratchpads, validated on 28 models across arithmetic and non-arithmetic tasks with ablations confirming the construct.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.18864","ref_index":149,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Towards an AI co-scientist","primary_cat":"cs.AI","submitted_at":"2025-02-26T06:17:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A multi-agent AI system generates novel biomedical hypotheses that show promising experimental validation in drug repurposing for leukemia, new targets for liver fibrosis, and a bacterial gene transfer mechanism.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.07974","ref_index":288,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","primary_cat":"cs.SE","submitted_at":"2024-03-12T17:58:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LiveCodeBench collects 400 recent contest problems to create a contamination-free benchmark evaluating LLMs on code generation and related capabilities like self-repair and execution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.07927","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Systematic Survey of Prompt Engineering in Large Language Models: Techniques and Applications","primary_cat":"cs.AI","submitted_at":"2024-02-05T19:49:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A systematic survey categorizes prompt engineering methods for LLMs and VLMs by application area, summarizing methodologies, applications, models, datasets, strengths, and limitations for each technique along with a taxonomy and summary table.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.10774","ref_index":95,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads","primary_cat":"cs.LG","submitted_at":"2024-01-19T15:48:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Medusa augments LLMs with multiple decoding heads and tree-based attention to predict and verify several tokens in parallel, yielding 2.2-3.6x inference speedup via two fine-tuning regimes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.20050","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Let's Verify Step by Step","primary_cat":"cs.LG","submitted_at":"2023-05-31T17:24:00+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Process supervision significantly outperforms outcome supervision for training models on the MATH dataset, achieving 78% accuracy on a representative test subset with active learning and a released 800k step-label dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.14325","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Improving Factuality and Reasoning in Language Models through Multiagent Debate","primary_cat":"cs.CL","submitted_at":"2023-05-23T17:55:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Multiagent debate among LLMs improves mathematical reasoning, strategic reasoning, and factual accuracy while reducing hallucinations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.10403","ref_index":235,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PaLM 2 Technical Report","primary_cat":"cs.CL","submitted_at":"2023-05-17T17:46:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PaLM 2 reports state-of-the-art results on language, reasoning, and multilingual tasks with improved efficiency over PaLM.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2304.05128","ref_index":114,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Teaching Large Language Models to Self-Debug","primary_cat":"cs.CL","submitted_at":"2023-04-11T10:43:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Self-Debugging teaches LLMs to identify and fix their own code errors through rubber-duck-style natural language explanations and execution feedback, delivering 2-12% gains over baselines on Spider, TransCoder, and MBPP.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2211.12588","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Program of Thoughts Prompting: Disentangling Computation from Reasoning for Numerical Reasoning Tasks","primary_cat":"cs.CL","submitted_at":"2022-11-22T21:06:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PoT prompting improves numerical reasoning by having language models write programs executed by a computer instead of performing calculations in natural language chains of thought, with an average 12% gain over CoT.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2210.09261","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them","primary_cat":"cs.CL","submitted_at":"2022-10-17T17:08:26+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Chain-of-thought prompting enables large language models to surpass average human performance on 17 of 23 challenging BIG-Bench tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2207.05608","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Inner Monologue: Embodied Reasoning through Planning with Language Models","primary_cat":"cs.RO","submitted_at":"2022-07-12T15:20:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs form an inner monologue from closed-loop language feedback to improve high-level instruction completion in simulated and real robotic rearrangement and kitchen manipulation tasks.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Gu, M. Reid, Y . Matsuo, and Y . Iwasawa. Large language models are zero-shot reasoners. arXiv preprint arXiv:2205.11916, 2022. [12] A. K. Lampinen, I. Dasgupta, S. C. Chan, K. Matthewson, M. H. Tessler, A. Creswell, J. L. McClelland, J. X. Wang, and F. Hill. Can language models learn from explanations in context?arXiv preprint arXiv:2204.02329, 2022. [13] M. Nye, A. J. Andreassen, G. Gur-Ari, H. Michalewski, J. Austin, D. Bieber, D. Dohan, A. Lewkowycz, M. Bosma, D. Luan, et al. Show your work: Scratchpads for intermediate computation with language models. arXiv preprint arXiv:2112.00114, 2021. [14] L. S. Vygotsky. Thought and language. MIT press, 2012. [15] P . Carruthers. Thinking in language?: evolution and a modularist possibility."},{"citing_arxiv_id":"2207.05221","ref_index":203,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Language Models (Mostly) Know What They Know","primary_cat":"cs.CL","submitted_at":"2022-07-11T22:59:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Language models show good calibration when asked to estimate the probability that their own answers are correct, with performance improving as models get larger.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2206.07682","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Emergent Abilities of Large Language Models","primary_cat":"cs.CL","submitted_at":"2022-06-15T17:32:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Emergent abilities are capabilities present in large language models but absent in smaller ones and cannot be predicted by extrapolating smaller model performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2204.02311","ref_index":105,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PaLM: Scaling Language Modeling with Pathways","primary_cat":"cs.CL","submitted_at":"2022-04-05T16:11:45+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PaLM 540B demonstrates continued scaling benefits by setting new few-shot SOTA results on hundreds of benchmarks and outperforming humans on BIG-bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2201.11903","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Chain-of-Thought Prompting Elicits Reasoning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2022-01-28T02:33:07+00:00","verdict":"ACCEPT","verdict_confidence":"HIGH","novelty_score":9.0,"formal_verification":"none","one_line_summary":"Chain-of-thought prompting, by including intermediate reasoning steps in few-shot examples, elicits strong reasoning abilities in large language models on arithmetic, commonsense, and symbolic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}