{"total":10,"items":[{"citing_arxiv_id":"2605.05802","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Selective Rollout: Mid-Trajectory Termination for Multi-Sample Agent RL","primary_cat":"cs.LG","submitted_at":"2026-05-07T07:41:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A one-parameter early-termination gate based on mean pairwise prefix edit distance reduces wall-clock time by 10.7% and raises held-out success by 2.5 pp in GRPO on ALFWorld by cutting zero-advantage batch dilution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25299","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Thinking Pixel: Recursive Sparse Reasoning in Multimodal Diffusion Latents","primary_cat":"cs.CV","submitted_at":"2026-04-28T07:09:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A recursive sparse MoE framework integrated into diffusion models iteratively refines visual tokens via gated module selection to improve structured reasoning and image generation performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17574","ref_index":156,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Fine-Tuning: In-Context Learning and Chain-of-Thought for Reasoned Distractor Generation","primary_cat":"cs.CL","submitted_at":"2026-04-19T18:29:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs prompted with few-shot examples and rationales generate better reasoned distractors for MCQs than fine-tuned contrastive models across six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.21187","ref_index":95,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Do NOT Think That Much for 2+3=? On the Overthinking of o1-Like LLMs","primary_cat":"cs.CL","submitted_at":"2024-12-30T18:55:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"o1-like models overthink easy tasks; self-training reduces compute use without accuracy loss on GSM8K, MATH500, GPQA, and AIME.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.03578","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLM Multi-Agent Systems: Challenges and Open Problems","primary_cat":"cs.MA","submitted_at":"2024-02-05T23:06:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"The paper identifies inadequately addressed challenges in optimizing task allocation, fostering robust reasoning through debates, managing layered context, enhancing memory, and applying multi-agent systems to blockchain.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.03714","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines","primary_cat":"cs.CL","submitted_at":"2023-10-05T17:37:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"DSPy compiles short declarative programs into LM pipelines that self-optimize and outperform both standard few-shot prompting and expert-written chains on math, retrieval, and QA tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2307.13702","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Measuring Faithfulness in Chain-of-Thought Reasoning","primary_cat":"cs.AI","submitted_at":"2023-07-17T01:08:39+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Chain-of-Thought reasoning in LLMs is often unfaithful, with models relying on it variably by task and less so as models scale larger.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2307.06435","ref_index":203,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Comprehensive Overview of Large Language Models","primary_cat":"cs.CL","submitted_at":"2023-07-12T20:01:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A survey paper providing an overview of Large Language Models, their background, and recent advances in the field.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Zero-Shot Retrieval Augmentation: This kind of augmen- tation keeps the original LLM architecture and weights unchanged and uses BM25 [202], nearest neighbors, or frozen pre-trained models like Bert [7] as a retriever. The retrieved information is provided as input to the model for response generation, shown to improve performance over LLMs without retrieval [198, 203]. In some scenarios, multiple retrieval iterations are required to complete the task. The output generated in the first iteration is forwarded to the retriever to fetch similar documents. Forward-looking active retrieval (FLARE) [197] initially generates the response and corrects the output by retrieving relevant documents if the response contains low-confidence tokens."},{"citing_arxiv_id":"2211.10435","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PAL: Program-aided Language Models","primary_cat":"cs.CL","submitted_at":"2022-11-18T18:56:13+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"PAL improves few-shot reasoning accuracy by having LLMs generate executable programs rather than text-based chains of thought, outperforming much larger models on math and logic benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2210.11610","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Large Language Models Can Self-Improve","primary_cat":"cs.CL","submitted_at":"2022-10-20T21:53:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A 540B-parameter LLM improves reasoning performance on GSM8K, DROP, OpenBookQA, and ANLI-A3 by fine-tuning on self-generated high-confidence CoT solutions from unlabeled data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}