{"total":13,"items":[{"citing_arxiv_id":"2605.30202","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Dual-Path Architecture for Scaling Compute and Capacity in LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-28T16:41:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dual-path blocks with deep shared and wide non-shared sublayers plus per-token gates outperform iso-FLOP baselines on language modeling while using fewer parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28919","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CosmicFish-HRM: Adaptive Reasoning via Hierarchical Recurrent Mechanisms in Compact Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-27T17:59:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Presents CosmicFish-HRM, a compact LM using hierarchical recurrent reasoning to adapt computation depth per input.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27696","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Structure over Pixels: Learning Variable-Length Visual Programs","primary_cat":"cs.CV","submitted_at":"2026-05-26T21:16:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"STROP learns variable-length discrete visual programs for images by training a length head against frozen DINOv3 features in a four-phase curriculum while bypassing pixel 
reconstruction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23872","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Training-Free Looped Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-22T17:31:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Training-free looped transformers retrofit recurrence to frozen models via damped ODE sub-steps on mid-stack blocks, yielding gains such as +2.64 pp on MMLU-Pro for Qwen3-4B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25299","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Thinking Pixel: Recursive Sparse Reasoning in Multimodal Diffusion Latents","primary_cat":"cs.CV","submitted_at":"2026-04-28T07:09:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A recursive sparse MoE framework integrated into diffusion models iteratively refines visual tokens via gated module selection to improve structured reasoning and image generation performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22110","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Do Not Imitate, Reinforce: Iterative Classification via Belief Refinement","primary_cat":"cs.LG","submitted_at":"2026-04-23T23:06:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RIC replaces single-pass label imitation with RL-driven iterative belief refinement, recovering cross-entropy optima while enabling adaptive halting via a value 
function.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21999","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Universal Transformers Need Memory: Depth-State Trade-offs in Adaptive Recursive Reasoning","primary_cat":"cs.LG","submitted_at":"2026-04-23T18:30:01+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Memory tokens are required for non-trivial performance in adaptive Universal Transformers on Sudoku-Extreme, with 8-32 tokens yielding stable 57% exact-match accuracy while trading off against ponder depth.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15259","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Stability and Generalization in Looped Transformers","primary_cat":"cs.LG","submitted_at":"2026-04-16T17:35:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Looped transformers with recall and outer normalization produce reachable, input-dependent fixed points with stable gradients, enabling generalization, while those without recall cannot; a new internal recall variant performs competitively or better.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11791","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Mechanistic Analysis of Looped Reasoning Language Models","primary_cat":"cs.LG","submitted_at":"2026-04-13T17:55:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Looped LLMs converge to distinct cyclic fixed points per layer, repeating feedforward-style inference stages across 
recurrences.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"attention mechanism, as a map $f : \\mathbb{R}^{T \\times D} \\to \\mathbb{R}^{T \\times D}$: $\\mathrm{Attn}(X) = \\mathrm{softmax}\\left( \\frac{X W_Q W_K^\\top X^\\top}{\\sqrt{d}} \\right) X W_V$ (1) $= \\mathrm{softmax}(A(X)) X W_V$ (2), where $W_Q, W_K, W_V \\in \\mathbb{R}^{D \\times d}$ are projection matrices and $A$ is defined for convenience. A transformer block typically comprises an attention mechanism and a position-wise MLP as follows: $\\hat{X} = n_2(X + \\mathrm{Attn}(n_1(X)))$ (3), $X' = n_4(\\hat{X} + \\mathrm{MLP}(n_3(\\hat{X})))$ (4), where $n_1, n_2, n_3, n_4$ are each optional norms - here we are borrowing from the notation of Geiping et al. (2025). We denote the action of a Transformer block $B : \\mathbb{R}^{T \\times D} \\to \\mathbb{R}^{T \\times D}$ as $X' = B(X)$, and refer to the intermediate hidden-state matrices $X$ between blocks as the residual stream. Looped Transformers are Transformers that utilize \"recurrence in depth\" - that is, they reapply layers to repeatedly"},{"citing_arxiv_id":"2602.13215","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When to Think Fast and Slow? 
AMOR: Adaptive Entropy Gate for Hybrid Models","primary_cat":"cs.AI","submitted_at":"2026-01-22T17:19:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AMOR uses output entropy to gate attention in recurrent hybrids, matching full attention performance at roughly 22% attention invocations across 180M-1.5B models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.25741","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Scaling Latent Reasoning via Looped Language Models","primary_cat":"cs.CL","submitted_at":"2025-10-29T17:45:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Looped language models with latent iterative computation and entropy-regularized depth allocation achieve performance matching up to 12B standard LLMs through superior knowledge manipulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.21734","ref_index":92,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Hierarchical Reasoning Model","primary_cat":"cs.AI","submitted_at":"2025-06-26T19:39:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HRM is a recurrent architecture with high-level planning and low-level execution modules that reaches near-perfect accuracy on complex Sudoku, maze navigation, and ARC benchmarks using 27M parameters and 1000 samples without pre-training or CoT supervision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2211.09085","ref_index":143,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Galactica: A Large Language Model for 
Science","primary_cat":"cs.CL","submitted_at":"2022-11-16T18:06:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Galactica, a science-specialized LLM, reports higher scores than GPT-3, Chinchilla, and PaLM on LaTeX knowledge, mathematical reasoning, and medical QA benchmarks while outperforming general models on BIG-bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}