{"total":12,"items":[{"citing_arxiv_id":"2606.26698","ref_index":45,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Beyond Logical Forms: LLM-Extracted Patterns for Fallacy Classification","primary_cat":"cs.CL","submitted_at":"2026-06-25T07:30:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LLM-extracted patterns merging logical structures and linguistic cues yield statistically significant gains in fallacy classification over zero-shot baselines with cross-dataset generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25462","ref_index":32,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Optimizing Abstractive Summarization With Fine-Tuned PEGASUS","primary_cat":"cs.CL","submitted_at":"2026-06-24T06:43:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Fine-tuned PEGASUS achieves state-of-the-art ROUGE scores on XL-Sum English corpus with 4.04% ROUGE-1, 15.25% ROUGE-2, and 3.39% ROUGE-L gains over mT5 baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07190","ref_index":35,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"From Correctness to Utility: Gain-Based Prefix Evaluation for LLM Reasoning","primary_cat":"cs.CL","submitted_at":"2026-06-05T11:56:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Prefix gain measured via student-model solve-rate improvement is used to train a Prefix Utility Model (PUM) that supplies stronger supervision than correctness-based process rewards for mathematical 
reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01806","ref_index":36,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ProbeScale: Probing Analysis to Optimize Neural Scaling Laws for Efficient Small Language Model Inference","primary_cat":"cs.CL","submitted_at":"2026-06-01T07:24:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ProbeScale finds layer subsets in SLMs like RoBERTa-Large and T5-Base that cut parameters 5-10x while retaining 95-98% of original task performance by maximizing aggregated probe scores under a budget.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19194","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"MMoA: An AI-Agent framework with recurrence for Memoried Mixure-of-Agent","primary_cat":"cs.CL","submitted_at":"2026-05-18T23:47:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"MMoA adds LSTM recurrence to Mixture-of-Agents routing, reaching 58.0% win rate on AlpacaEval 2.0 versus 59.8% for baseline MoA while cutting runtime by up to 4.6%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18122","ref_index":57,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Decisive: Guiding User Decisions with Optimal Preference Elicitation from Unstructured Documents","primary_cat":"cs.CL","submitted_at":"2026-04-20T11:42:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Decisive combines document-grounded option scoring with adaptive Bayesian preference elicitation to achieve up to 20% higher decision accuracy than 
LLMs and existing frameworks across domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.18796","ref_index":64,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Replacing Judges with Juries: Evaluating LLM Generations with a Panel of Diverse Models","primary_cat":"cs.CL","submitted_at":"2024-04-29T15:33:23+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A panel of smaller diverse LLMs outperforms a single large model as an evaluator of generations, showing less intra-model bias and over 7x lower cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.07691","ref_index":147,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ORPO: Monolithic Preference Optimization without Reference Model","primary_cat":"cs.CL","submitted_at":"2024-03-12T14:34:08+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"ORPO performs preference alignment during supervised fine-tuning via a monolithic odds ratio penalty, allowing 7B models to outperform larger state-of-the-art models on alignment benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.15391","ref_index":63,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"MultiHop-RAG: Benchmarking Retrieval-Augmented Generation for Multi-Hop Queries","primary_cat":"cs.CL","submitted_at":"2024-01-27T11:41:48+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MultiHop-RAG is a new benchmark dataset demonstrating that existing retrieval-augmented generation systems perform poorly on multi-hop queries requiring retrieval and reasoning over 
multiple evidence pieces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2312.13771","ref_index":94,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"AppAgent: Multimodal Agents as Smartphone Users","primary_cat":"cs.CV","submitted_at":"2023-12-21T11:52:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AppAgent lets large language models operate diverse smartphone apps via visual interactions and learns app usage from exploration or demonstrations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2312.06681","ref_index":70,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Steering Llama 2 via Contrastive Activation Addition","primary_cat":"cs.CL","submitted_at":"2023-12-09T04:40:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Contrastive Activation Addition steers Llama 2 Chat by adding averaged residual-stream activation differences from contrastive example pairs to control targeted behaviors at inference time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2303.09014","ref_index":181,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ART: Automatic multi-step reasoning and tool-use for large language models","primary_cat":"cs.CL","submitted_at":"2023-03-16T01:04:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ART automatically generates multi-step reasoning programs with tool integration for LLMs, yielding substantial gains over few-shot and auto-CoT prompting on BigBench and MMLU while matching hand-crafted CoT on most 
tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}