{"total":12,"items":[{"citing_arxiv_id":"2606.27446","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Causal Connections: Leveraging Multilingual Fine-Tuning for Financial QA@FinCausal 2026","primary_cat":"cs.CL","submitted_at":"2026-06-25T18:17:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Fine-tuned multilingual LLMs achieve top shared-task scores on financial causality extraction in English and Spanish.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27316","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLM-Based Examination of Eligibility Criteria from Securities Prospectuses at the German Central Bank","primary_cat":"cs.CL","submitted_at":"2026-06-25T17:29:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLMs are applied in a generative pipeline for extracting, normalizing, and interpreting eligibility criteria from securities prospectuses, achieving up to 91% precision in document-level decisions with a conservative bias.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19316","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Multi-Agent Framework for Feature-Constrained Difficulty Control in Reading Comprehension Item Generation","primary_cat":"cs.CL","submitted_at":"2026-05-19T03:52:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAFIG is a multi-agent framework that uses LLM agents and evaluators to generate reading comprehension items with significantly higher adherence to specified feature constraints than single-agent baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04641","ref_index":82,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CAST: Mitigating Object Hallucination in Large Vision-Language Models via Caption-Guided Visual Attention Steering","primary_cat":"cs.CV","submitted_at":"2026-05-06T08:32:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAST reduces object hallucination in LVLMs by 6.03% on average across five models and five benchmarks by identifying caption-sensitive attention heads and applying optimized steering directions to their outputs, with negligible added inference cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20131","ref_index":159,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Whose Story Gets Told? Positionality and Bias in LLM Summaries of Life Narratives","primary_cat":"cs.CL","submitted_at":"2026-04-22T02:58:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A proposed pipeline shows LLMs introduce detectable race and gender biases when summarizing life narratives, creating potential for representational harm in research.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17200","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Calibrating Model-Based Evaluation Metrics for Summarization","primary_cat":"cs.CL","submitted_at":"2026-04-19T02:04:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A reference-free proxy scoring framework combined with GIRB calibration produces better-aligned evaluation metrics for summarization and outperforms baselines across seven datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.11206","ref_index":101,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Evalet: Evaluating Large Language Models through Functional Fragmentation","primary_cat":"cs.HC","submitted_at":"2025-09-14T10:24:13+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Evalet applies functional fragmentation to deliver fragment-level qualitative analysis of LLM evaluations, with a user study showing 48% more misalignment detections than holistic scoring.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.04565","ref_index":230,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Standalone LLMs to Integrated Intelligence: A Survey of Compound Al Systems","primary_cat":"cs.MA","submitted_at":"2025-06-05T02:34:43+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A survey that defines Compound AI Systems, proposes a multi-dimensional taxonomy based on component roles and orchestration strategies, reviews four foundational paradigms, and identifies key challenges for future research.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.04244","ref_index":186,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Benchmark Data Contamination of Large Language Models: A Survey","primary_cat":"cs.CL","submitted_at":"2024-06-06T16:41:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A survey reviewing benchmark data contamination in LLMs, its impact on evaluation, and alternative assessment approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.15391","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MultiHop-RAG: Benchmarking Retrieval-Augmented Generation for Multi-Hop Queries","primary_cat":"cs.CL","submitted_at":"2024-01-27T11:41:48+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MultiHop-RAG is a new benchmark dataset demonstrating that existing retrieval-augmented generation systems perform poorly on multi-hop queries requiring retrieval and reasoning over multiple evidence pieces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2308.07201","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ChatEval: Towards Better LLM-based Evaluators through Multi-Agent Debate","primary_cat":"cs.CL","submitted_at":"2023-08-14T15:13:04+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Multi-agent debate among LLMs yields more reliable text evaluations than single-agent prompting by simulating collaborative human judgment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2303.16634","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment","primary_cat":"cs.CL","submitted_at":"2023-03-29T12:46:54+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"G-Eval uses GPT-4 with chain-of-thought and form-filling to reach 0.514 Spearman correlation with humans on summarization, beating prior NLG metrics while noting a bias toward LLM outputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}