{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:WPWLP2AFNH6A5CCMKTPPTQYGSQ","short_pith_number":"pith:WPWLP2AF","schema_version":"1.0","canonical_sha256":"b3ecb7e80569fc0e884c54def9c306940cc8af16666c5227b5b02cc34ae29d57","source":{"kind":"arxiv","id":"2311.11944","version":1},"attestation_state":"computed","paper":{"title":"FinanceBench: A New Benchmark for Financial Question Answering","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"Existing LLMs fail to correctly answer or refuse 81 percent of financial questions even with retrieval support.","cross_cats":["cs.AI","cs.CE","stat.ML"],"primary_cat":"cs.CL","authors_text":"Anand Kannappan, Bertie Vidgen, Douwe Kiela, Nino Scherrer, Pranab Islam, Rebecca Qian","submitted_at":"2023-11-20T17:28:02Z","abstract_excerpt":"FinanceBench is a first-of-its-kind test suite for evaluating the performance of LLMs on open book financial question answering (QA). It comprises 10,231 questions about publicly traded companies, with corresponding answers and evidence strings. The questions in FinanceBench are ecologically valid and cover a diverse set of scenarios. They are intended to be clear-cut and straightforward to answer to serve as a minimum performance standard. We test 16 state of the art model configurations (including GPT-4-Turbo, Llama2 and Claude2, with vector stores and long context prompts) on a sample of 15"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2311.11944","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2023-11-20T17:28:02Z","cross_cats_sorted":["cs.AI","cs.CE","stat.ML"],"title_canon_sha256":"0065779b111e2415d7e651f7202b8f5738a55baee518adfb61aae403754ff70d","abstract_canon_sha256":"b93fb6e2e2c745257d9732d756d1bf963b7deaf484def9c20e8cfb5acf1f0834"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.038014Z","signature_b64":"MFia6+ngR93uzN7t67acHC63WbxR/qegE83kPWGRR7wzxCaktoOp4c4Ydn1jxmSRsh2MUpT15lBfLwm8oiSRCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b3ecb7e80569fc0e884c54def9c306940cc8af16666c5227b5b02cc34ae29d57","last_reissued_at":"2026-05-17T23:38:49.037230Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.037230Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"FinanceBench: A New Benchmark for Financial Question Answering","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"Existing LLMs fail to correctly answer or refuse 81 percent of financial questions even with retrieval support.","cross_cats":["cs.AI","cs.CE","stat.ML"],"primary_cat":"cs.CL","authors_text":"Anand Kannappan, Bertie Vidgen, Douwe Kiela, Nino Scherrer, Pranab Islam, Rebecca Qian","submitted_at":"2023-11-20T17:28:02Z","abstract_excerpt":"FinanceBench is a first-of-its-kind test suite for evaluating the performance of LLMs on open book financial question answering (QA). It comprises 10,231 questions about publicly traded companies, with corresponding answers and evidence strings. The questions in FinanceBench are ecologically valid and cover a diverse set of scenarios. They are intended to be clear-cut and straightforward to answer to serve as a minimum performance standard. We test 16 state of the art model configurations (including GPT-4-Turbo, Llama2 and Claude2, with vector stores and long context prompts) on a sample of 15"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"GPT-4-Turbo used with a retrieval system incorrectly answered or refused to answer 81% of questions.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The 150 sampled cases are representative of the full 10,231 questions and that all questions are ecologically valid and clear-cut as stated.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"FinanceBench shows state-of-the-art LLMs incorrectly answer or refuse 81% of tested financial QA cases even with retrieval augmentation.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Existing LLMs fail to correctly answer or refuse 81 percent of financial questions even with retrieval support.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b48b9ac6e5af7e1a050bdc90e179af1126720dbe826e71ccda9cb9adb41dc064"},"source":{"id":"2311.11944","kind":"arxiv","version":1},"verdict":{"id":"89b8314f-8ece-48e2-8aff-0ad57ebbd07b","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T04:55:57.594528Z","strongest_claim":"GPT-4-Turbo used with a retrieval system incorrectly answered or refused to answer 81% of questions.","one_line_summary":"FinanceBench shows state-of-the-art LLMs incorrectly answer or refuse 81% of tested financial QA cases even with retrieval augmentation.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The 150 sampled cases are representative of the full 10,231 questions and that all questions are ecologically valid and clear-cut as stated.","pith_extraction_headline":"Existing LLMs fail to correctly answer or refuse 81 percent of financial questions even with retrieval support."},"references":{"count":18,"sample":[{"doi":"","year":2023,"title":"In Findings of the Association for Computational Linguistics: ACL 2023 , pages 1298–1313, Toronto, Canada","work_id":"ad9bbfb8-3f10-4795-9736-be023af2bca6","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2015,"title":"Qa dataset explosion: A taxonomy of nlp resources for question answering and reading com- prehension. ACM Comput. Surv., 55(10). Julio Cesar Salinas Alvarado, Karin Verspoor, and Tim- othy Baldwin. 20","work_id":"8905dfaf-0cfd-4189-bc14-a92cf7e722ef","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Fengbin Zhu, Wenqiang Lei, Youcheng Huang, Chao Wang, Shuo Zhang, Jiancheng Lv, Fuli Feng, and Tat- Seng Chua","work_id":"6db441c8-129b-483c-9621-db6e34c5aa09","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"fi- nacebench_id_0000","work_id":"0d05913f-497c-4f48-af62-b44c46ac9bd7","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"A value for whether it is in the eval sample of 298 cases (‘1’), in the open source sample (‘2’) or in neither (‘0’)","work_id":"8dad50c4-350a-4c1d-955f-c7957b0ac1ea","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":18,"snapshot_sha256":"21eee133a069b49858a713dd67bd0c2d839ea3f852d33bee2a7026364de429bc","internal_anchors":0},"formal_canon":{"evidence_count":1,"snapshot_sha256":"f0f0c79e082b7907990e4ba356946b26c4e8106d80575733c348b942c73b684a"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2311.11944","created_at":"2026-05-17T23:38:49.037441+00:00"},{"alias_kind":"arxiv_version","alias_value":"2311.11944v1","created_at":"2026-05-17T23:38:49.037441+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2311.11944","created_at":"2026-05-17T23:38:49.037441+00:00"},{"alias_kind":"pith_short_12","alias_value":"WPWLP2AFNH6A","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"WPWLP2AFNH6A5CCM","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"WPWLP2AF","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":30,"internal_anchor_count":30,"sample":[{"citing_arxiv_id":"2605.15482","citing_title":"FINESSE-Bench: A Hierarchical Benchmark Suite for Financial Domain Knowledge and Technical Analysis in Large Language Models","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.23262","citing_title":"Design and Report Benchmarks for Knowledge Work","ref_index":87,"is_internal_anchor":true},{"citing_arxiv_id":"2503.22693","citing_title":"Bridging Language Models and Financial Analysis","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17554","citing_title":"Evaluating Deep Research Agents on Expert Consulting Work: A Benchmark with Verifiers, Rubrics, and Cognitive Traps","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17962","citing_title":"FinDocMRE: A Benchmark for Document-Level Financial Multimodal Reasoning Evaluation","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17034","citing_title":"Privacy Policy Enforcement Guardrails for Data-Sensitive Retrieval-Augmented Generation","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15482","citing_title":"FINESSE-Bench: A Hierarchical Benchmark Suite for Financial Domain Knowledge and Technical Analysis in Large Language Models","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2506.08136","citing_title":"EconWebArena: Benchmarking Autonomous Agents on Economic Tasks in Realistic Web Environments","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2509.09544","citing_title":"MetaGraph: A Large-Scale Meta-Analysis of GenAI in Financial NLP (2022-2025)","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2512.13168","citing_title":"Finch: Benchmarking Finance & Accounting across Spreadsheet-Centric Enterprise Workflows","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2603.19254","citing_title":"FinReasoning: A Hierarchical Benchmark for Reliable Financial Research Reporting","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05971","citing_title":"Training Transformers for KV Cache Compressibility","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26235","citing_title":"LATTICE: Evaluating Decision Support Utility of Crypto Agents","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2604.17979","citing_title":"Architecture Matters More Than Scale: A Comparative Study of Retrieval and Memory Augmentation for Financial QA Under SME Compute Constraints","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09611","citing_title":"Byte-Exact Deduplication in Retrieval-Augmented Generation: A Three-Regime Empirical Analysis Across Public Benchmarks","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24668","citing_title":"The Price of Agreement: Measuring LLM Sycophancy in Agentic Financial Applications","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05538","citing_title":"AgenticRAG: Agentic Retrieval for Enterprise Knowledge Bases","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06472","citing_title":"Efficient Serving for Dynamic Agent Workflows with Prediction-based KV-Cache Management","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05971","citing_title":"Training Transformers for KV Cache Compressibility","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2604.19298","citing_title":"IndiaFinBench: An Evaluation Benchmark for Large Language Model Performance on Indian Financial Regulatory Text","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2604.17979","citing_title":"Architecture Matters More Than Scale: A Comparative Study of Retrieval and Memory Augmentation for Financial QA Under SME Compute Constraints","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2604.12047","citing_title":"Empirical Evaluation of PDF Parsing and Chunking for Financial Question Answering with RAG","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10015","citing_title":"FinTrace: Holistic Trajectory-Level Evaluation of LLM Tool Calling for Long-Horizon Financial Tasks","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2604.09056","citing_title":"Conversations Risk Detection LLMs in Financial Agents via Multi-Stage Generative Rollout","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2604.04812","citing_title":"SysTradeBench: An Iterative Build-Test-Patch Benchmark for Strategy-to-Code Trading Systems with Drift-Aware Diagnostics","ref_index":20,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":1,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WPWLP2AFNH6A5CCMKTPPTQYGSQ","json":"https://pith.science/pith/WPWLP2AFNH6A5CCMKTPPTQYGSQ.json","graph_json":"https://pith.science/api/pith-number/WPWLP2AFNH6A5CCMKTPPTQYGSQ/graph.json","events_json":"https://pith.science/api/pith-number/WPWLP2AFNH6A5CCMKTPPTQYGSQ/events.json","paper":"https://pith.science/paper/WPWLP2AF"},"agent_actions":{"view_html":"https://pith.science/pith/WPWLP2AFNH6A5CCMKTPPTQYGSQ","download_json":"https://pith.science/pith/WPWLP2AFNH6A5CCMKTPPTQYGSQ.json","view_paper":"https://pith.science/paper/WPWLP2AF","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2311.11944&json=true","fetch_graph":"https://pith.science/api/pith-number/WPWLP2AFNH6A5CCMKTPPTQYGSQ/graph.json","fetch_events":"https://pith.science/api/pith-number/WPWLP2AFNH6A5CCMKTPPTQYGSQ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WPWLP2AFNH6A5CCMKTPPTQYGSQ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WPWLP2AFNH6A5CCMKTPPTQYGSQ/action/storage_attestation","attest_author":"https://pith.science/pith/WPWLP2AFNH6A5CCMKTPPTQYGSQ/action/author_attestation","sign_citation":"https://pith.science/pith/WPWLP2AFNH6A5CCMKTPPTQYGSQ/action/citation_signature","submit_replication":"https://pith.science/pith/WPWLP2AFNH6A5CCMKTPPTQYGSQ/action/replication_record"}},"created_at":"2026-05-17T23:38:49.037441+00:00","updated_at":"2026-05-17T23:38:49.037441+00:00"}