{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:K4WTNDJ2XZEVFFVWEQNBMGTG7S","short_pith_number":"pith:K4WTNDJ2","schema_version":"1.0","canonical_sha256":"572d368d3abe495296b6241a161a66fcb5d9682b3ecad956a7b480b262154e73","source":{"kind":"arxiv","id":"2605.30907","version":1},"attestation_state":"computed","paper":{"title":"BlueFin: Benchmarking LLM Agents on Financial Spreadsheets","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.LG"],"primary_cat":"cs.SE","authors_text":"Anoushka Mohta, Case Winter, Clara Na, Colton Moraine, Emma Strubell, George Fang, John Ling, Srivatsa Kundurthy, Zach Kirshner","submitted_at":"2026-05-29T06:43:23Z","abstract_excerpt":"We present BlueFin, a benchmark that tasks large language model (LLM) agents with synthesis, manipulation, and comprehension tasks over spreadsheet workbooks in the professional finance domain. Though estimates of the global population of paying users of spreadsheet software range in the hundreds of millions -- an order of magnitude more than the estimated global population of professional developers -- comparatively fewer resources have been devoted to exploring and expanding LLM capabilities in the spreadsheet domain, with fewer still dedicated to mirroring real occupational tasks encountere"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.30907","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-29T06:43:23Z","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"title_canon_sha256":"958b70d8fa3ff544efa648659d3561e713f08d7467ff1a4ae468d6db43db9996","abstract_canon_sha256":"0738148ac6fe72e8121775a53b4202289357849f4066b07e7659aa7fe5ce2d7a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-01T01:03:25.038895Z","signature_b64":"6uVihMHcIpM3gZK/Z1JDyQw6utiiaiGtxwhLs7TsA9ZNROsYGCmuK674U9vT23VjyUTU5YSZPg2d1LA55GKPBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"572d368d3abe495296b6241a161a66fcb5d9682b3ecad956a7b480b262154e73","last_reissued_at":"2026-06-01T01:03:25.038225Z","signature_status":"signed_v1","first_computed_at":"2026-06-01T01:03:25.038225Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"BlueFin: Benchmarking LLM Agents on Financial Spreadsheets","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.LG"],"primary_cat":"cs.SE","authors_text":"Anoushka Mohta, Case Winter, Clara Na, Colton Moraine, Emma Strubell, George Fang, John Ling, Srivatsa Kundurthy, Zach Kirshner","submitted_at":"2026-05-29T06:43:23Z","abstract_excerpt":"We present BlueFin, a benchmark that tasks large language model (LLM) agents with synthesis, manipulation, and comprehension tasks over spreadsheet workbooks in the professional finance domain. Though estimates of the global population of paying users of spreadsheet software range in the hundreds of millions -- an order of magnitude more than the estimated global population of professional developers -- comparatively fewer resources have been devoted to exploring and expanding LLM capabilities in the spreadsheet domain, with fewer still dedicated to mirroring real occupational tasks encountere"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.30907","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.30907/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.30907","created_at":"2026-06-01T01:03:25.038331+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.30907v1","created_at":"2026-06-01T01:03:25.038331+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.30907","created_at":"2026-06-01T01:03:25.038331+00:00"},{"alias_kind":"pith_short_12","alias_value":"K4WTNDJ2XZEV","created_at":"2026-06-01T01:03:25.038331+00:00"},{"alias_kind":"pith_short_16","alias_value":"K4WTNDJ2XZEVFFVW","created_at":"2026-06-01T01:03:25.038331+00:00"},{"alias_kind":"pith_short_8","alias_value":"K4WTNDJ2","created_at":"2026-06-01T01:03:25.038331+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/K4WTNDJ2XZEVFFVWEQNBMGTG7S","json":"https://pith.science/pith/K4WTNDJ2XZEVFFVWEQNBMGTG7S.json","graph_json":"https://pith.science/api/pith-number/K4WTNDJ2XZEVFFVWEQNBMGTG7S/graph.json","events_json":"https://pith.science/api/pith-number/K4WTNDJ2XZEVFFVWEQNBMGTG7S/events.json","paper":"https://pith.science/paper/K4WTNDJ2"},"agent_actions":{"view_html":"https://pith.science/pith/K4WTNDJ2XZEVFFVWEQNBMGTG7S","download_json":"https://pith.science/pith/K4WTNDJ2XZEVFFVWEQNBMGTG7S.json","view_paper":"https://pith.science/paper/K4WTNDJ2","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.30907&json=true","fetch_graph":"https://pith.science/api/pith-number/K4WTNDJ2XZEVFFVWEQNBMGTG7S/graph.json","fetch_events":"https://pith.science/api/pith-number/K4WTNDJ2XZEVFFVWEQNBMGTG7S/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/K4WTNDJ2XZEVFFVWEQNBMGTG7S/action/timestamp_anchor","attest_storage":"https://pith.science/pith/K4WTNDJ2XZEVFFVWEQNBMGTG7S/action/storage_attestation","attest_author":"https://pith.science/pith/K4WTNDJ2XZEVFFVWEQNBMGTG7S/action/author_attestation","sign_citation":"https://pith.science/pith/K4WTNDJ2XZEVFFVWEQNBMGTG7S/action/citation_signature","submit_replication":"https://pith.science/pith/K4WTNDJ2XZEVFFVWEQNBMGTG7S/action/replication_record"}},"created_at":"2026-06-01T01:03:25.038331+00:00","updated_at":"2026-06-01T01:03:25.038331+00:00"}