{"paper":{"title":"MultiZebraLogic: A Multilingual Logical Reasoning Benchmark","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Dan Saattrup Smart, Sofie Helene Bruun","submitted_at":"2025-11-05T15:34:48Z","abstract_excerpt":"We create high-quality datasets for LLM evaluation of logical reasoning skills across nine different languages, which have been manually checked by fluent speakers. The datasets consist of so-called zebra puzzles, and we analyse different ways of tuning the difficulty of the puzzles to fit modern LLMs. This includes the size of the puzzle (number of objects and number of clues), as well as a novel addition of red herring clues containing only irrelevant information. We show that presence of red herrings indeed makes the puzzles significantly harder for the models, and we find puzzle sizes 2x3 "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2511.03553","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2511.03553/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"}