{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:7SVZSUQ7MF4PZ26GDHXYAJ6PAC","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"a62befde1184d4744ba476d5674d42d7b793fc0a640a865d1b93aaf5d0a13e65","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-19T17:30:56Z","title_canon_sha256":"ca9759f9461d435960171557135a75921c683ae51bf0ca81bc5e8eb9bc47462b"},"schema_version":"1.0","source":{"id":"2606.21627","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.21627","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"arxiv_version","alias_value":"2606.21627v1","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.21627","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"pith_short_12","alias_value":"7SVZSUQ7MF4P","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"pith_short_16","alias_value":"7SVZSUQ7MF4PZ26G","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"pith_short_8","alias_value":"7SVZSUQ7","created_at":"2026-06-23T01:13:17Z"}],"graph_snapshots":[{"event_id":"sha256:295c107e18e985fe085d45c2157fc82e2b763558e5c202dc8657c97dac977d67","target":"graph","created_at":"2026-06-23T01:13:17Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.21627/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"As agentic systems tackle increasingly complex multi-step tasks, evaluating their trajectories presents a major bottleneck - human annotation of a single trajectory on popular agentic benchmarks can take hours, making it difficult to scale evaluations for measuring performance or curating training data. This has driven widespread reliance on automated approaches such as LLM-as-a-judge (LLMJ) to critique agents at the process and outcome-levels at scale, however, the soundness of LLMJ critiques often goes unmeasured. Here, we introduce Counsel, the first public dataset of meta-evaluations for a","authors_text":"Antonia Calvi, Charlie Wang, Eujeong Choi, Henry Broomfield, Max Bartolo, Patrick Lewis, Roman Engeler, Sashank Pisupati","cross_cats":["cs.LG"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-19T17:30:56Z","title":"Counsel: A Meta-Evaluation Dataset for Agentic Tasks"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.21627","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:739fdc84406d7c915adc5f17244752c7fda4c11d10e613baf67743ad6a1dbcbb","target":"record","created_at":"2026-06-23T01:13:17Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"a62befde1184d4744ba476d5674d42d7b793fc0a640a865d1b93aaf5d0a13e65","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-19T17:30:56Z","title_canon_sha256":"ca9759f9461d435960171557135a75921c683ae51bf0ca81bc5e8eb9bc47462b"},"schema_version":"1.0","source":{"id":"2606.21627","kind":"arxiv","version":1}},"canonical_sha256":"fcab99521f6178fcebc619ef8027cf00a176d22e90030aee58a7c77d06a2018e","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"fcab99521f6178fcebc619ef8027cf00a176d22e90030aee58a7c77d06a2018e","first_computed_at":"2026-06-23T01:13:17.124252Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-23T01:13:17.124252Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"jUusALvzhf6kntnSjOLahTuKgPthXeifa1pymS7kCgYvrZy3YLDD6k48CcARUcGcuLWq4cfO5PSOPLnvNWQ8CQ==","signature_status":"signed_v1","signed_at":"2026-06-23T01:13:17.124738Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.21627","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:739fdc84406d7c915adc5f17244752c7fda4c11d10e613baf67743ad6a1dbcbb","sha256:295c107e18e985fe085d45c2157fc82e2b763558e5c202dc8657c97dac977d67"],"state_sha256":"d099ecb21b785d1c974ea42ed4d2a1fd241756450f594d2fc3d37fdde2bfc654"}