{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:7SVZSUQ7MF4PZ26GDHXYAJ6PAC","short_pith_number":"pith:7SVZSUQ7","canonical_record":{"source":{"id":"2606.21627","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-19T17:30:56Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"ca9759f9461d435960171557135a75921c683ae51bf0ca81bc5e8eb9bc47462b","abstract_canon_sha256":"a62befde1184d4744ba476d5674d42d7b793fc0a640a865d1b93aaf5d0a13e65"},"schema_version":"1.0"},"canonical_sha256":"fcab99521f6178fcebc619ef8027cf00a176d22e90030aee58a7c77d06a2018e","source":{"kind":"arxiv","id":"2606.21627","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.21627","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"arxiv_version","alias_value":"2606.21627v1","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.21627","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"pith_short_12","alias_value":"7SVZSUQ7MF4P","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"pith_short_16","alias_value":"7SVZSUQ7MF4PZ26G","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"pith_short_8","alias_value":"7SVZSUQ7","created_at":"2026-06-23T01:13:17Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:7SVZSUQ7MF4PZ26GDHXYAJ6PAC","target":"record","payload":{"canonical_record":{"source":{"id":"2606.21627","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-19T17:30:56Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"ca9759f9461d435960171557135a75921c683ae51bf0ca81bc5e8eb9bc47462b","abstract_canon_sha256":"a62befde1184d4744ba476d5674d42d7b793fc0a640a865d1b93aaf5d0a13e65"},"schema_version":"1.0"},"canonical_sha256":"fcab99521f6178fcebc619ef8027cf00a176d22e90030aee58a7c77d06a2018e","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T01:13:17.124738Z","signature_b64":"jUusALvzhf6kntnSjOLahTuKgPthXeifa1pymS7kCgYvrZy3YLDD6k48CcARUcGcuLWq4cfO5PSOPLnvNWQ8CQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fcab99521f6178fcebc619ef8027cf00a176d22e90030aee58a7c77d06a2018e","last_reissued_at":"2026-06-23T01:13:17.124252Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T01:13:17.124252Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2606.21627","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-23T01:13:17Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Fzebqrni+W7hLl37s2C0pIiZxWbgcjK5i/v/L21lDNDuDUKc31d7W+i0XbjtExwHP3A9ujCqC1+R6ktoq0CsAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-04T18:20:21.686917Z"},"content_sha256":"739fdc84406d7c915adc5f17244752c7fda4c11d10e613baf67743ad6a1dbcbb","schema_version":"1.0","event_id":"sha256:739fdc84406d7c915adc5f17244752c7fda4c11d10e613baf67743ad6a1dbcbb"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:7SVZSUQ7MF4PZ26GDHXYAJ6PAC","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Counsel: A Meta-Evaluation Dataset for Agentic Tasks","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.AI","authors_text":"Antonia Calvi, Charlie Wang, Eujeong Choi, Henry Broomfield, Max Bartolo, Patrick Lewis, Roman Engeler, Sashank Pisupati","submitted_at":"2026-06-19T17:30:56Z","abstract_excerpt":"As agentic systems tackle increasingly complex multi-step tasks, evaluating their trajectories presents a major bottleneck - human annotation of a single trajectory on popular agentic benchmarks can take hours, making it difficult to scale evaluations for measuring performance or curating training data. This has driven widespread reliance on automated approaches such as LLM-as-a-judge (LLMJ) to critique agents at the process and outcome-levels at scale, however, the soundness of LLMJ critiques often goes unmeasured. Here, we introduce Counsel, the first public dataset of meta-evaluations for a"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.21627","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.21627/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-23T01:13:17Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"sPfFMPNTrR0Cgw5UFped8mFHBi+mpXktgPDkF4+tEmH0ha22sb1tWrB6eR0DkFkyWdOkl0KNflVdXhmo8jdaAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-04T18:20:21.687327Z"},"content_sha256":"295c107e18e985fe085d45c2157fc82e2b763558e5c202dc8657c97dac977d67","schema_version":"1.0","event_id":"sha256:295c107e18e985fe085d45c2157fc82e2b763558e5c202dc8657c97dac977d67"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/7SVZSUQ7MF4PZ26GDHXYAJ6PAC/bundle.json","state_url":"https://pith.science/pith/7SVZSUQ7MF4PZ26GDHXYAJ6PAC/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/7SVZSUQ7MF4PZ26GDHXYAJ6PAC/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-07-04T18:20:21Z","links":{"resolver":"https://pith.science/pith/7SVZSUQ7MF4PZ26GDHXYAJ6PAC","bundle":"https://pith.science/pith/7SVZSUQ7MF4PZ26GDHXYAJ6PAC/bundle.json","state":"https://pith.science/pith/7SVZSUQ7MF4PZ26GDHXYAJ6PAC/state.json","well_known_bundle":"https://pith.science/.well-known/pith/7SVZSUQ7MF4PZ26GDHXYAJ6PAC/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:7SVZSUQ7MF4PZ26GDHXYAJ6PAC","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"a62befde1184d4744ba476d5674d42d7b793fc0a640a865d1b93aaf5d0a13e65","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-19T17:30:56Z","title_canon_sha256":"ca9759f9461d435960171557135a75921c683ae51bf0ca81bc5e8eb9bc47462b"},"schema_version":"1.0","source":{"id":"2606.21627","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.21627","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"arxiv_version","alias_value":"2606.21627v1","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.21627","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"pith_short_12","alias_value":"7SVZSUQ7MF4P","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"pith_short_16","alias_value":"7SVZSUQ7MF4PZ26G","created_at":"2026-06-23T01:13:17Z"},{"alias_kind":"pith_short_8","alias_value":"7SVZSUQ7","created_at":"2026-06-23T01:13:17Z"}],"graph_snapshots":[{"event_id":"sha256:295c107e18e985fe085d45c2157fc82e2b763558e5c202dc8657c97dac977d67","target":"graph","created_at":"2026-06-23T01:13:17Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.21627/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"As agentic systems tackle increasingly complex multi-step tasks, evaluating their trajectories presents a major bottleneck - human annotation of a single trajectory on popular agentic benchmarks can take hours, making it difficult to scale evaluations for measuring performance or curating training data. This has driven widespread reliance on automated approaches such as LLM-as-a-judge (LLMJ) to critique agents at the process and outcome-levels at scale, however, the soundness of LLMJ critiques often goes unmeasured. Here, we introduce Counsel, the first public dataset of meta-evaluations for a","authors_text":"Antonia Calvi, Charlie Wang, Eujeong Choi, Henry Broomfield, Max Bartolo, Patrick Lewis, Roman Engeler, Sashank Pisupati","cross_cats":["cs.LG"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-19T17:30:56Z","title":"Counsel: A Meta-Evaluation Dataset for Agentic Tasks"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.21627","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:739fdc84406d7c915adc5f17244752c7fda4c11d10e613baf67743ad6a1dbcbb","target":"record","created_at":"2026-06-23T01:13:17Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"a62befde1184d4744ba476d5674d42d7b793fc0a640a865d1b93aaf5d0a13e65","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-19T17:30:56Z","title_canon_sha256":"ca9759f9461d435960171557135a75921c683ae51bf0ca81bc5e8eb9bc47462b"},"schema_version":"1.0","source":{"id":"2606.21627","kind":"arxiv","version":1}},"canonical_sha256":"fcab99521f6178fcebc619ef8027cf00a176d22e90030aee58a7c77d06a2018e","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"fcab99521f6178fcebc619ef8027cf00a176d22e90030aee58a7c77d06a2018e","first_computed_at":"2026-06-23T01:13:17.124252Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-23T01:13:17.124252Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"jUusALvzhf6kntnSjOLahTuKgPthXeifa1pymS7kCgYvrZy3YLDD6k48CcARUcGcuLWq4cfO5PSOPLnvNWQ8CQ==","signature_status":"signed_v1","signed_at":"2026-06-23T01:13:17.124738Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.21627","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:739fdc84406d7c915adc5f17244752c7fda4c11d10e613baf67743ad6a1dbcbb","sha256:295c107e18e985fe085d45c2157fc82e2b763558e5c202dc8657c97dac977d67"],"state_sha256":"d099ecb21b785d1c974ea42ed4d2a1fd241756450f594d2fc3d37fdde2bfc654"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"mVyHavlbzY6Qpt2wDvZRkbHC+guYzPmDtjQKPCbQ1GWwG2T5nHWu18nCpzSYjpYyGMe1j20tmMBxdYniO0lMBQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-07-04T18:20:21.689434Z","bundle_sha256":"15cd73926457b978284a42bf4cb077b3548372a4213bed395f6e1901b08a1e1c"}}