{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:W7HLAZNOWWKU4PH7EIDKLA4FLO","short_pith_number":"pith:W7HLAZNO","canonical_record":{"source":{"id":"2605.26438","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-08T00:04:19Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"46b44821d0d751c28588e0cec04fd622824a0441dceed14f04c4bbeb7373ab15","abstract_canon_sha256":"feeceac72cf133497f9c18d08abe5120f07ed02cbc159d3a33d9548f3561a31a"},"schema_version":"1.0"},"canonical_sha256":"b7ceb065aeb5954e3cff2206a583855bbda2a064bd9bd465ac2ce5ee61078aa0","source":{"kind":"arxiv","id":"2605.26438","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.26438","created_at":"2026-05-27T01:05:17Z"},{"alias_kind":"arxiv_version","alias_value":"2605.26438v1","created_at":"2026-05-27T01:05:17Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.26438","created_at":"2026-05-27T01:05:17Z"},{"alias_kind":"pith_short_12","alias_value":"W7HLAZNOWWKU","created_at":"2026-05-27T01:05:17Z"},{"alias_kind":"pith_short_16","alias_value":"W7HLAZNOWWKU4PH7","created_at":"2026-05-27T01:05:17Z"},{"alias_kind":"pith_short_8","alias_value":"W7HLAZNO","created_at":"2026-05-27T01:05:17Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:W7HLAZNOWWKU4PH7EIDKLA4FLO","target":"record","payload":{"canonical_record":{"source":{"id":"2605.26438","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-08T00:04:19Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"46b44821d0d751c28588e0cec04fd622824a0441dceed14f04c4bbeb7373ab15","abstract_canon_sha256":"feeceac72cf133497f9c18d08abe5120f07ed02cbc159d3a33d9548f3561a31a"},"schema_version":"1.0"},"canonical_sha256":"b7ceb065aeb5954e3cff2206a583855bbda2a064bd9bd465ac2ce5ee61078aa0","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T01:05:17.949897Z","signature_b64":"lC2bQ+bC0myDdztazfxFVhBBm6ba90Hj4CeqojAm8zvFUGwf7oYoiEjTP66oQpzc+gpgDB/rOtXjtb/9VeKsCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b7ceb065aeb5954e3cff2206a583855bbda2a064bd9bd465ac2ce5ee61078aa0","last_reissued_at":"2026-05-27T01:05:17.949253Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T01:05:17.949253Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.26438","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T01:05:17Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"O9mY/OSw2b1RDjFJfxZD2uGUTwMpH9ZNryLYQE1/YGA1K1LkmH8sPIpT7W1opFW6P/K0kSa0CS5xy/GrnlgnDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-28T06:12:34.782436Z"},"content_sha256":"69f8cf942db11a026373434d60911b49120735624d45606f8984dbe25669fdf0","schema_version":"1.0","event_id":"sha256:69f8cf942db11a026373434d60911b49120735624d45606f8984dbe25669fdf0"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:W7HLAZNOWWKU4PH7EIDKLA4FLO","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"LURE: Live-Usage Replay Evaluations for Reducing Evaluation Awareness","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"David Demitri Africa, Igor Ivanov","submitted_at":"2026-04-08T00:04:19Z","abstract_excerpt":"Large language models can recognize when they are being evaluated (evaluation awareness) and behave differently because of that, which undermines the validity of safety and alignment benchmarks. We propose LURE (Live-Usage Replay Evaluations), a method for constructing deployment-like evaluations by replaying realistic agentic interaction trajectories and appending evaluation prompt at the end. We also introduce an automated pipeline for measuring evaluation realism, combining detection of verbalized evaluation awareness and judge-model estimates of the probability of logs being an evaluation,"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.26438","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.26438/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T01:05:17Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"30xEIPaPcqngAx582tl6UhGk7VZym1xKF1uV7SYTWeGgWZiymoR97bnynuqCPBA8dmTv/yNciAQ5QK3xgop3CQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-28T06:12:34.782812Z"},"content_sha256":"314575e6729c46f065da2a7b4f3a8d71952e6a9b7f8570215a187a6ea8c1b6b4","schema_version":"1.0","event_id":"sha256:314575e6729c46f065da2a7b4f3a8d71952e6a9b7f8570215a187a6ea8c1b6b4"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/W7HLAZNOWWKU4PH7EIDKLA4FLO/bundle.json","state_url":"https://pith.science/pith/W7HLAZNOWWKU4PH7EIDKLA4FLO/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/W7HLAZNOWWKU4PH7EIDKLA4FLO/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-28T06:12:34Z","links":{"resolver":"https://pith.science/pith/W7HLAZNOWWKU4PH7EIDKLA4FLO","bundle":"https://pith.science/pith/W7HLAZNOWWKU4PH7EIDKLA4FLO/bundle.json","state":"https://pith.science/pith/W7HLAZNOWWKU4PH7EIDKLA4FLO/state.json","well_known_bundle":"https://pith.science/.well-known/pith/W7HLAZNOWWKU4PH7EIDKLA4FLO/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:W7HLAZNOWWKU4PH7EIDKLA4FLO","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"feeceac72cf133497f9c18d08abe5120f07ed02cbc159d3a33d9548f3561a31a","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-08T00:04:19Z","title_canon_sha256":"46b44821d0d751c28588e0cec04fd622824a0441dceed14f04c4bbeb7373ab15"},"schema_version":"1.0","source":{"id":"2605.26438","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.26438","created_at":"2026-05-27T01:05:17Z"},{"alias_kind":"arxiv_version","alias_value":"2605.26438v1","created_at":"2026-05-27T01:05:17Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.26438","created_at":"2026-05-27T01:05:17Z"},{"alias_kind":"pith_short_12","alias_value":"W7HLAZNOWWKU","created_at":"2026-05-27T01:05:17Z"},{"alias_kind":"pith_short_16","alias_value":"W7HLAZNOWWKU4PH7","created_at":"2026-05-27T01:05:17Z"},{"alias_kind":"pith_short_8","alias_value":"W7HLAZNO","created_at":"2026-05-27T01:05:17Z"}],"graph_snapshots":[{"event_id":"sha256:314575e6729c46f065da2a7b4f3a8d71952e6a9b7f8570215a187a6ea8c1b6b4","target":"graph","created_at":"2026-05-27T01:05:17Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.26438/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Large language models can recognize when they are being evaluated (evaluation awareness) and behave differently because of that, which undermines the validity of safety and alignment benchmarks. We propose LURE (Live-Usage Replay Evaluations), a method for constructing deployment-like evaluations by replaying realistic agentic interaction trajectories and appending evaluation prompt at the end. We also introduce an automated pipeline for measuring evaluation realism, combining detection of verbalized evaluation awareness and judge-model estimates of the probability of logs being an evaluation,","authors_text":"David Demitri Africa, Igor Ivanov","cross_cats":["cs.AI"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-08T00:04:19Z","title":"LURE: Live-Usage Replay Evaluations for Reducing Evaluation Awareness"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.26438","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:69f8cf942db11a026373434d60911b49120735624d45606f8984dbe25669fdf0","target":"record","created_at":"2026-05-27T01:05:17Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"feeceac72cf133497f9c18d08abe5120f07ed02cbc159d3a33d9548f3561a31a","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-08T00:04:19Z","title_canon_sha256":"46b44821d0d751c28588e0cec04fd622824a0441dceed14f04c4bbeb7373ab15"},"schema_version":"1.0","source":{"id":"2605.26438","kind":"arxiv","version":1}},"canonical_sha256":"b7ceb065aeb5954e3cff2206a583855bbda2a064bd9bd465ac2ce5ee61078aa0","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b7ceb065aeb5954e3cff2206a583855bbda2a064bd9bd465ac2ce5ee61078aa0","first_computed_at":"2026-05-27T01:05:17.949253Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-27T01:05:17.949253Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"lC2bQ+bC0myDdztazfxFVhBBm6ba90Hj4CeqojAm8zvFUGwf7oYoiEjTP66oQpzc+gpgDB/rOtXjtb/9VeKsCw==","signature_status":"signed_v1","signed_at":"2026-05-27T01:05:17.949897Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.26438","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:69f8cf942db11a026373434d60911b49120735624d45606f8984dbe25669fdf0","sha256:314575e6729c46f065da2a7b4f3a8d71952e6a9b7f8570215a187a6ea8c1b6b4"],"state_sha256":"955e38e5e78478ed71fb47036bc5f1506b049e0ebc41173f3b5a3cab9c41b4b3"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"XBN68QogTRbTh9Pao1pk6DckDRP7hKdgTNlqyF97TmpfzRMJNgkLSwXi8pvn0Lq7ZWKw0QcN57sJ4vUlc6dFAA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-28T06:12:34.784709Z","bundle_sha256":"3bdf5c72f8bb6acd99b88af8d4cf680aa29047c470cb11fe230d0b1c8f6d43aa"}}