{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:ECRKBHSRQLNCLXZGKE3Z3FSAEQ","short_pith_number":"pith:ECRKBHSR","schema_version":"1.0","canonical_sha256":"20a2a09e5182da25df2651379d9640243306cc82288c8ab458968b8dde801daf","source":{"kind":"arxiv","id":"2606.29914","version":1},"attestation_state":"computed","paper":{"title":"MemDelta: Controlled Baselines and Hidden Confounds in Agent Memory Evaluation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Kuan Wang","submitted_at":"2026-06-29T07:51:22Z","abstract_excerpt":"Agent memory systems are increasingly evaluated against RAG and full-context baselines, but reported gains often mix changes in the memory method with changes in the language model, embedding model, or retrieval pipeline, making it unclear what is actually being measured. We present MemDelta, a controlled evaluation protocol that varies one component at a time on LongMemEval-S (500 questions, 50+ sessions, three model families). Four findings emerge: (1) verbatim RAG matches full-context GPT-4o-mini (47.2% vs. 49.8%, p = 0.34), but the ranking reverses across models: Gemini gains +14pp from fu"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.29914","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-29T07:51:22Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"1dacf6f07d3df3601abe65e24d613851e2d7310a124f57369d23ba8c37e01525","abstract_canon_sha256":"3fd71b435c85e0decd6dadb527f2e543192ba01cd5e95629c889351f1d2ba06d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T02:17:40.772719Z","signature_b64":"ojMkCgz2d58A88T12eDAwrP8dEOBLFqmHloV+MCOJIcPB3eLRfqFW6gBBYUvwuw3ydUm8BMoz7Vyn/SEs5u1Bg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"20a2a09e5182da25df2651379d9640243306cc82288c8ab458968b8dde801daf","last_reissued_at":"2026-06-30T02:17:40.772132Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T02:17:40.772132Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"MemDelta: Controlled Baselines and Hidden Confounds in Agent Memory Evaluation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Kuan Wang","submitted_at":"2026-06-29T07:51:22Z","abstract_excerpt":"Agent memory systems are increasingly evaluated against RAG and full-context baselines, but reported gains often mix changes in the memory method with changes in the language model, embedding model, or retrieval pipeline, making it unclear what is actually being measured. We present MemDelta, a controlled evaluation protocol that varies one component at a time on LongMemEval-S (500 questions, 50+ sessions, three model families). Four findings emerge: (1) verbatim RAG matches full-context GPT-4o-mini (47.2% vs. 49.8%, p = 0.34), but the ranking reverses across models: Gemini gains +14pp from fu"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.29914","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.29914/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.29914","created_at":"2026-06-30T02:17:40.772223+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.29914v1","created_at":"2026-06-30T02:17:40.772223+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.29914","created_at":"2026-06-30T02:17:40.772223+00:00"},{"alias_kind":"pith_short_12","alias_value":"ECRKBHSRQLNC","created_at":"2026-06-30T02:17:40.772223+00:00"},{"alias_kind":"pith_short_16","alias_value":"ECRKBHSRQLNCLXZG","created_at":"2026-06-30T02:17:40.772223+00:00"},{"alias_kind":"pith_short_8","alias_value":"ECRKBHSR","created_at":"2026-06-30T02:17:40.772223+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ECRKBHSRQLNCLXZGKE3Z3FSAEQ","json":"https://pith.science/pith/ECRKBHSRQLNCLXZGKE3Z3FSAEQ.json","graph_json":"https://pith.science/api/pith-number/ECRKBHSRQLNCLXZGKE3Z3FSAEQ/graph.json","events_json":"https://pith.science/api/pith-number/ECRKBHSRQLNCLXZGKE3Z3FSAEQ/events.json","paper":"https://pith.science/paper/ECRKBHSR"},"agent_actions":{"view_html":"https://pith.science/pith/ECRKBHSRQLNCLXZGKE3Z3FSAEQ","download_json":"https://pith.science/pith/ECRKBHSRQLNCLXZGKE3Z3FSAEQ.json","view_paper":"https://pith.science/paper/ECRKBHSR","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.29914&json=true","fetch_graph":"https://pith.science/api/pith-number/ECRKBHSRQLNCLXZGKE3Z3FSAEQ/graph.json","fetch_events":"https://pith.science/api/pith-number/ECRKBHSRQLNCLXZGKE3Z3FSAEQ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ECRKBHSRQLNCLXZGKE3Z3FSAEQ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ECRKBHSRQLNCLXZGKE3Z3FSAEQ/action/storage_attestation","attest_author":"https://pith.science/pith/ECRKBHSRQLNCLXZGKE3Z3FSAEQ/action/author_attestation","sign_citation":"https://pith.science/pith/ECRKBHSRQLNCLXZGKE3Z3FSAEQ/action/citation_signature","submit_replication":"https://pith.science/pith/ECRKBHSRQLNCLXZGKE3Z3FSAEQ/action/replication_record"}},"created_at":"2026-06-30T02:17:40.772223+00:00","updated_at":"2026-06-30T02:17:40.772223+00:00"}