{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:P5NQA5OZORBOOZHJNBVCWXAP2K","short_pith_number":"pith:P5NQA5OZ","schema_version":"1.0","canonical_sha256":"7f5b0075d97442e764e9686a2b5c0fd2bf98da05e378b423617eef72927de741","source":{"kind":"arxiv","id":"2602.05746","version":2},"attestation_state":"computed","paper":{"title":"Learning to Inject: Automated Prompt Injection via Reinforcement Learning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Florian Tram\\`er, Jie Zhang, Xin Chen","submitted_at":"2026-02-05T15:14:46Z","abstract_excerpt":"Prompt injection is a critical vulnerability in LLM agents, yet the strongest methods still rely on human red-teamers and hand-crafted prompts. Adapting automated jailbreak optimizers does not close this gap: jailbreaks shape models toward generic compliance, while prompt injection requires emitting specific tool calls with correct parameters. The success signal is binary, and randomly sampled suffixes almost never trigger it, so standard optimizers have no gradient to follow. We present AutoInject, a black-box reinforcement learning (RL) framework that learns adversarial suffixes for prompt i"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2602.05746","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-02-05T15:14:46Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"619f11ed364aa2b0b5bacf678efa819882e18d8d878107174b04ef3afe1f0ebe","abstract_canon_sha256":"8f209f45a249bf2c5815f22236aa7cf4e074ffb8e6e8e165573c5289eb8c8c33"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-11T01:09:28.663009Z","signature_b64":"zJZhBoJVx3jzp3RlIHNAtkWXpKnPwuAKnlWoCL6X7/H+ElAFjSDwC6Kw/99lvPQVDLb1WzgBzXniWI2OvFMnAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7f5b0075d97442e764e9686a2b5c0fd2bf98da05e378b423617eef72927de741","last_reissued_at":"2026-06-11T01:09:28.661988Z","signature_status":"signed_v1","first_computed_at":"2026-06-11T01:09:28.661988Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Learning to Inject: Automated Prompt Injection via Reinforcement Learning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Florian Tram\\`er, Jie Zhang, Xin Chen","submitted_at":"2026-02-05T15:14:46Z","abstract_excerpt":"Prompt injection is a critical vulnerability in LLM agents, yet the strongest methods still rely on human red-teamers and hand-crafted prompts. Adapting automated jailbreak optimizers does not close this gap: jailbreaks shape models toward generic compliance, while prompt injection requires emitting specific tool calls with correct parameters. The success signal is binary, and randomly sampled suffixes almost never trigger it, so standard optimizers have no gradient to follow. We present AutoInject, a black-box reinforcement learning (RL) framework that learns adversarial suffixes for prompt i"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.05746","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.05746/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2602.05746","created_at":"2026-06-11T01:09:28.662136+00:00"},{"alias_kind":"arxiv_version","alias_value":"2602.05746v2","created_at":"2026-06-11T01:09:28.662136+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.05746","created_at":"2026-06-11T01:09:28.662136+00:00"},{"alias_kind":"pith_short_12","alias_value":"P5NQA5OZORBO","created_at":"2026-06-11T01:09:28.662136+00:00"},{"alias_kind":"pith_short_16","alias_value":"P5NQA5OZORBOOZHJ","created_at":"2026-06-11T01:09:28.662136+00:00"},{"alias_kind":"pith_short_8","alias_value":"P5NQA5OZ","created_at":"2026-06-11T01:09:28.662136+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2605.28467","citing_title":"Mitigating Adaptive Attacks against Reasoning Models with Activation Consistency Training","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11039","citing_title":"The Granularity Mismatch in Agent Security: Argument-Level Provenance Solves Enforcement and Isolates the LLM Reasoning Bottleneck","ref_index":1,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/P5NQA5OZORBOOZHJNBVCWXAP2K","json":"https://pith.science/pith/P5NQA5OZORBOOZHJNBVCWXAP2K.json","graph_json":"https://pith.science/api/pith-number/P5NQA5OZORBOOZHJNBVCWXAP2K/graph.json","events_json":"https://pith.science/api/pith-number/P5NQA5OZORBOOZHJNBVCWXAP2K/events.json","paper":"https://pith.science/paper/P5NQA5OZ"},"agent_actions":{"view_html":"https://pith.science/pith/P5NQA5OZORBOOZHJNBVCWXAP2K","download_json":"https://pith.science/pith/P5NQA5OZORBOOZHJNBVCWXAP2K.json","view_paper":"https://pith.science/paper/P5NQA5OZ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2602.05746&json=true","fetch_graph":"https://pith.science/api/pith-number/P5NQA5OZORBOOZHJNBVCWXAP2K/graph.json","fetch_events":"https://pith.science/api/pith-number/P5NQA5OZORBOOZHJNBVCWXAP2K/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/P5NQA5OZORBOOZHJNBVCWXAP2K/action/timestamp_anchor","attest_storage":"https://pith.science/pith/P5NQA5OZORBOOZHJNBVCWXAP2K/action/storage_attestation","attest_author":"https://pith.science/pith/P5NQA5OZORBOOZHJNBVCWXAP2K/action/author_attestation","sign_citation":"https://pith.science/pith/P5NQA5OZORBOOZHJNBVCWXAP2K/action/citation_signature","submit_replication":"https://pith.science/pith/P5NQA5OZORBOOZHJNBVCWXAP2K/action/replication_record"}},"created_at":"2026-06-11T01:09:28.662136+00:00","updated_at":"2026-06-11T01:09:28.662136+00:00"}