{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:RYDGGZC5NYXVCSUQ2D6UBH7NEI","short_pith_number":"pith:RYDGGZC5","schema_version":"1.0","canonical_sha256":"8e0663645d6e2f514a90d0fd409fed22359e0719609cc6065cc9d67126725ab2","source":{"kind":"arxiv","id":"2501.18322","version":2},"attestation_state":"computed","paper":{"title":"A Unified Perspective on the Dynamics of Deep Transformers","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["math.AP"],"primary_cat":"cs.LG","authors_text":"Gabriel Peyr\\'e, Jos\\'e Antonio Carrillo, Pierre Ablin, Val\\'erie Castin","submitted_at":"2025-01-30T13:04:54Z","abstract_excerpt":"Transformers, which are state-of-the-art in most machine learning tasks, represent the data as sequences of vectors called tokens. This representation is then exploited by the attention function, which learns dependencies between tokens and is key to the success of Transformers. However, the iterative application of attention across layers induces complex dynamics that remain to be fully understood. To analyze these dynamics, we identify each input sequence with a probability measure and model its evolution as a Vlasov equation called Transformer PDE, whose velocity field is non-linear in the "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2501.18322","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-01-30T13:04:54Z","cross_cats_sorted":["math.AP"],"title_canon_sha256":"8a9ffbc1701f0e830a7614903b2e85e69501b176edf327c6e3a8972318d20154","abstract_canon_sha256":"d541983e1a8e3a467ddef50e8861b730360f7e450d40b7810cc7ef39aa3a3e12"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:12:13.265938Z","signature_b64":"gAmiBEz+FbWzutClaoPJ1YCW+JmhzEZtOdsEnU851429codb+B/OpaAikX/Zj3Gq3rd7o7HUrWoESqIxamQYDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8e0663645d6e2f514a90d0fd409fed22359e0719609cc6065cc9d67126725ab2","last_reissued_at":"2026-06-19T16:12:13.265548Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:12:13.265548Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"A Unified Perspective on the Dynamics of Deep Transformers","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["math.AP"],"primary_cat":"cs.LG","authors_text":"Gabriel Peyr\\'e, Jos\\'e Antonio Carrillo, Pierre Ablin, Val\\'erie Castin","submitted_at":"2025-01-30T13:04:54Z","abstract_excerpt":"Transformers, which are state-of-the-art in most machine learning tasks, represent the data as sequences of vectors called tokens. This representation is then exploited by the attention function, which learns dependencies between tokens and is key to the success of Transformers. However, the iterative application of attention across layers induces complex dynamics that remain to be fully understood. To analyze these dynamics, we identify each input sequence with a probability measure and model its evolution as a Vlasov equation called Transformer PDE, whose velocity field is non-linear in the "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2501.18322","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2501.18322/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2501.18322","created_at":"2026-06-19T16:12:13.265605+00:00"},{"alias_kind":"arxiv_version","alias_value":"2501.18322v2","created_at":"2026-06-19T16:12:13.265605+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2501.18322","created_at":"2026-06-19T16:12:13.265605+00:00"},{"alias_kind":"pith_short_12","alias_value":"RYDGGZC5NYXV","created_at":"2026-06-19T16:12:13.265605+00:00"},{"alias_kind":"pith_short_16","alias_value":"RYDGGZC5NYXVCSUQ","created_at":"2026-06-19T16:12:13.265605+00:00"},{"alias_kind":"pith_short_8","alias_value":"RYDGGZC5","created_at":"2026-06-19T16:12:13.265605+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":11,"internal_anchor_count":11,"sample":[{"citing_arxiv_id":"2504.14697","citing_title":"Quantitative Clustering in Mean-Field Transformer Models","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2509.01685","citing_title":"Preconditioned Regularized Wasserstein Proximal Sampling","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15608","citing_title":"Transformer-like Inference from Optimal Control","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18870","citing_title":"Multi-Headed Transformer Architectures as Time-dependent Wasserstein Gradient Flows","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16747","citing_title":"Propagation of Chaos in Contextual Flow Maps","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11059","citing_title":"Uniform Scaling Limits in AdamW-Trained Transformers","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26898","citing_title":"Stochastic Scaling Limits and Synchronization by Noise in Deep Transformer Models","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10931","citing_title":"Quantifying Concentration Phenomena of Mean-Field Transformers in the Low-Temperature Regime","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10775","citing_title":"On the global convergence of gradient descent for wide shallow models with bounded nonlinearities","ref_index":85,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09213","citing_title":"Kinetic theory for Transformers and the lost-in-the-middle phenomenon","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26085","citing_title":"Spectral Selection in Symmetric Self-Attention Dynamics","ref_index":5,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/RYDGGZC5NYXVCSUQ2D6UBH7NEI","json":"https://pith.science/pith/RYDGGZC5NYXVCSUQ2D6UBH7NEI.json","graph_json":"https://pith.science/api/pith-number/RYDGGZC5NYXVCSUQ2D6UBH7NEI/graph.json","events_json":"https://pith.science/api/pith-number/RYDGGZC5NYXVCSUQ2D6UBH7NEI/events.json","paper":"https://pith.science/paper/RYDGGZC5"},"agent_actions":{"view_html":"https://pith.science/pith/RYDGGZC5NYXVCSUQ2D6UBH7NEI","download_json":"https://pith.science/pith/RYDGGZC5NYXVCSUQ2D6UBH7NEI.json","view_paper":"https://pith.science/paper/RYDGGZC5","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2501.18322&json=true","fetch_graph":"https://pith.science/api/pith-number/RYDGGZC5NYXVCSUQ2D6UBH7NEI/graph.json","fetch_events":"https://pith.science/api/pith-number/RYDGGZC5NYXVCSUQ2D6UBH7NEI/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/RYDGGZC5NYXVCSUQ2D6UBH7NEI/action/timestamp_anchor","attest_storage":"https://pith.science/pith/RYDGGZC5NYXVCSUQ2D6UBH7NEI/action/storage_attestation","attest_author":"https://pith.science/pith/RYDGGZC5NYXVCSUQ2D6UBH7NEI/action/author_attestation","sign_citation":"https://pith.science/pith/RYDGGZC5NYXVCSUQ2D6UBH7NEI/action/citation_signature","submit_replication":"https://pith.science/pith/RYDGGZC5NYXVCSUQ2D6UBH7NEI/action/replication_record"}},"created_at":"2026-06-19T16:12:13.265605+00:00","updated_at":"2026-06-19T16:12:13.265605+00:00"}