{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:AM5CTMEE2V7MJWXVZ2SV2PUNSC","short_pith_number":"pith:AM5CTMEE","schema_version":"1.0","canonical_sha256":"033a29b084d57ec4daf5cea55d3e8d90bc88bbfcbb1e94a8bbb82b16a7fd624e","source":{"kind":"arxiv","id":"2605.18740","version":1},"attestation_state":"computed","paper":{"title":"Vision-OPD: Learning to See Fine Details for Multimodal LLMs via On-Policy Self-Distillation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.LG"],"primary_cat":"cs.CV","authors_text":"Hongyu Lin, Jie Lou, Le Sun, Qianhao Yuan, Xianpei Han, Xing Yu, Yaojie Lu","submitted_at":"2026-05-18T17:57:04Z","abstract_excerpt":"Multimodal Large Language Models (MLLMs) still struggle with fine-grained visual understanding, where answers often depend on small but decisive evidence in the full image. We observe a regional-to-global perception gap: the same MLLM answers fine-grained questions more accurately when conditioned on evidence-centered crops than on the corresponding full images, suggesting that many failures stem from difficulty to focus on relevant evidence rather than insufficient local recognition ability. Motivated by this observation, we propose Vision-OPD (Vision On-Policy Distillation), a regional-to-gl"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.18740","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-18T17:57:04Z","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"title_canon_sha256":"da30378a87b9eaa6421123f1cd2520aeb4bbcee9e465da22493134b6c2050675","abstract_canon_sha256":"07aace2f706900f01f124afa67312008c99f09ec4633c35fd72b868be220b86a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:06:18.504169Z","signature_b64":"alVsX29CGkBDG4bqLfxdNBjAj9rk7ytgkhv2GR/yToEMTM4IZWkCAxJ/65+MSqpWcFfuhsXJn2r5ozExO80aCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"033a29b084d57ec4daf5cea55d3e8d90bc88bbfcbb1e94a8bbb82b16a7fd624e","last_reissued_at":"2026-05-20T00:06:18.503363Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:06:18.503363Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Vision-OPD: Learning to See Fine Details for Multimodal LLMs via On-Policy Self-Distillation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.LG"],"primary_cat":"cs.CV","authors_text":"Hongyu Lin, Jie Lou, Le Sun, Qianhao Yuan, Xianpei Han, Xing Yu, Yaojie Lu","submitted_at":"2026-05-18T17:57:04Z","abstract_excerpt":"Multimodal Large Language Models (MLLMs) still struggle with fine-grained visual understanding, where answers often depend on small but decisive evidence in the full image. We observe a regional-to-global perception gap: the same MLLM answers fine-grained questions more accurately when conditioned on evidence-centered crops than on the corresponding full images, suggesting that many failures stem from difficulty to focus on relevant evidence rather than insufficient local recognition ability. Motivated by this observation, we propose Vision-OPD (Vision On-Policy Distillation), a regional-to-gl"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.18740","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.18740/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"claim_evidence","ran_at":"2026-05-20T00:01:59.000399Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"399a1904d677a4c7b6c67a58d132b5fa5a0254287c5310fc523ee66ec505a85d"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.18740","created_at":"2026-05-20T00:06:18.503481+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.18740v1","created_at":"2026-05-20T00:06:18.503481+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.18740","created_at":"2026-05-20T00:06:18.503481+00:00"},{"alias_kind":"pith_short_12","alias_value":"AM5CTMEE2V7M","created_at":"2026-05-20T00:06:18.503481+00:00"},{"alias_kind":"pith_short_16","alias_value":"AM5CTMEE2V7MJWXV","created_at":"2026-05-20T00:06:18.503481+00:00"},{"alias_kind":"pith_short_8","alias_value":"AM5CTMEE","created_at":"2026-05-20T00:06:18.503481+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/AM5CTMEE2V7MJWXVZ2SV2PUNSC","json":"https://pith.science/pith/AM5CTMEE2V7MJWXVZ2SV2PUNSC.json","graph_json":"https://pith.science/api/pith-number/AM5CTMEE2V7MJWXVZ2SV2PUNSC/graph.json","events_json":"https://pith.science/api/pith-number/AM5CTMEE2V7MJWXVZ2SV2PUNSC/events.json","paper":"https://pith.science/paper/AM5CTMEE"},"agent_actions":{"view_html":"https://pith.science/pith/AM5CTMEE2V7MJWXVZ2SV2PUNSC","download_json":"https://pith.science/pith/AM5CTMEE2V7MJWXVZ2SV2PUNSC.json","view_paper":"https://pith.science/paper/AM5CTMEE","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.18740&json=true","fetch_graph":"https://pith.science/api/pith-number/AM5CTMEE2V7MJWXVZ2SV2PUNSC/graph.json","fetch_events":"https://pith.science/api/pith-number/AM5CTMEE2V7MJWXVZ2SV2PUNSC/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/AM5CTMEE2V7MJWXVZ2SV2PUNSC/action/timestamp_anchor","attest_storage":"https://pith.science/pith/AM5CTMEE2V7MJWXVZ2SV2PUNSC/action/storage_attestation","attest_author":"https://pith.science/pith/AM5CTMEE2V7MJWXVZ2SV2PUNSC/action/author_attestation","sign_citation":"https://pith.science/pith/AM5CTMEE2V7MJWXVZ2SV2PUNSC/action/citation_signature","submit_replication":"https://pith.science/pith/AM5CTMEE2V7MJWXVZ2SV2PUNSC/action/replication_record"}},"created_at":"2026-05-20T00:06:18.503481+00:00","updated_at":"2026-05-20T00:06:18.503481+00:00"}