{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:ROSGYSMEV4EN54I3REH6LNWMPY","short_pith_number":"pith:ROSGYSME","schema_version":"1.0","canonical_sha256":"8ba46c4984af08def11b890fe5b6cc7e350d92f1e553098186af0f177a134f38","source":{"kind":"arxiv","id":"2605.13122","version":1},"attestation_state":"computed","paper":{"title":"Early Semantic Grounding in Image Editing Models for Zero-Shot Referring Image Segmentation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Instruction-based image editing models show strong foreground-background separability in their earliest internal features, enabling zero-shot referring image segmentation from a single denoising step.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chang Xu, Jingxuan He, Mengyu Zheng, Xiyu Wang, Yunke Wang","submitted_at":"2026-05-13T07:48:05Z","abstract_excerpt":"Instruction-based image editing (IIE) models have recently demonstrated strong capability in modifying specific image regions according to natural language instructions, which implicitly requires identifying where an edit should be applied. This indicates that such models inherently perform language-conditioned visual semantic grounding. In this work, we investigate whether this implicit grounding can be leveraged for zero-shot referring image segmentation (RIS), a task that requires pixel-level localization of objects described by natural language expressions. Through systematic analysis, we "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.13122","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T07:48:05Z","cross_cats_sorted":[],"title_canon_sha256":"f72f8d5e2f63100ced3025e4845c319ec11eeb88ea3501a2329ba11189e7d739","abstract_canon_sha256":"614f1fdbcde535dbabd02616efc86d5812e89a52ed3325aac570ea71a8af6888"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:08:57.948197Z","signature_b64":"hpD9wg/HWWm9rixyd0fywwpFHco2j0Od3LmF9BTmt/hSmKOVD0Ad1fzf/4G6H+7OUBJtBD6QAOBXCtgB8wG+CQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8ba46c4984af08def11b890fe5b6cc7e350d92f1e553098186af0f177a134f38","last_reissued_at":"2026-05-18T03:08:57.947708Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:08:57.947708Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Early Semantic Grounding in Image Editing Models for Zero-Shot Referring Image Segmentation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Instruction-based image editing models show strong foreground-background separability in their earliest internal features, enabling zero-shot referring image segmentation from a single denoising step.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chang Xu, Jingxuan He, Mengyu Zheng, Xiyu Wang, Yunke Wang","submitted_at":"2026-05-13T07:48:05Z","abstract_excerpt":"Instruction-based image editing (IIE) models have recently demonstrated strong capability in modifying specific image regions according to natural language instructions, which implicitly requires identifying where an edit should be applied. This indicates that such models inherently perform language-conditioned visual semantic grounding. In this work, we investigate whether this implicit grounding can be leveraged for zero-shot referring image segmentation (RIS), a task that requires pixel-level localization of objects described by natural language expressions. Through systematic analysis, we "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"strong foreground-background separability emerges in the internal representations of these models at the earliest denoising timestep, well before any visible image transformation occurs","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The feature-space separability observed at the earliest denoising timestep is sufficient to produce accurate pixel-level segmentation masks for arbitrary referring expressions without full synthesis or task-specific training.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Pretrained instruction-based image editing models exhibit early foreground-background separability that enables a training-free framework for zero-shot referring image segmentation using a single denoising step.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Instruction-based image editing models show strong foreground-background separability in their earliest internal features, enabling zero-shot referring image segmentation from a single denoising step.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"eabb67b17ba6465abf20fadc60b701d5a5722eab497e852acc1b8863f33b2221"},"source":{"id":"2605.13122","kind":"arxiv","version":1},"verdict":{"id":"8f5ee322-ee73-4548-a9d9-61cb1b829cbb","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T19:39:49.067021Z","strongest_claim":"strong foreground-background separability emerges in the internal representations of these models at the earliest denoising timestep, well before any visible image transformation occurs","one_line_summary":"Pretrained instruction-based image editing models exhibit early foreground-background separability that enables a training-free framework for zero-shot referring image segmentation using a single denoising step.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The feature-space separability observed at the earliest denoising timestep is sufficient to produce accurate pixel-level segmentation masks for arbitrary referring expressions without full synthesis or task-specific training.","pith_extraction_headline":"Instruction-based image editing models show strong foreground-background separability in their earliest internal features, enabling zero-shot referring image segmentation from a single denoising step."},"references":{"count":54,"sample":[{"doi":"","year":2025,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","ref_index":1,"cited_arxiv_id":"2502.13923","is_internal_anchor":true},{"doi":"","year":2023,"title":"Instructpix2pix: Learning to follow image editing instructions","work_id":"182396b0-39b5-4e35-adfd-c6dbe856535f","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Z-Image: An Efficient Image Generation Foundation Model with Single-Stream Diffusion Transformer","work_id":"f1080a62-48e1-4255-b023-7556be57370d","ref_index":3,"cited_arxiv_id":"2511.22699","is_internal_anchor":true},{"doi":"","year":2025,"title":"SAM 3: Segment Anything with Concepts","work_id":"4a72a006-2592-4554-aad0-a9c41a9f952d","ref_index":4,"cited_arxiv_id":"2511.16719","is_internal_anchor":true},{"doi":"","year":2025,"title":"Dvin: Dynamic visual routing network for weakly supervised referring expression comprehension","work_id":"aaa61b18-11cc-47b4-9cac-ccc82dc480a2","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":54,"snapshot_sha256":"93726049bd15d0d910069735869305e6899abdbcf7b2b09790aa20c5b368813c","internal_anchors":8},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.13122","created_at":"2026-05-18T03:08:57.947774+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.13122v1","created_at":"2026-05-18T03:08:57.947774+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13122","created_at":"2026-05-18T03:08:57.947774+00:00"},{"alias_kind":"pith_short_12","alias_value":"ROSGYSMEV4EN","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"ROSGYSMEV4EN54I3","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"ROSGYSME","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ROSGYSMEV4EN54I3REH6LNWMPY","json":"https://pith.science/pith/ROSGYSMEV4EN54I3REH6LNWMPY.json","graph_json":"https://pith.science/api/pith-number/ROSGYSMEV4EN54I3REH6LNWMPY/graph.json","events_json":"https://pith.science/api/pith-number/ROSGYSMEV4EN54I3REH6LNWMPY/events.json","paper":"https://pith.science/paper/ROSGYSME"},"agent_actions":{"view_html":"https://pith.science/pith/ROSGYSMEV4EN54I3REH6LNWMPY","download_json":"https://pith.science/pith/ROSGYSMEV4EN54I3REH6LNWMPY.json","view_paper":"https://pith.science/paper/ROSGYSME","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.13122&json=true","fetch_graph":"https://pith.science/api/pith-number/ROSGYSMEV4EN54I3REH6LNWMPY/graph.json","fetch_events":"https://pith.science/api/pith-number/ROSGYSMEV4EN54I3REH6LNWMPY/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ROSGYSMEV4EN54I3REH6LNWMPY/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ROSGYSMEV4EN54I3REH6LNWMPY/action/storage_attestation","attest_author":"https://pith.science/pith/ROSGYSMEV4EN54I3REH6LNWMPY/action/author_attestation","sign_citation":"https://pith.science/pith/ROSGYSMEV4EN54I3REH6LNWMPY/action/citation_signature","submit_replication":"https://pith.science/pith/ROSGYSMEV4EN54I3REH6LNWMPY/action/replication_record"}},"created_at":"2026-05-18T03:08:57.947774+00:00","updated_at":"2026-05-18T03:08:57.947774+00:00"}