{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:4RMMTDPLP2HUXUKBA223CB4AMZ","short_pith_number":"pith:4RMMTDPL","schema_version":"1.0","canonical_sha256":"e458c98deb7e8f4bd14106b5b10780667ed8f2e9f5b9668fa1f0368fec62d164","source":{"kind":"arxiv","id":"2505.23678","version":3},"attestation_state":"computed","paper":{"title":"Grounded Reinforcement Learning for Visual Reasoning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Aviral Kumar, Ayush Jain, Gabriel Sarch, Katerina Fragkiadaki, Michael J. Tarr, Naitik Khandelwal, Snigdha Saha","submitted_at":"2025-05-29T17:20:26Z","abstract_excerpt":"While reinforcement learning (RL) over chains of thought has significantly advanced language models in tasks such as mathematics and coding, visual reasoning introduces added complexity by requiring models to direct visual attention, interpret perceptual inputs, and ground abstract reasoning in spatial evidence. We introduce ViGoRL (Visually Grounded Reinforcement Learning), a vision-language model trained with RL to explicitly anchor each reasoning step to specific visual coordinates. Inspired by human visual decision-making, ViGoRL learns to produce spatially grounded reasoning traces, guidi"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2505.23678","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-05-29T17:20:26Z","cross_cats_sorted":[],"title_canon_sha256":"2325a750a619f2bf4918b0015036d01ea081a11617f88218dd52a918c36ecfb5","abstract_canon_sha256":"42e722d90a276cd68567a1bbef03a739c5d5cc0c04158056d5b4d03e9758c90a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:01:34.061935Z","signature_b64":"BrU4PjXRPN8vgyJNwWkVkq7cTc63phwjcFd99EmMslGqs4xXLnUsnCxPqYGgivwz9Pjh40QpU76RLwixUs6/AQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e458c98deb7e8f4bd14106b5b10780667ed8f2e9f5b9668fa1f0368fec62d164","last_reissued_at":"2026-05-20T00:01:34.061098Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:01:34.061098Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Grounded Reinforcement Learning for Visual Reasoning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Aviral Kumar, Ayush Jain, Gabriel Sarch, Katerina Fragkiadaki, Michael J. Tarr, Naitik Khandelwal, Snigdha Saha","submitted_at":"2025-05-29T17:20:26Z","abstract_excerpt":"While reinforcement learning (RL) over chains of thought has significantly advanced language models in tasks such as mathematics and coding, visual reasoning introduces added complexity by requiring models to direct visual attention, interpret perceptual inputs, and ground abstract reasoning in spatial evidence. We introduce ViGoRL (Visually Grounded Reinforcement Learning), a vision-language model trained with RL to explicitly anchor each reasoning step to specific visual coordinates. Inspired by human visual decision-making, ViGoRL learns to produce spatially grounded reasoning traces, guidi"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2505.23678","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2505.23678/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2505.23678","created_at":"2026-05-20T00:01:34.061223+00:00"},{"alias_kind":"arxiv_version","alias_value":"2505.23678v3","created_at":"2026-05-20T00:01:34.061223+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2505.23678","created_at":"2026-05-20T00:01:34.061223+00:00"},{"alias_kind":"pith_short_12","alias_value":"4RMMTDPLP2HU","created_at":"2026-05-20T00:01:34.061223+00:00"},{"alias_kind":"pith_short_16","alias_value":"4RMMTDPLP2HUXUKB","created_at":"2026-05-20T00:01:34.061223+00:00"},{"alias_kind":"pith_short_8","alias_value":"4RMMTDPL","created_at":"2026-05-20T00:01:34.061223+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":11,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"2605.19528","citing_title":"Towards Camera-Robust 3D Localization: Equation-Anchored Tool-Use for MLLMs","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14692","citing_title":"Chain-of-Glimpse: Search-Guided Progressive Object-Grounded Reasoning for Video Understanding","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15542","citing_title":"DRS-GUI: Dynamic Region Search for Training-Free GUI Grounding","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2509.22746","citing_title":"Mixture-of-Visual-Thoughts: Exploring Context-Adaptive Reasoning Mode Selection for General Visual Reasoning","ref_index":35,"is_internal_anchor":false},{"citing_arxiv_id":"2505.15436","citing_title":"Adaptive Chain-of-Focus Reasoning via Dynamic Visual Search and Zooming for Efficient VLMs","ref_index":36,"is_internal_anchor":false},{"citing_arxiv_id":"2605.13467","citing_title":"PDCR: Perception-Decomposed Confidence Reward for Vision-Language Reasoning","ref_index":27,"is_internal_anchor":false},{"citing_arxiv_id":"2605.10445","citing_title":"Uni-Synergy: Bridging Understanding and Generation for Personalized Reasoning via Co-operative Reinforcement Learning","ref_index":10,"is_internal_anchor":false},{"citing_arxiv_id":"2604.12896","citing_title":"Don't Show Pixels, Show Cues: Unlocking Visual Tool Reasoning in Language Models via Perception Programs","ref_index":17,"is_internal_anchor":false},{"citing_arxiv_id":"2604.04707","citing_title":"OpenWorldLib: A Unified Codebase and Definition of Advanced World Models","ref_index":105,"is_internal_anchor":false},{"citing_arxiv_id":"2604.14692","citing_title":"Chain-of-Glimpse: Search-Guided Progressive Object-Grounded Reasoning for Video Understanding","ref_index":9,"is_internal_anchor":false},{"citing_arxiv_id":"2605.02730","citing_title":"Perceptual Flow Network for Visually Grounded Reasoning","ref_index":43,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4RMMTDPLP2HUXUKBA223CB4AMZ","json":"https://pith.science/pith/4RMMTDPLP2HUXUKBA223CB4AMZ.json","graph_json":"https://pith.science/api/pith-number/4RMMTDPLP2HUXUKBA223CB4AMZ/graph.json","events_json":"https://pith.science/api/pith-number/4RMMTDPLP2HUXUKBA223CB4AMZ/events.json","paper":"https://pith.science/paper/4RMMTDPL"},"agent_actions":{"view_html":"https://pith.science/pith/4RMMTDPLP2HUXUKBA223CB4AMZ","download_json":"https://pith.science/pith/4RMMTDPLP2HUXUKBA223CB4AMZ.json","view_paper":"https://pith.science/paper/4RMMTDPL","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2505.23678&json=true","fetch_graph":"https://pith.science/api/pith-number/4RMMTDPLP2HUXUKBA223CB4AMZ/graph.json","fetch_events":"https://pith.science/api/pith-number/4RMMTDPLP2HUXUKBA223CB4AMZ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4RMMTDPLP2HUXUKBA223CB4AMZ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4RMMTDPLP2HUXUKBA223CB4AMZ/action/storage_attestation","attest_author":"https://pith.science/pith/4RMMTDPLP2HUXUKBA223CB4AMZ/action/author_attestation","sign_citation":"https://pith.science/pith/4RMMTDPLP2HUXUKBA223CB4AMZ/action/citation_signature","submit_replication":"https://pith.science/pith/4RMMTDPLP2HUXUKBA223CB4AMZ/action/replication_record"}},"created_at":"2026-05-20T00:01:34.061223+00:00","updated_at":"2026-05-20T00:01:34.061223+00:00"}