{"work":{"id":"b928e041-6991-4c08-8c81-0359e4097c7b","openalex_id":null,"doi":null,"arxiv_id":"2303.08774","raw_key":null,"title":"GPT-4 Technical Report","authors":null,"authors_text":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Floren- cia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al","year":2023,"venue":"cs.CL","abstract":"We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer-based model pre-trained to predict the next token in a document. The post-training alignment process results in improved performance on measures of factuality and adherence to desired behavior. A core component of this project was developing infrastructure and optimization methods that behave predictably across a wide range of scales. This allowed us to accurately predict some aspects of GPT-4's performance based on models trained with no more than 1/1,000th the compute of GPT-4.","external_url":"https://arxiv.org/abs/2303.08774","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-30T03:04:13.892250+00:00","pith_arxiv_id":"2303.08774","created_at":"2026-05-08T19:09:02.912635+00:00","updated_at":"2026-06-30T03:04:13.892250+00:00","title_quality_ok":false,"display_title":"GPT-4 Technical Report","render_title":"GPT-4 Technical Report"},"hub":{"state":{"work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","tier":"mega_hub","tier_reason":"1,000+ Pith inbound or 100,000+ external citations","pith_inbound_count":1968,"external_cited_by_count":null,"distinct_field_count":61,"first_pith_cited_at":"2022-11-22T21:06:00+00:00","last_pith_cited_at":"2026-06-28T02:28:10+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"needed","recognition_status":"needed","updated_at":"2026-06-30T03:49:24.806239+00:00","tier_text":"mega_hub"},"tier":"mega_hub","role_counts":[{"context_role":"background","n":401},{"context_role":"method","n":44},{"context_role":"baseline","n":40},{"context_role":"dataset","n":10},{"context_role":"other","n":5},{"context_role":"extension","n":1}],"polarity_counts":[{"context_polarity":"background","n":382},{"context_polarity":"use_method","n":43},{"context_polarity":"baseline","n":40},{"context_polarity":"unclear","n":21},{"context_polarity":"use_dataset","n":10},{"context_polarity":"support","n":4},{"context_polarity":"extend","n":1}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"GPT-4 Technical Report","claims":[{"claim_text":"We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer-based model pre-trained to predict the next token in a document. The post-training alignment process results in improved performance on measures of factuality and adherence to desired behavior. A core compone","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GPT-4 Technical Report because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:43:29.598530+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"0e863d73-ef47-4c74-a918-b0604f1e262a","orcid":null,"display_name":"Josh Achiam"},{"id":"9c33cb4d-bb3b-449d-8dbf-54ce35480fdd","orcid":null,"display_name":"Steven Adler"},{"id":"a38b4d86-1e7d-4c3f-9d8f-f17a82ad89ef","orcid":null,"display_name":"Sandhini Agarwal"},{"id":"a6a44120-a496-4985-81c5-8dfed0f5086a","orcid":null,"display_name":"Lama Ahmad"},{"id":"99b7481f-655a-44b0-928c-b0886021bf12","orcid":null,"display_name":"Ilge Akkaya"},{"id":"f207baee-5035-4549-8e68-001275113125","orcid":null,"display_name":"Floren- cia Leoni Aleman"}]},"error":null,"updated_at":"2026-05-13T17:24:02.706479+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T17:43:29.590252+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":151},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":145},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":121},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":115},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":114},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":111},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":93},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":86},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":83},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":78},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":78},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":75},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":72},{"title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","work_id":"80e3e977-f1bb-4c83-8d0c-1ab0a0c5c3f1","shared_citers":65},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":64},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":62},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":54},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":53},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":51},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":48},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":44},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":41},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":39},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":38}],"time_series":[{"n":1,"year":2022},{"n":21,"year":2023},{"n":39,"year":2024},{"n":22,"year":2025},{"n":694,"year":2026}]},"error":null,"updated_at":"2026-05-13T17:25:53.718095+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T17:43:28.834030+00:00"},"reader_index":{"job_type":"reader_index","status":"succeeded","result":{"note":"annotated reader requires full-text/OA fetch; shell is wired for mega hubs","status":"reader queued"},"error":null,"updated_at":"2026-05-16T02:58:40.487971+00:00"},"recognition_alignment":{"job_type":"recognition_alignment","status":"succeeded","result":{"modules":["IndisputableMonolith.Sport.PeakPerformanceFromJCost","IndisputableMonolith.Sports.PeakPerformanceFromPhiLadder","IndisputableMonolith.Sociology.DunbarFromBandwidth","IndisputableMonolith.Physics.DarkMatterCrossSectionBandScoreCard","IndisputableMonolith.Physics.StandardModelGroupStructure","IndisputableMonolith.Physics.StandardModelLagrangianStructure","IndisputableMonolith.Common.CanonicalJBand","IndisputableMonolith.Materials.RoomTSuperconductorCandidate"],"query_chars":899},"error":null,"updated_at":"2026-05-16T02:58:40.485486+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"GPT-4 Technical Report","claims":[{"claim_text":"We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer-based model pre-trained to predict the next token in a document. The post-training alignment process results in improved performance on measures of factuality and adherence to desired behavior. A core compone","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GPT-4 Technical Report because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:43:29.595207+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"GPT-4 Technical Report","claims":[{"claim_text":"We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer-based model pre-trained to predict the next token in a document. The post-training alignment process results in improved performance on measures of factuality and adherence to desired behavior. A core compone","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GPT-4 Technical Report because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:25:52.705426+00:00"}},"summary":{"title":"GPT-4 Technical Report","claims":[{"claim_text":"We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer-based model pre-trained to predict the next token in a document. The post-training alignment process results in improved performance on measures of factuality and adherence to desired behavior. A core compone","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GPT-4 Technical Report because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":151},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":145},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":121},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":115},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":114},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":111},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":93},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":86},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":83},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":78},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":78},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":75},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":72},{"title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","work_id":"80e3e977-f1bb-4c83-8d0c-1ab0a0c5c3f1","shared_citers":65},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":64},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":62},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":54},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":53},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":51},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":48},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":44},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":41},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":39},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":38}],"time_series":[{"n":1,"year":2022},{"n":21,"year":2023},{"n":39,"year":2024},{"n":22,"year":2025},{"n":694,"year":2026}]},"authors":[{"id":"f207baee-5035-4549-8e68-001275113125","orcid":null,"display_name":"Floren- cia Leoni Aleman","source":"manual","import_confidence":0.72},{"id":"99b7481f-655a-44b0-928c-b0886021bf12","orcid":null,"display_name":"Ilge Akkaya","source":"manual","import_confidence":0.72},{"id":"0e863d73-ef47-4c74-a918-b0604f1e262a","orcid":null,"display_name":"Josh Achiam","source":"manual","import_confidence":0.72},{"id":"a6a44120-a496-4985-81c5-8dfed0f5086a","orcid":null,"display_name":"Lama Ahmad","source":"manual","import_confidence":0.72},{"id":"a38b4d86-1e7d-4c3f-9d8f-f17a82ad89ef","orcid":null,"display_name":"Sandhini Agarwal","source":"manual","import_confidence":0.72},{"id":"9c33cb4d-bb3b-449d-8dbf-54ce35480fdd","orcid":null,"display_name":"Steven Adler","source":"manual","import_confidence":0.72}]},"citers":{"total":1968,"items":[{"citing_arxiv_id":"2606.29155","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OASIF: An Efficient Obfuscation-Aware Self-Improving Framework for LLM-Based Assembly Code Instruction Following and Comprehension","primary_cat":"cs.SE","submitted_at":"2026-06-28T02:28:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OASIF improves open-source LLMs on obfuscated assembly comprehension by 5-17 percentage points on commercial VM obfuscators via a three-phase self-evolving training pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27974","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ProMSA:Progressive Multimodal Search Agents for Knowledge-Based Visual Question Answering","primary_cat":"cs.CV","submitted_at":"2026-06-26T11:23:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ProMSA is a progressive multimodal search agent for KB-VQA that iteratively selects search tools under budgets, trained via rejection-sampling SFT then TN-GSPO RL, reporting gains on E-VQA and InfoSeek over RAG baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27871","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LocalNav: Distilling Frontier VLMs and Embodied RL for On-Device Object Goal Navigation","primary_cat":"cs.RO","submitted_at":"2026-06-26T09:11:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Distillation from frontier VLMs plus E-RLVR regularization produces a 4B local model that achieves 34.5% SR on OVON while cutting inference latency by 82.8%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27829","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CSD: Content-aware Speculative Decoding for Efficient Image Generation","primary_cat":"cs.CV","submitted_at":"2026-06-26T08:12:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"CSD adds content-aware entropy relaxation and a distribution alignment filter to speculative decoding, raising acceptance rates in low-detail image areas while keeping output aligned with the target model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27806","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Grounded Iterative Language Planning: How Parameterized World Models Reduce Hallucination Propagation in LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-06-26T07:45:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GILP combines a small parameterized world model with LLM agent reasoning via a consistency gate, reducing hallucinated-state rate from 0.176 to 0.035 and raising success from 0.668 to 0.838 on graph planning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27786","ref_index":81,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SHIFT: Gate-Modulated Activation Steering for Knowledge Conflict Mitigation in Retrieval-Augmented Generation","primary_cat":"cs.CL","submitted_at":"2026-06-26T07:17:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SHIFT reformulates neuron editing as learnable gate modulation on under 0.01% parameters to let LLMs adaptively balance contextual and parametric knowledge during RAG generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27736","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ToE: A Hierarchical and Explainable Claim Verification Framework with Dynamic Multi-source Evidence Retrieval and Aggregation","primary_cat":"cs.AI","submitted_at":"2026-06-26T05:35:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ToE is a hierarchical claim verification framework using RL-driven multi-source retrieval, evidence evaluation, and tree aggregation that reports 4-24 point gains over baselines especially on poisoned inputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27632","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Yuvion LLM: An Adversarially-Aware Large Language Model for Content And AI Safety","primary_cat":"cs.CL","submitted_at":"2026-06-26T01:12:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Yuvion LLM applies adversarially aware training and introduces the YLRE benchmark set, claiming superior safety robustness over larger models on multiple tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26566","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Adversarial Diffusion Across Modalities: A Fusion Survey of Attacks, Defenses, and Evaluation for Text, Vision, and Vision-Language Models","primary_cat":"cs.CR","submitted_at":"2026-06-25T03:32:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A narrative survey that catalogs fifty papers on diffusion-based adversarial techniques across text, vision, and vision-language models, proposes a six-class taxonomy of diffusion roles plus a unified five-dimension evaluation framework, and releases a companion catalog.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26551","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PhyEditBench: A Real-World Multi-Stage Benchmark for Physics-Aware Image Editing","primary_cat":"cs.CV","submitted_at":"2026-06-25T02:57:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PhyEditBench is a new benchmark for physics-aware image editing with real and synthetic instances plus a training-free PhyWorld baseline that uses test-time scaling to outperform SOTA models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26396","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"At the Edge of Understanding: Sparse Autoencoders Trace The Limits of Transformer Generalization","primary_cat":"cs.LG","submitted_at":"2026-06-24T21:26:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Sparse autoencoders show OOD prompts increase fallacious concept activation in transformers, offering a mechanistic measure of shift and a path to robust fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25927","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Knowledge Cascade: Reverse Knowledge Distillation on Nonparametric Multivariate Functional Estimation","primary_cat":"stat.ME","submitted_at":"2026-06-24T15:06:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"KCas transfers student-selected smoothing parameters to full-sample teacher models via asymptotic scaling laws in smoothing splines and kernel methods, cutting computation while retaining performance guarantees.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22942","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Understanding Knowledge Distillation in Post-Training: When It Helps and When It Fails","primary_cat":"cs.CL","submitted_at":"2026-06-22T07:19:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"KD outperforms SFT for LLM post-training in low-data regimes but the advantage fades with abundant data unless the teacher is stronger; a two-stage strategy aids domain-specific low-resource cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21059","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DEFENGRAPH: Knowledge Graph-Enhanced LLMs for Blue Team Cyber Defense","primary_cat":"cs.CR","submitted_at":"2026-06-19T03:10:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DEFENGRAPH integrates a dual-layer static-dynamic KG with LLMs via path retrieval, filtering, and re-ranking, raising reasoning-recall from 61.45% to 73.49% and ticket-action recall from 52.17% to 72.46% on GPT-4o in live red-blue cyber range data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20299","ref_index":291,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Statistical Properties of Training & Generalization","primary_cat":"stat.ML","submitted_at":"2026-06-18T14:35:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Neural scaling laws in deep learning interact with physics constraints and inductive biases beyond classical statistics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20173","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Qiskit Code Migration with LLMs","primary_cat":"cs.SE","submitted_at":"2026-06-18T12:40:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A taxonomy-guided RAG system with LLMs reduces hallucinations and improves migration suggestions for Qiskit code compared to unconstrained retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19988","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Repository-Level Solidity Code Generation with Large Language Models: From Prompting to Fine-Tuning","primary_cat":"cs.SE","submitted_at":"2026-06-18T09:28:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces SolidityBench benchmark and SolidityScore metric for repository-level Solidity code generation, finding supervised fine-tuning outperforms prompting, CoT, ICL, and RAG methods on evaluated LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19847","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AtomMem: Building Simple and Effective Memory System for LLM Agents via Atomic Facts","primary_cat":"cs.CL","submitted_at":"2026-06-18T06:56:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"AtomMem introduces atomic-fact extraction, hierarchical event structures, and an associative memory graph to build stable long-term memory for LLM agents, claiming SOTA results on the LoCoMo benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19640","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Creating Multilingual Mental Health Dialogue Datasets: Limits of Persona-Based Localization via Nationality and Language","primary_cat":"cs.CL","submitted_at":"2026-06-17T22:36:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Modifying nationality and language parameters in English-centric personas for mental health dialogues introduces clinical inconsistencies across languages and causes LLM judges to perform inaccurately on non-English depression severity assessments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18406","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoreMem: Riemannian Retrieval and Fisher-Guided Distillation for Long-Term Memory in Dialogue Agents","primary_cat":"cs.CL","submitted_at":"2026-06-16T18:56:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CoreMem replaces cosine retrieval with Fisher-Rao Riemannian matching and introduces Fisher-guided discrete token distillation for syntax-aware compression, reporting +4.51 pp open-domain and +4.17 pp temporal gains on LOCOMO and LongMemEval-S while staying inside an 8 GB VRAM budget.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26130","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Thinking Like a Scientist? A Structural Study of LLM-Generated Research Methods","primary_cat":"cs.CL","submitted_at":"2026-06-15T14:53:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs given only research questions from 1000 arXiv CS papers recommend a narrower set of methods than the original papers, with effective model-entity diversity dropping from 1232 to 59-96 and stronger agreement among LLMs than with papers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11953","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decoding Multimodal Cues: Unveiling the Implicit Meaning Behind Hateful Videos","primary_cat":"cs.CL","submitted_at":"2026-06-10T11:28:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces Ex-HateMM and Ex-ImpliHateVid datasets and the IARE framework using multimodal CoT and DPO to achieve explainable hateful video detection with claimed SOTA performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11167","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Faceted Interactivity Alignment in Full-Duplex Speech Models","primary_cat":"cs.CL","submitted_at":"2026-06-09T17:46:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A multi-axis RL alignment technique improves pause handling, turn-taking, backchanneling, and interruption response in full-duplex spoken dialogue models by optimizing axis-specific rewards derived from human audio segments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09508","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Rigid to Dynamic: Entropy-Guided Adaptive Inference for Long-Context LLMs","primary_cat":"cs.AI","submitted_at":"2026-06-08T14:02:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EntropyInfer adaptively allocates inference compute using per-head attention entropy for rigid/dynamic classification during prefilling and compresses KV cache with generated tokens, achieving up to 2.39x speedup on long contexts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08783","ref_index":62,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OptMuon: Closed-Loop Orthogonalized Momentum Methods for Stochastic Optimization with Zero-Noise Optimality","primary_cat":"math.OC","submitted_at":"2026-06-07T18:59:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OptMuon combines orthogonalized momentum with trajectory-dependent AdaGrad-Norm adaptation to obtain expected-stationarity rates of order T^{-1/2} + sigma^{1/2}T^{-1/4} or T^{-1/2} + sigma^{1/3}T^{-1/3} that reduce to near-optimal deterministic first-order rates in the zero-noise regime.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07995","ref_index":87,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Customer-Agent: Overcoming Context Limitations in Ultra-Long Shopping Trajectories via Tool-Augmented Agents and RLVR","primary_cat":"cs.CL","submitted_at":"2026-06-06T06:22:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces ShopTrajQA long-context benchmark and an RLVR-trained tool-augmented agent that bypasses LLM context limits by external file storage and code-based retrieval for shopping trajectories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07246","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MailoHLS: Multi-Adapter Structure-Aware Learning for Pareto-Driven HLS Pragma Optimization","primary_cat":"cs.AR","submitted_at":"2026-06-05T13:15:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MailoHLS combines LLM semantic reasoning and GNN structural modeling with multi-adapter PEFT and Pareto optimization to produce near-Pareto-optimal HLS pragma configurations, reporting up to 12.42x latency speedup on seen kernels and 10.2x on unseen ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05748","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UNIVID: Unified Vision-Language Model for Video Moderation","primary_cat":"cs.MM","submitted_at":"2026-06-04T06:20:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"UNIVID generates policy-aware captions for video moderation, reducing violation leakage by 42.7% and overkill rate by 37.0% while replacing over 1,000 policy-specific models with a single backbone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09881","ref_index":268,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Toward Calibrated, Fair, and accurate Deepfake Detection","primary_cat":"cs.LG","submitted_at":"2026-06-03T05:44:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Face-Feature Tuning is a label-free logit remapping method that reduces FPR/TPR gaps across groups in deepfake detection while preserving overall accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04302","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LazyAttention: Efficient Retrieval-Augmented Generation with Deferred Positional Encoding","primary_cat":"cs.CL","submitted_at":"2026-06-03T00:12:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LazyAttention kernelizes deferred positional encoding to enable zero-copy, position-agnostic KV cache reuse, delivering 1.37× lower TTFT and 1.40× higher throughput than Block-Attention under skewed document distributions while preserving output quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01451","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Before and After Temperature: A Distributional View of Creative LLM Generation","primary_cat":"cs.CL","submitted_at":"2026-05-31T21:13:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A per-token feature from temperature-induced changes in LLM token distributions predicts within-prompt creativity rank at Spearman rho 0.918 vs LLM judges and 0.870 vs humans, outperforming perplexity, entropy, top-1 margin, and compression baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01414","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agent Skills Should Go Beyond Text: The Case for Visual Skills","primary_cat":"cs.CV","submitted_at":"2026-05-31T19:22:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper proposes that reusable agent skills should incorporate visual elements alongside text, introduces three forms of visual skills and an automatic conversion system, and reports better performance on GUI and visual-centric tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01301","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Med-HEAL: Analyzing and Mitigating Hallucinations in Medical LLMs with Hallucination-Aware In-Context Learning","primary_cat":"cs.CL","submitted_at":"2026-05-31T15:43:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Med-HEAL builds a hallucination dataset from BioMistral answers on EHRNoteQA via GPT-4o and human review, then shows self-critique improves accuracy in three of five tested LLMs without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01208","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Schema-Agnostic Knowledge Graph Construction via Hybrid Ontology Discovery for Cyber Threat Intelligence","primary_cat":"cs.CR","submitted_at":"2026-05-31T12:56:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ANCHOR uses hybrid ontology discovery and SHACL validation to build schema-agnostic CTI knowledge graphs, outperforming baselines on UCO/STIX/MALOnt while matching enterprise LLM performance with local models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01155","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Data Is Scarce: Scaling Sparse Language Models with Repeated Training","primary_cat":"cs.LG","submitted_at":"2026-05-31T10:51:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sparse LLMs in data-scarce multi-epoch regimes follow a scaling law based on active parameters, unique tokens, repetition count, and sparsity level that predicts performance and delays data saturation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01053","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AnyEdit++: Adaptive Long-Form Knowledge Editing via Bayesian Surprise","primary_cat":"cs.AI","submitted_at":"2026-05-31T06:48:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AnyEdit++ proposes Bayes-Chunk, an adaptive segmentation method based on Bayesian Surprise, with theoretical claims of structural independence and causal locality, reporting superior results over baselines on math, code, and narrative tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01050","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TextFake: Benchmarking AI-Generated Image Detection on Text-Rich Images","primary_cat":"cs.CV","submitted_at":"2026-05-31T06:42:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TextFake benchmark shows no AI-generated image detector exceeds 80% accuracy on text-rich images and identifies three failure modes including text density and rendering fidelity issues.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00931","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CV-Arena: An Open Benchmark for Instructional Computer Vision Problem Solving with Human-AI Collaborative Preferences","primary_cat":"cs.CV","submitted_at":"2026-05-30T23:37:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CV-Arena is a new 12K-pair benchmark for instruction-guided real-image editing with 16 task types, CogRetriever curation, and Active Elo mixed human-AI evaluation that finds gaps in 21 models and presents CV-Agent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07630","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Active Learning with Foundation Model Priors: Efficient Learning under Class Imbalance","primary_cat":"cs.LG","submitted_at":"2026-05-30T23:34:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Active learning with foundation model priors achieves over 50% annotation savings on imbalanced noisy datasets across image and text domains while maintaining performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00925","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Benchmarking Security Risk Detection and Verification in Open Agentic Skill Ecosystems","primary_cat":"cs.CR","submitted_at":"2026-05-30T23:19:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SkillVetBench is a two-stage benchmark combining natural-language semantic vetting and instrumented sandbox execution to detect and provide runtime evidence for malicious skills in open agent platforms, with experiments showing static methods miss up to 89% of threats.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24894","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RWGBench: Evaluating Scholarly Positioning in Related Work Generation","primary_cat":"cs.DL","submitted_at":"2026-05-30T16:53:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RWGBench is a citation-centric benchmark for related work generation built from 40k CS papers and a 100-paper test set, with multi-dimensional metrics that better match human expert judgment than standard similarity scores.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00756","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoMIC: Collaborative Memory and Insights Circulation for Long-Horizon LLM Agents in Cloud-Edge Systems","primary_cat":"cs.AI","submitted_at":"2026-05-30T14:45:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CoMIC is a parameter-free cloud-edge framework that circulates memory and insights between edge agents and a central critic to improve long-horizon LLM agent performance on symbolic and text tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07623","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Finite Certificates for In-Context Determinacy and a Threshold Theory of Emergence in Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-30T14:07:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proves row-space criterion for finite determinacy in linear finite-field tasks, NP-completeness of minimal forcing subcontext, and anti-mirage theorem separating threshold metrics from semantic confidence via Keisler measures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00708","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MOSAIC: Modular Orchestration for Structured Agentic Intelligence and Composition","primary_cat":"cs.AI","submitted_at":"2026-05-30T12:31:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MOSAIC structures LLM-based model selection via memory-grounded blueprints and failure-aware RL, reporting gains in performance and traceability on financial time-series tasks over AutoML and agent baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00651","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MESA: Improving MoE Safety Alignment via Decentralized Expertise","primary_cat":"cs.LG","submitted_at":"2026-05-30T09:54:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MESA decentralizes safety duties in MoE LLMs via expert capacity reallocation and dynamic routing refinement based on optimal transport theory, yielding robust defense on harmful benchmarks while preserving helpfulness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00640","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"An Attribute-Based Measure of Video Complexity","primary_cat":"cs.CV","submitted_at":"2026-05-30T09:30:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VideoABC estimates video-LLM failure probability via low-dimensional attribute projection, dual quantization (k-means plus lattice), and psychophysics-inspired synthetic data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00619","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MemPro: Agentic Memory Systems as Evolvable Programs","primary_cat":"cs.CL","submitted_at":"2026-05-30T08:47:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MemPro evolves the entire MCR pipeline as runnable programs via failure-guided refinement on a version tree and outperforms static baselines on LongMemEval, LoCoMo, HotpotQA, and NarrativeQA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00576","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Resilient Spatio-Semantic Memory with Hybrid Localization for Mobile Manipulation","primary_cat":"cs.RO","submitted_at":"2026-05-30T06:58:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DREAM is a mobile manipulation system that constructs online spatio-semantic voxel memory with redundancy-aware pruning and hybrid language-vision localization, reporting higher long-horizon success rates than DynaMem in dynamic lab scenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00570","ref_index":74,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Revisiting Parameter-Based Knowledge Editing in Large Language Models: Theoretical Limits and Empirical Evidence","primary_cat":"cs.CL","submitted_at":"2026-05-30T06:44:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Parameter-based knowledge editing in LLMs induces reasoning collapse via dimensional collapse and is consistently outperformed by a retrieval baseline across varied edit counts, knowledge complexity, and evaluation metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00523","ref_index":90,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ProactiveLLM: Learning Active Interaction for Streaming Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-30T04:31:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ProactiveLLM enables active interaction in streaming LLMs by learning semantic sufficiency cues from partial inputs through mask-based modeling and synchronized privileged self-distillation without external supervision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00518","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Acting with AI: An Interaction-Based Framework for Agentic Tort Liability","primary_cat":"cs.AI","submitted_at":"2026-05-30T04:17:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Proposes mapping agentic AI harms onto tort doctrines via three interaction types (autonomous drift, pure tool use, collaborative planning) using interaction logs as evidence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00359","ref_index":76,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Next-Billion AI Index: The compass for AI utility and adoption in the global majority","primary_cat":"cs.CY","submitted_at":"2026-05-29T21:01:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces nexbax, a diagnostic framework with three themes and 10 dimensions for evaluating AI economic viability, operational practicality, and societal integrity in next-billion-user contexts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31598","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Linear Scaling Video VLMs for Long Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-29T17:59:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StateKV is an inference-time technique that replaces quadratic self-attention prefill in video VLMs with a fixed-capacity importance-based recurrent state, keeping accuracy near full attention on long-video benchmarks without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31437","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Astra: a generalizable report generation foundation model for 3D computed tomography","primary_cat":"cs.CV","submitted_at":"2026-05-29T15:35:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Astra is a 3D CT vision-language foundation model trained on 90,678 thoracoabdominal scans that claims 44.1% better diagnostic metrics on internal and six external cohorts plus 29.6% faster chest reporting in real workflows.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11238","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Artificial Intelligence in Ship Finance: Applications, Opportunities, and a Case Study in AI-Augmented Loan Origination","primary_cat":"q-fin.GN","submitted_at":"2026-05-29T12:29:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Reviews AI applications in ship finance and presents ShipFinance.ai, a modular LLM-based agentic architecture for automating loan application workflows.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31174","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Detect in Any Scene: An Agentic Framework for Object Detection with Experience-Aware Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-29T11:41:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DetAS-X uses an MLLM agent to adaptively compose detection workflows from restoration modules and expert detectors, enhanced by self-evolving experience harvesting, achieving substantial F1 score gains on challenging benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07602","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sample-Efficient Post-Training for LEGO Spatial-Physics Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-29T09:31:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PVPO is a sample-efficient RL method that improves semantic, geometric, and physical quality in LLM LEGO assembly generation by mitigating the PhysHack failure mode where validity alone fails to ensure fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31035","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MixFP4: Enhancing NVFP4 with Adaptive FP4/INT4 Block Representations","primary_cat":"cs.AR","submitted_at":"2026-05-29T09:05:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MixFP4 extends NVFP4 by adaptively selecting between two FP4 micro-formats per block using repurposed scale sign bits and a unified E2M2 compute path, claiming better accuracy than standard NVFP4 at 3.1% area and 1.5% power overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30961","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvoGens: A Population-Based Heuristic Search Framework for Scientific Idea Generation","primary_cat":"cs.CL","submitted_at":"2026-05-29T07:56:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EvoGens uses rank-based mutation, semantic-aware crossover, and lightweight evaluation to evolve populations of LLM-generated scientific ideas, boosting novelty and diversity metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07600","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reachability and asymptotics of Gaussian Transformer dynamics","primary_cat":"cs.LG","submitted_at":"2026-05-29T07:51:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Gaussian distributions are invariant under the mean-field Transformer flow, reducing infinite-dimensional dynamics to a bilinear control system on mean and covariance with explicit reachability and stability results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30931","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MineExplorer: Evaluating Open-World Exploration of MLLM Agents in Minecraft","primary_cat":"cs.CL","submitted_at":"2026-05-29T07:21:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MineExplorer is a new benchmark for MLLM agents' open-world exploration in Minecraft, using task filtering, ReAct formulation, and multi-agent synthesis to create reliable multi-hop instances.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30919","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"De-attribute to Forget for LLM Unlearning","primary_cat":"cs.LG","submitted_at":"2026-05-29T07:03:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DareU reframes LLM unlearning as zeroing data attribution via RL rewards from an LLM classifier approximation, claiming better balance of forget quality and model utility than loss-based baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30889","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MLIPilot: LLM-Driven Auto-Research for Machine-Learned Interatomic Potentials","primary_cat":"physics.chem-ph","submitted_at":"2026-05-29T06:25:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MLIPilot deploys LLM agents to autonomously optimize MACE MLIP training on molecular and periodic datasets by proposing code edits and validating against a domain-specific scorecard.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07597","ref_index":66,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Repetition Mismatch: Why Data Mixture Experiments Don't Scale and How to Fix Them","primary_cat":"cs.LG","submitted_at":"2026-05-29T06:08:57+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Repetition rate mismatch between small-scale proxies and target budgets is the main reason data mixture experiments do not scale; a subsampling procedure that equalizes repetition rates recovers optimal mixtures from 1/16-scale experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30876","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"dMoE: dLLMs with Learnable Block Experts","primary_cat":"cs.CL","submitted_at":"2026-05-29T06:03:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"dMoE aggregates token expert distributions to block level in dLLMs, cutting unique experts from 69.5 to 14.6, memory by 76-80%, and latency by 1.14-1.66x while retaining 99.11% performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00152","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PrivacyPeek: Auditing What LLM-Based Agents Acquire, Not Just What They Say","primary_cat":"cs.CR","submitted_at":"2026-05-29T04:55:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PrivacyPeek is a benchmark with 1,182 cases across 7 acquisition behaviors and 16 domains that evaluates acquisition-stage privacy leakage in LLM agents, finding it widespread with limited prompt mitigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30804","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Anchoring LLM Gender Bias to Human Baselines: A Cross-Lingual Audit","primary_cat":"cs.CL","submitted_at":"2026-05-29T03:45:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM gender stereotyping across four languages spans roughly 2.5 times the human cross-country range on HEXACO-100, with translation altering specific stereotyped attributes and effects that can compound.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30784","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Text-guided Feature Disentanglement for Cross-modal Gait Recognition","primary_cat":"cs.CV","submitted_at":"2026-05-29T03:16:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TCFDNet uses a Gait Modality Text Dictionary from LLMs, CLIP alignment, and text-guided disentanglement modules to achieve SOTA cross-modal gait recognition on SUSTech1K and FreeGait.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11232","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Every Act Has Its Price: Compressed Moral Composition in Frontier LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-29T02:36:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Moral Trolley Arena shows frontier LLMs produce composite moral preferences that are compressed rather than additive functions of calibrated component act strengths across Moral Foundations Theory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24892","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReviewGuard: Aligning LLM-Assisted Peer Review with Long-Term Scientific Impact","primary_cat":"cs.DL","submitted_at":"2026-05-29T02:05:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReviewGuard aligns LLM peer reviews with future citations via impact-aligned RL, achieving Spearman ρ=0.776 on rejected-then-published AI/ML papers versus 0.492 for human reviewers and flagging 5.6× more high-impact cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30713","ref_index":10,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Diversity Matters: Revisiting Test-Time Compute in Vision-Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-29T01:06:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Entropy-based test-time compute (ETTC) in VLM ensembles outperforms majority voting by prioritizing high-confidence predictions from stronger models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30637","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EHRBench: An Automated and Reliable EHR-based Benchmark for Clinical Decision Making with LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-28T22:38:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EHRBench uses an EHR-LLM-KB pipeline to automatically create 960,067 reliable QA items spanning diagnosis, treatment, and prognosis for large-scale LLM evaluation in clinical decision making.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00133","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"World Models: A Comprehensive Survey of Architectures, Methodologies, Reasoning Paradigms, and Applications","primary_cat":"cs.LG","submitted_at":"2026-05-28T21:23:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The paper delivers a multi-axis taxonomy for world models that maps architectures, training families, reasoning strategies, and domains from early cognitive foundations through systems such as Dreamer, MuZero, and Sora while noting evaluation gaps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30537","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Long-Term Effects of Data Selection in LLM Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-28T20:12:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Short-term data selectors in multi-stage LLM fine-tuning can slow future learning and increase forgetting, formalized as myopic selection with a proposed LHAS objective to address it.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30512","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PhyDrawGen: Physically Grounded Diagram Generation from Natural Language","primary_cat":"cs.AI","submitted_at":"2026-05-28T19:49:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PhyDrawGen is a neuro-symbolic pipeline that extracts typed scene graphs via LLM, converts them to physically constrained PSLGs via deterministic solver, and refines via fine-tuned Qwen-VL, claiming superior performance over GPT-5-image and Gemini models on 1,449 physics problems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30317","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VPG: Visual Prefix Guidance for Autoregressive Image and Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:55:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VPG is a training-free inference-time guidance technique that improves autoregressive image and video generation by contrasting model outputs under generated versus corrupted prefixes to strengthen next-step support for the prefix.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30307","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Grounded 3D-Aware Spatial Vision-Language Modeling","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:51:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GR3D is a VLM that combines explicit 2D, implicit 2D, and monocular 3D grounding mechanisms to improve performance on spatial understanding benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30406","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AI Loss of Control Incident Management: Response & Resilience","primary_cat":"cs.CY","submitted_at":"2026-05-28T17:47:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Presents a taxonomy for AI loss of control incident management that distinguishes extremely costly versus impossible regaining of control and accidental versus adversarial scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30187","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Modularizing Educational LLM-Agency for Fostering Responsible Learning Assistance","primary_cat":"cs.AI","submitted_at":"2026-05-28T16:31:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Proposes a modular agentic architecture for educational LLMs with stage-specific modules to incorporate pedagogical advice and improve controllability over monolithic chatbots.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30170","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unveiling the Visual Counting Bottleneck in Vision-Language Models","primary_cat":"cs.MM","submitted_at":"2026-05-28T16:20:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLMs fail at visual counting extrapolation because they cannot project visual magnitudes onto symbolic tokens, despite intact perceptual representations, supporting a fractured magnitude hypothesis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30115","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Large Depth Completion Model from Sparse Observations","primary_cat":"cs.CV","submitted_at":"2026-05-28T15:50:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LDCM achieves state-of-the-art metric depth completion from sparse observations by combining foundation-model initialization with a point-map regression head that removes the need for camera intrinsics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30090","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DirectorBench: Diagnosing Long-Form Video Generation with Personalized Multi-Agent Evaluation","primary_cat":"cs.CL","submitted_at":"2026-05-28T15:35:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DirectorBench is a profile-aware diagnostic benchmark that localizes bottlenecks in long-form video generation workflows using structured checkpoints and multi-agent evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30027","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DocRetriever: A Plug-and-Play Framework for Multimodal Document Retrieval with Comprehensive Benchmark","primary_cat":"cs.CV","submitted_at":"2026-05-28T14:50:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DocRetriever introduces a framework using layout-aware sparse embeddings for hybrid encoding without OCR and a generalizable reasoning-augmented reranker for few-shot settings, plus the MultiDocR benchmark for evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30014","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From GPS Points to Travel Patterns: Flexible and Semantic Trajectory Generation with LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-28T14:39:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HTP hierarchically generates travel patterns via RQ-VAE tokenization then uses SFT-tuned LLMs to produce conditioned trajectory sequences, outperforming baselines by 29.78% on two datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29940","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Make LLM Learn to Synthesize from Streaming Experiences through Feedback","primary_cat":"cs.AI","submitted_at":"2026-05-28T13:51:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SynLearner lets LLMs improve synthetic data generation on later tasks in a stream by learning reusable patterns and balancing quality with diversity from feedback on earlier tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00123","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CardioLens: Revealing the Clinical Reality Gap of MLLMs via Multi-Sequence Cardiac MRI Evaluations","primary_cat":"cs.CV","submitted_at":"2026-05-28T11:03:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CardioLens is a leakage-resistant CMR testbed of 473k slices and 13k QA pairs showing current MLLMs exhibit a large clinical reality gap with category-collapse failures on real workflows.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29744","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Specialist Models Still Matter: A Heterogeneous Multi-Agent Paradigm for Medical Artificial Intelligence","primary_cat":"cs.AI","submitted_at":"2026-05-28T10:42:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"HetMedAgent is a heterogeneous multi-agent framework that fuses generalist LLMs and specialist models via conflict-aware fusion and uncertainty triggers, outperforming either alone on three clinical tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29734","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HTAM: Hierarchical Transition-Attended Memory for Operator Optimization","primary_cat":"cs.CL","submitted_at":"2026-05-28T10:29:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HTAM builds a Hierarchical Transition Graph to organize coarse global directions and detailed local strategies for guiding LLM-based CUDA kernel optimization, improving results on KernelBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29675","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Prompts to Context: An Ontology-Driven Framework for Human-Generative AI Collaboration","primary_cat":"cs.HC","submitted_at":"2026-05-28T09:35:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Presents the CCAI ontology and SPARQL retrieval method to convert ephemeral Human-Generative AI prompt interactions into explicit, machine-readable collaboration traces, illustrated in a competency-profile software case study.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29639","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RTP-LLM: High-Performance Alibaba LLM Inference Engine","primary_cat":"cs.OS","submitted_at":"2026-05-28T09:07:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RTP-LLM is a new LLM inference engine achieving 4.7x-6.3x model loading speedup and 1.12x-2.52x throughput gains over vLLM and SGLang via disaggregated phases, multi-tier KV cache, and modular optimizations in production at Alibaba.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29588","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Brain-IT-VQA: From Brain Signals to Answers","primary_cat":"cs.CV","submitted_at":"2026-05-28T08:33:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Brain-IT-VQA decodes visual question answers from fMRI using a transformer to extract language tokens and introduces the NSD-VQA benchmark with 20 controlled questions per image across 20 categories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29556","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Opt-Verifier: Unleashing the Power of LLMs for Optimization Modeling via Dual-Side Verification","primary_cat":"cs.AI","submitted_at":"2026-05-28T08:09:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Opt-Verifier adds structure-side and solution-side verification to LLM-generated optimization models and reports over 20% accuracy gains on standard benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29460","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FedSmoothLoRA: Toward Smoother and Faster Convergence in Federated Low-Rank Adaptation","primary_cat":"cs.CV","submitted_at":"2026-05-28T06:53:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FedSmoothLoRA improves federated LoRA fine-tuning by constructing local initializations from a round-matching matrix for cross-round continuity and a gradient-aligned matrix for client-specific guidance, yielding faster convergence than prior methods in image and text tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29454","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Full-Pipeline Framework for Evaluating Membership Inference Attacks in Machine Learning","primary_cat":"cs.LG","submitted_at":"2026-05-28T06:48:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Presents a systematic framework for evaluating MIAs across the full ML pipeline with standardized threat models and complementary metrics for different cost scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29408","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Large language model for unified and accurate description of multidimensional nuclear properties","primary_cat":"nucl-th","submitted_at":"2026-05-28T06:00:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Fine-tuning DeepSeek-R1-1.5B via LoRA on experimental-theoretical deviations yields over 98% training loss reduction and accuracy gains across seven nuclear observables.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29368","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SURGENT: A Surgical Multi-Agent Assistance System Across the Perioperative Workflow","primary_cat":"cs.CL","submitted_at":"2026-05-28T05:12:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"SURGENT is a multi-agent surgical assistance system with novel memory management that outperforms baseline LLMs on case analysis, plan simulation, safety monitoring, risk assessment, and rehabilitation guidance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29303","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Entropy-KL Divergence-based Token Masking: A Novel Approach for Selective Fine-tuning of Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-28T03:36:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"EKSFT masks high-entropy or high-KL tokens in low-data SFT to preserve pre-trained distribution and improve downstream RL performance on math reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29247","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DenseSteer: Steering Small Language Models towards Dense Math Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-28T02:07:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DenseSteer is an inference-time steering framework that improves small LLMs' accuracy on math reasoning by modulating representations toward dense reasoning patterns with fewer but higher-density steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29210","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SAMD: A Tool for Identifying False Data Injection Scenarios in AI/ML-enabled Medical Devices","primary_cat":"cs.CR","submitted_at":"2026-05-28T00:44:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SAMD automates STPA-Sec modeling of AI/ML medical devices as control structures to generate false data injection attack scenarios via LLMs and vulnerability data, evaluated on five FDA-cleared devices with reported precisions of 100%, 63.2%, and 95.3%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28920","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Conf-Gen: Conformal Uncertainty Quantification for Generative Models","primary_cat":"cs.LG","submitted_at":"2026-05-27T18:00:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Conf-Gen adapts conformal risk control to generative tasks by relaxing assumptions, unifying prior CP work on LLMs and extending guarantees to image generators, conversational AI, and AI agent correctness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":100,"offset":0}}