{"work":{"id":"042493e9-b26f-4b4e-bbde-382072ca9b08","openalex_id":null,"doi":null,"arxiv_id":"2107.03374","raw_key":null,"title":"Evaluating Large Language Models Trained on Code","authors":null,"authors_text":"Mark Chen et al","year":2021,"venue":"cs.LG","abstract":"We introduce Codex, a GPT language model fine-tuned on publicly available code from GitHub, and study its Python code-writing capabilities. A distinct production version of Codex powers GitHub Copilot. On HumanEval, a new evaluation set we release to measure functional correctness for synthesizing programs from docstrings, our model solves 28.8% of the problems, while GPT-3 solves 0% and GPT-J solves 11.4%. Furthermore, we find that repeated sampling from the model is a surprisingly effective strategy for producing working solutions to difficult prompts. Using this method, we solve 70.2% of our problems with 100 samples per problem. Careful investigation of our model reveals its limitations, including difficulty with docstrings describing long chains of operations and with binding operations to variables. Finally, we discuss the potential broader impacts of deploying powerful code generation technologies, covering safety, security, and economics.","external_url":"https://arxiv.org/abs/2107.03374","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-07-04T21:00:08.977440+00:00","pith_arxiv_id":"2107.03374","created_at":"2026-05-08T17:28:41.986595+00:00","updated_at":"2026-07-04T21:00:08.977440+00:00","title_quality_ok":true,"display_title":"Evaluating Large Language Models Trained on Code","render_title":"Evaluating Large Language Models Trained on Code"},"hub":{"state":{"work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","tier":"mega_hub","tier_reason":"1,000+ Pith inbound or 100,000+ external citations","pith_inbound_count":1306,"external_cited_by_count":null,"distinct_field_count":40,"first_pith_cited_at":"2021-05-20T17:58:42+00:00","last_pith_cited_at":"2026-07-02T17:30:38+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"needed","recognition_status":"needed","updated_at":"2026-07-05T00:27:21.189594+00:00","tier_text":"mega_hub"},"tier":"mega_hub","role_counts":[{"context_role":"background","n":161},{"context_role":"dataset","n":64},{"context_role":"method","n":12},{"context_role":"other","n":3},{"context_role":"baseline","n":2}],"polarity_counts":[{"context_polarity":"background","n":157},{"context_polarity":"use_dataset","n":57},{"context_polarity":"use_method","n":12},{"context_polarity":"unclear","n":9},{"context_polarity":"support","n":5},{"context_polarity":"baseline","n":2}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"Evaluating Large Language Models Trained on Code","claims":[{"claim_text":"We introduce Codex, a GPT language model fine-tuned on publicly available code from GitHub, and study its Python code-writing capabilities. A distinct production version of Codex powers GitHub Copilot. On HumanEval, a new evaluation set we release to measure functional correctness for synthesizing programs from docstrings, our model solves 28.8% of the problems, while GPT-3 solves 0% and GPT-J solves 11.4%. Furthermore, we find that repeated sampling from the model is a surprisingly effective strategy for producing working solutions to difficult prompts. Using this method, we solve 70.2% of ou","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Evaluating Large Language Models Trained on Code because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T18:03:33.884013+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"898ecdc3-b47e-4729-b950-fab2383cfa7a","orcid":null,"display_name":"Mark Chen et al"}]},"error":null,"updated_at":"2026-05-13T18:03:33.881753+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T18:03:33.779895+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":139},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":134},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":86},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":79},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":69},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":61},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":61},{"title":"Code Llama: Open Foundation Models for Code","work_id":"e73bffa4-7620-47ac-9327-259a60db52ca","shared_citers":57},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":51},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":47},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":46},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":40},{"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","work_id":"d0effe15-a689-441a-8e3f-ea35f1c4e4b1","shared_citers":40},{"title":"DeepSeek-Coder: When the Large Language Model Meets Programming -- The Rise of Code Intelligence","work_id":"f22dae5a-27e2-41d0-a061-c4286418dee3","shared_citers":36},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":36},{"title":"Qwen2.5-Coder Technical Report","work_id":"09ba463d-6377-4017-9801-444ffb94b056","shared_citers":35},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":34},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":33},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":33},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":33},{"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","work_id":"ea9e51ce-1e75-4182-92d8-4d25f70d2ee4","shared_citers":31},{"title":"Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters","work_id":"a8d50b24-bdf5-46ed-bc4f-2927dfd81f1d","shared_citers":29},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":28},{"title":"StarCoder: may the source be with you!","work_id":"7e9c3d6e-d6f7-4763-9ef6-de471506c58f","shared_citers":27}],"time_series":[{"n":5,"year":2021},{"n":9,"year":2022},{"n":16,"year":2023},{"n":21,"year":2024},{"n":13,"year":2025},{"n":380,"year":2026}]},"error":null,"updated_at":"2026-05-13T17:25:55.417193+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T18:03:33.147300+00:00"},"reader_index":{"job_type":"reader_index","status":"succeeded","result":{"note":"annotated reader requires full-text/OA fetch; shell is wired for mega hubs","status":"reader queued"},"error":null,"updated_at":"2026-06-30T09:39:42.728017+00:00"},"recognition_alignment":{"job_type":"recognition_alignment","status":"succeeded","result":{"modules":["IndisputableMonolith.StandardModel.SupersymmetryBreaking","IndisputableMonolith.Cognition.AnimalZComplexityBound","IndisputableMonolith.Physics.StandardModelGroupStructure","IndisputableMonolith.Physics.StandardModelLagrangianStructure","IndisputableMonolith.StandardModel.StrongCP","IndisputableMonolith.Cosmology.Inflation","IndisputableMonolith.Patterns.GrayCodeAxioms","IndisputableMonolith.Information.ChurchTuring"],"query_chars":1009},"error":null,"updated_at":"2026-06-30T09:40:04.377210+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"Evaluating Large Language Models Trained on Code","claims":[{"claim_text":"We introduce Codex, a GPT language model fine-tuned on publicly available code from GitHub, and study its Python code-writing capabilities. A distinct production version of Codex powers GitHub Copilot. On HumanEval, a new evaluation set we release to measure functional correctness for synthesizing programs from docstrings, our model solves 28.8% of the problems, while GPT-3 solves 0% and GPT-J solves 11.4%. Furthermore, we find that repeated sampling from the model is a surprisingly effective strategy for producing working solutions to difficult prompts. Using this method, we solve 70.2% of ou","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Evaluating Large Language Models Trained on Code because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T18:03:33.782984+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Evaluating Large Language Models Trained on Code","claims":[{"claim_text":"We introduce Codex, a GPT language model fine-tuned on publicly available code from GitHub, and study its Python code-writing capabilities. A distinct production version of Codex powers GitHub Copilot. On HumanEval, a new evaluation set we release to measure functional correctness for synthesizing programs from docstrings, our model solves 28.8% of the problems, while GPT-3 solves 0% and GPT-J solves 11.4%. Furthermore, we find that repeated sampling from the model is a surprisingly effective strategy for producing working solutions to difficult prompts. Using this method, we solve 70.2% of ou","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Evaluating Large Language Models Trained on Code because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:25:52.714606+00:00"}},"summary":{"title":"Evaluating Large Language Models Trained on Code","claims":[{"claim_text":"We introduce Codex, a GPT language model fine-tuned on publicly available code from GitHub, and study its Python code-writing capabilities. A distinct production version of Codex powers GitHub Copilot. On HumanEval, a new evaluation set we release to measure functional correctness for synthesizing programs from docstrings, our model solves 28.8% of the problems, while GPT-3 solves 0% and GPT-J solves 11.4%. Furthermore, we find that repeated sampling from the model is a surprisingly effective strategy for producing working solutions to difficult prompts. Using this method, we solve 70.2% of ou","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Evaluating Large Language Models Trained on Code because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":139},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":134},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":86},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":79},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":69},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":61},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":61},{"title":"Code Llama: Open Foundation Models for Code","work_id":"e73bffa4-7620-47ac-9327-259a60db52ca","shared_citers":57},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":51},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":47},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":46},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":40},{"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","work_id":"d0effe15-a689-441a-8e3f-ea35f1c4e4b1","shared_citers":40},{"title":"DeepSeek-Coder: When the Large Language Model Meets Programming -- The Rise of Code Intelligence","work_id":"f22dae5a-27e2-41d0-a061-c4286418dee3","shared_citers":36},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":36},{"title":"Qwen2.5-Coder Technical Report","work_id":"09ba463d-6377-4017-9801-444ffb94b056","shared_citers":35},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":34},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":33},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":33},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":33},{"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","work_id":"ea9e51ce-1e75-4182-92d8-4d25f70d2ee4","shared_citers":31},{"title":"Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters","work_id":"a8d50b24-bdf5-46ed-bc4f-2927dfd81f1d","shared_citers":29},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":28},{"title":"StarCoder: may the source be with you!","work_id":"7e9c3d6e-d6f7-4763-9ef6-de471506c58f","shared_citers":27}],"time_series":[{"n":5,"year":2021},{"n":9,"year":2022},{"n":16,"year":2023},{"n":21,"year":2024},{"n":13,"year":2025},{"n":380,"year":2026}]},"authors":[{"id":"898ecdc3-b47e-4729-b950-fab2383cfa7a","orcid":null,"display_name":"Mark Chen et al","source":"manual","import_confidence":0.72}]},"citers":{"total":1306,"items":[{"citing_arxiv_id":"2607.02464","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Will Scaling Improve Social Simulation with LLMs?","primary_cat":"cs.CL","submitted_at":"2026-07-02T17:30:38+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Scaling improves LLM social simulation fidelity in most opinion and behavior tasks but not for human cognitive bias calibration or low-resource domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02390","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DecompRL: Solving Harder Problems by Learning Modular Code Generation","primary_cat":"cs.LG","submitted_at":"2026-07-02T16:25:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DecompRL is an RL method that learns modular code decomposition for LLMs, enabling exponential candidate generation via recombination to solve harder coding problems with lower GPU cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02333","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Guiding Human Validation of LLM-Generated Code via Verifiable Literate Programming","primary_cat":"cs.SE","submitted_at":"2026-07-02T15:37:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLP adds an NL documentation layer with trace-linked mismatch detection and derived formal checks to make human validation of LLM code feasible, lifting pass@1 from 28.7-73.2% to 65.4-93.5%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02329","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Grounded autonomous research: a fault-tolerant LLM pipeline from corpus to manuscript in frontier computational physics","primary_cat":"cs.AI","submitted_at":"2026-07-02T15:35:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An LLM pipeline with fresh-context sessions and literature calibration produces a publication-grade manuscript with three substantive findings on altermagnetic piezomagnetism from a corpus of 11,083 papers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02186","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UA-ChatDev: Uncertainty-Aware Multi-Agent Collaboration for Reliable Software Development","primary_cat":"cs.AI","submitted_at":"2026-07-02T13:56:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"UA-ChatDev integrates token-level uncertainty estimation and phase-aware verification into multi-agent software development and reports better benchmark scores than prior frameworks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02118","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enhancing Fitness Intelligence through Domain-Specific LLM Post-Training","primary_cat":"cs.AI","submitted_at":"2026-07-02T12:53:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"FitOne-8B/32B models improve average scores on ACSM-EP and NSCA-CSCS certification exams by up to 12.73% over base Qwen3 while retaining general capabilities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02057","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Prompt Coverage Adequacy","primary_cat":"cs.SE","submitted_at":"2026-07-02T11:35:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Prompt Coverage Adequacy, measured via attention boosting in LLMs, is associated with fault detection and uncovers over 30% more faults than traditional code coverage when guiding test generation across two datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.02032","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PACE: A Proxy for Agentic Capability Evaluation","primary_cat":"cs.AI","submitted_at":"2026-07-02T10:59:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PACE builds proxy benchmarks from non-agentic instances via relevance and global selection plus regression to predict agentic scores with MAE under 4%, Spearman correlation above 0.80, and 85% ranking accuracy at under 1% cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01953","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Underspecification does not imply Incoherence: The Risks of Semantic Collapse in Coding Models","primary_cat":"cs.SE","submitted_at":"2026-07-02T09:43:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Coding LLMs exhibit detrimental semantic collapse on underspecified prompts by producing consistent but incorrect code rather than incoherent variations, affecting 3-32% of tasks across MBPP, HumanEval, and LiveCodeBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01927","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TUDUM: A Turkish-Thinking Reasoning Pipeline for Qwen3.5-27B","primary_cat":"cs.CL","submitted_at":"2026-07-02T09:22:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"TUDUM applies LoRA-based SFT on 15,991 Turkish reasoning examples followed by GRPO reinforcement learning on Turkish math problems to a 27B Qwen model, producing shorter Turkish reasoning traces with mixed benchmark results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01883","ref_index":77,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PairCoder++: Pair Programming as a Universal Paradigm for Verified Code-Driven Multimodal and Structured-Artifact Generation","primary_cat":"cs.CL","submitted_at":"2026-07-02T08:36:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PairCoder is a two-agent pair-programming method that leverages toolchain verification oracles to improve LLM generation of verifiable structured artifacts on 17 benchmarks across seven models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01867","ref_index":62,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"An Exploratory Study on LLM-Generated Code and Comments in Code Repositories","primary_cat":"cs.SE","submitted_at":"2026-07-02T08:25:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Detector proxy analysis of real repositories finds LLM-like code decreasing over time and common in tests, with more in company repos, substantial clones, stable but often ungrammatical comments, and few linked bugs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01855","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Regression Accumulation in Multi-Turn LLM Programming Conversations","primary_cat":"cs.SE","submitted_at":"2026-07-02T08:15:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Regression accumulation affects 40-73% of 8-turn LLM coding tasks on extended HumanEval+/MBPP+ benchmarks, with verification gates improving final-turn pass rates on prior tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01810","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decoupling Code Complexity from Newcomer Participation: A Causal Study of AI Coding Agent Adoption in OSS","primary_cat":"cs.SE","submitted_at":"2026-07-02T07:24:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AI coding agent adoption in OSS projects raises code complexity modestly but produces no causal reduction in newcomer participation per DiD estimates on matched GitHub projects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01789","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EPnG: Adaptive Expert Prune-and-Grow for Parameter-Efficient MoE Fine-tuning","primary_cat":"cs.LG","submitted_at":"2026-07-02T07:02:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EPnG reallocates LoRA capacity in MoE models by pruning experts with low router gate probabilities and expanding high-importance ones via rank growth, outperforming standard LoRA and nearing full fine-tuning performance with 0.55-0.72% parameters updated.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01678","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SCAPE: Accurate and Efficient LLM Training with Extreme Sparse Communication","primary_cat":"cs.LG","submitted_at":"2026-07-02T04:10:42+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SCAPE enables 90-99% sparse gradient communication in sharded Adam-style LLM training by deriving masks from first-moment statistics, achieving up to 43.3% faster pre-training on Llama-500M with no loss in validation loss or downstream accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01531","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OPINE-World: Programmatic World Modeling with Ontology-error-Prioritized Interactive Exploration","primary_cat":"cs.AI","submitted_at":"2026-07-01T23:04:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OPINE-World learns programmatic world models from interaction using dual LLM agents and ontology-error exploration, solving 20 of 25 ARC-AGI-3 games without per-game training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01490","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Don't Let Gains FADE: Breaking Down Policy Gradient Weights in RL","primary_cat":"cs.LG","submitted_at":"2026-07-01T21:39:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FADE is a self-adapting advantage for policy-gradient RL that reads training dynamics to balance positive/negative gradient mass and difficulty focus, yielding faster peak performance and better accuracy-diversity trade-offs than static baselines on LLM reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01444","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Utility and Factual Reliability of Pruned Mixture-of-Experts Models in the Biomedical Domain","primary_cat":"cs.LG","submitted_at":"2026-07-01T20:08:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Moderate pruning of MoE models preserves in-domain biomedical utility and reliability but both degrade rapidly in cross-domain settings and at extreme pruning ratios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01360","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Benchmarking Code Improvement with Progressive, Adaptive, and Interactive Feedback","primary_cat":"cs.SE","submitted_at":"2026-07-01T18:20:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PAIR-Bench defines a progressive hinting protocol with failure-region and hint-depth controls to measure LLM code refinement trajectories in detail.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01213","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RepoRescue: An Empirical Study of LLM Agents on Whole-Repository Compatibility Rescue","primary_cat":"cs.SE","submitted_at":"2026-07-01T17:51:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RepoRescue creates a benchmark of 315 repositories and shows LLM agents rescue up to 41.5% with runtime enforcement and 62.7% when combining systems, with hardest cases requiring cross-file changes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01211","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Are Performance-Optimization Benchmarks Reliably Measuring Coding Agents?","primary_cat":"cs.SE","submitted_at":"2026-07-01T17:50:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Audit of GSO, SWE-Perf and SWE-fficiency reveals that reference patches satisfy validity rules across machines for only 39/102, 11/140 and 411/498 tasks respectively, public submissions beat references on 85.3% of replay-valid tasks, and scoring rules cause ranking disagreements.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01179","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"QuasiMoTTo: Quasi-Monte Carlo Test-Time Scaling","primary_cat":"cs.LG","submitted_at":"2026-07-01T17:10:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"QuasiMoTTo uses quasi-Monte Carlo to produce correlated yet marginally correct samples from language models, matching i.i.d. pass@k with 25-47% fewer samples on reasoning benchmarks and 50% fewer RL training steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00939","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Leveraging LLM-Based Agentic Systems to Generate Quantum Applications for Test Optimization","primary_cat":"cs.SE","submitted_at":"2026-07-01T13:40:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"QPipe deploys specialized LLM agents for parsing, formulation, code generation, review, execution and verification to produce quantum applications from 20 natural-language test-optimization requirements, reporting 100% compilation and 96.7% execution success with solutions that beat a genetic-algori","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00911","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Registry to Repository: How AI Agent Skills Are Written, Adapted, and Maintained","primary_cat":"cs.SE","submitted_at":"2026-07-01T13:14:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Empirical study of 41k+ AI agent skills finds reuse is mostly one-time verbatim copying with 53% never modified afterward and maintenance focused on additive local adaptations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00908","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Activation Alignment:The Alignment-Diversity Tradeoff in Task-Aware LLM Quantization","primary_cat":"cs.LG","submitted_at":"2026-07-01T13:12:21+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TASA improves task-aware mixed-precision LLM quantization by searching calibration data mixtures via gradient-trace alignment and aggregating perplexity plus reasoning sensitivity signals, enabling 3.5-bit models to match or beat 4-bit baselines with over 20-point gains on GSM8K.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00711","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ClarifyCodeBench: Evaluating LLMs on Clarifying Ambiguous Requirements for Code Generation","primary_cat":"cs.SE","submitted_at":"2026-07-01T09:58:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ClarifyCodeBench is a new benchmark with manual annotations and two metrics showing that LLMs strong at code generation are weak at clarifying ambiguous requirements, with performance worsening as ambiguity density rises.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00664","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"YOMI-Bench: A Benchmark for Evaluating Kanji Reading and Phonological Understanding of LLMs for Japanese","primary_cat":"cs.CL","submitted_at":"2026-07-01T09:13:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"YOMI-Bench is a new benchmark of four tasks for kanji reading and phonological understanding in LLMs, showing low performance even for Japanese-specific and commercial models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00604","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Vehicle Routing Problem Meets Large Language Models: An Overview and Perspectives","primary_cat":"math.OC","submitted_at":"2026-07-01T08:30:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Survey organizing LLM uses for VRP into modeler, designer, and coordinator roles, covering variants, solvers, benchmarks, and two experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00572","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HARC: Coupling Harmfulness and Refusal Directions for Robust Safety Alignment","primary_cat":"cs.AI","submitted_at":"2026-07-01T07:58:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HARC couples harmfulness and refusal directions across prompt and response positions via subspace fine-tuning, achieving better robustness-capability-usability trade-off than six baselines while transferring across model families.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00531","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Active-GRPO: Adaptive Imitation and Self-Improving Reasoning for Molecular Optimization","primary_cat":"cs.LG","submitted_at":"2026-07-01T07:22:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Active-GRPO reaches 0.1773 average SRxSim on TOMG-Bench MOLOPT by adaptively switching between imitation and self-reinforcement while upgrading references, outperforming GRPO and RePO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00466","ref_index":4,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ELDR: Expert-Locality-Aware Decode Routing for PD-Disaggregated MoE Serving","primary_cat":"cs.DC","submitted_at":"2026-07-01T05:34:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ELDR reduces median TPOT by 5.9-13.9% in PD-disaggregated MoE serving via expert signatures from prefill, K-means partitioning, and locality-band routing with KV-co-indexed signature cache.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00341","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DiscoLoop: Looping Discrete Embeddings and Continuous Hidden States for Multi-hop Reasoning","primary_cat":"cs.CL","submitted_at":"2026-07-01T02:32:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DiscoLoop adds a discrete embedding channel to looped transformers to fix representational misalignment in two-hop reasoning, yielding near-perfect accuracy on synthetic tasks and better pretraining loss on real data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00162","ref_index":66,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FRAME: Learning the Adaptation Domain with a Mixture of Fractional-Fourier Experts","primary_cat":"cs.LG","submitted_at":"2026-06-30T20:39:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FRAME adds a learnable fractional-Fourier order per expert in a MoE-LoRA setup so that low-rank updates are placed in the domain where they are most compact, yielding gains over fixed-domain baselines on LLaMA-3.1-8B and Qwen2.5-7B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00107","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Illusion of Safety: Multi-Tier Verification of AI vs. Human C++ Code","primary_cat":"cs.SE","submitted_at":"2026-06-30T19:48:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Multi-tier verification on VULBENCH-CPP shows AI-generated C++ code triggers confirmed runtime violations roughly twice as often as human code, while static analysis misleadingly indicates parity due to code length.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.32007","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AxDafny: Agentic Verified Code Generation in Dafny","primary_cat":"cs.AI","submitted_at":"2026-06-30T17:39:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AxDafny achieves 92.7% verification success on DafnyBench (6.5 points above prior proof-hint baselines) via verifier-guided repair and introduces the LCB-Pro-Dafny benchmark of 250 problems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31717","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Nonlinearity-Aware LoRA: Structured Gate Adaptation under Low-Rank Constraints","primary_cat":"cs.LG","submitted_at":"2026-06-30T14:21:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NA-LoRA introduces derivative-based temporal-importance masks and activation-specific step scaling to LoRA to reduce selection misalignment in self-gated FFNs, with reported gains on language and vision-language fine-tuning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31706","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AdaTrans: Automated C to Rust Transformation via Error-Adaptive Repair","primary_cat":"cs.SE","submitted_at":"2026-06-30T14:11:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AdaTrans uses strategy-driven RAG, error-stratified transformation, and multi-stage validation to reach 95.51% mean compilation pass rate and 81.09% solve rate on 104 algorithmic problems with only 1.19% unsafe files.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31551","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AutoTrainess: Teaching Language Models to Improve Language Models Autonomously","primary_cat":"cs.CL","submitted_at":"2026-06-30T12:09:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AutoTrainess exposes training operations via agent-computer interfaces and outperforms CLI-only baselines on PostTrainBench with scores of 26.94 vs 23.21 for GPT-5.4 and similar gains on other models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31511","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Falsification, Not Exposure: An Internally Preregistered Placebo-Controlled Decomposition of Self-Repair Feedback in Frozen Small Code Models","primary_cat":"cs.SE","submitted_at":"2026-06-30T11:26:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Preregistered placebo-controlled decomposition shows external executable counterexamples drive self-repair gains in small code models more than re-exposure or self-critique.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00062","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AlgoBench: Benchmarking Algorithmic Adaptation in Code Generation","primary_cat":"cs.SE","submitted_at":"2026-06-30T10:13:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AlgoBench creates traceable variants of competitive programming problems via constraint shifts that invalidate original algorithms, paired with complexity metrics that reveal LLMs often produce functionally correct but asymptotically unsuitable solutions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31315","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BlockPilot: Instance-Adaptive Policy Learning for Diffusion-based Speculative Decoding","primary_cat":"cs.CL","submitted_at":"2026-06-30T08:24:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BlockPilot is an instance-adaptive policy that predicts optimal block size from the prefilling representation for diffusion speculative decoding, reporting 5.92 acceptance length and 4.20x speedup on Qwen3-4B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31159","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"An Empirical Study of Security Calibration in Large Language Models for Code","primary_cat":"cs.SE","submitted_at":"2026-06-30T05:37:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Empirical evaluation of three LLMs finds prevalent overconfidence in insecure code generation, with security calibration outperforming functional calibration but both degrading in repository-level settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31121","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Past Is Prologue: A Plug-in Controller for Selective Updates in Sequentially Evolving LLM Memory","primary_cat":"cs.AI","submitted_at":"2026-06-30T04:33:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Janus is a method-agnostic plug-in that uses a Memory Momentum Trigger and compact hybrid evaluation to selectively accept LLM memory updates, yielding +2.7 to +4.6 accuracy gains over base updaters on six datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00053","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SWE-Router: Routing in Multi-turn Agentic Software Engineering Tasks","primary_cat":"cs.SE","submitted_at":"2026-06-30T01:46:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SWE-Router introduces trajectory-conditioned value-based routing for LLM agents on SWE tasks, with a Bayes-optimality theorem and empirical cost savings while retaining most strong-model performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30810","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Knowledge Alignment in Code LLMs: Contrastive Unlearning for Evolving APIs","primary_cat":"cs.SE","submitted_at":"2026-06-29T18:34:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CURE applies contrastive unlearning to reduce deprecated API usage in code LLMs and improve correct replacements on a benchmark dataset while preserving general performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30602","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MESA: Prioritizing Vulnerable Communication Channels for Securing Multi-Agent Systems","primary_cat":"cs.CR","submitted_at":"2026-06-29T17:40:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MESA ranks MAS communication edges by vulnerability via graph-theoretic metrics and dynamic probes, achieving mean Spearman ρ=+0.60 correlation with empirical per-edge attack success and 3x interception gain when monitoring the top 10%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30296","ref_index":1,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ManimAgent: Self-Evolving Multimodal Agents for Visual Education","primary_cat":"cs.AI","submitted_at":"2026-06-29T13:37:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ManimAgent improves Manim animation code generation by maintaining a self-growing dual-channel episodic memory of validated successes and failures derived entirely from its own task stream.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30109","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TacEvo: Self-Evolving Architecture Discovery for Robotic Tactile Perception via LLM-Driven Quality-Diversity Search","primary_cat":"cs.RO","submitted_at":"2026-06-29T10:45:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TacEvo is an LLM-driven self-evolving search method that discovers neural architectures for robotic tactile force regression and grating classification, reporting fitness gains of 56.1% and 96.1% over 20 generations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29999","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AlgoSkill: Learning to Design Algorithms by Scheduling Human-Like Skills","primary_cat":"cs.AI","submitted_at":"2026-06-29T09:09:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AlgoSkill improves LLM algorithm design on programming benchmarks by framing it as verification-guided scheduling over a typed skill library with MCTS, outperforming direct generation and self-refinement.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29982","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Uniform Experts: Cost-Aware Expert Execution for Efficient Multi-Device MoE Inference","primary_cat":"cs.DC","submitted_at":"2026-06-29T08:57:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CAEE reduces MoE inference latency 8-18% on 671B DeepSeek-R1 by cost-aware expert pruning and low-overhead compensation while keeping accuracy drop under 1%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29957","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SWE-Together: Evaluating Coding Agents in Interactive User Sessions","primary_cat":"cs.SE","submitted_at":"2026-06-29T08:35:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces SWE-Together benchmark from 109 real repository tasks, using an LLM user simulator to evaluate coding agents on success rate and corrective turns needed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29823","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Experience Graphs: The Data Foundation for Self-Improving Agents","primary_cat":"cs.DB","submitted_at":"2026-06-29T06:02:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Trellis treats agent experience graphs as first-class database state so that search patterns become queries, enabling crash recovery, scaling, and closed-loop training as architectural byproducts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29815","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SrDetection: A Self-Referential Framework for Data Leakage Detection in Code Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-29T05:48:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SrDetection detects data leakage in Code LLMs via contrast between original benchmark samples and their semantic variants, reporting F1 gains of 21.52 (gray-box) and 14.46 (black-box) over baselines in a controlled testbed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29702","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Verified residual-specific explicit derivative kernels for physics-informed learning and discretized PDE adjoints","primary_cat":"physics.comp-ph","submitted_at":"2026-06-29T02:09:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Develops and numerically verifies residual-specific explicit derivative kernels that achieve floating-point agreement with AD while delivering 2-4x speedups and lower memory use in PINN training and CFD adjoint workflows.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30689","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Citation Discipline in Spec-Driven Development: A Cross-Model Empirical Study of Output Determinism and Automated Hallucination Detection in LLM-Generated Code","primary_cat":"cs.SE","submitted_at":"2026-06-28T19:38:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mandatory per-line citations in SDD frameworks reduce LLM output determinism but enable reliable automated hallucination detection (TDR 86-88%, FPR 0%), a trade-off replicated across Claude and GLM models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29538","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RESOURCE2SKILL: Distilling Executable Agent Skills from Human-Created Multimodal Resources","primary_cat":"cs.SE","submitted_at":"2026-06-28T17:59:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RESOURCE2SKILL converts multimodal human resources into a hierarchical Skill Wiki of executable agent skills, reporting +11.9 percentage point average gains over no-skill baselines across seven authoring domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29301","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pointer-CAD v2: Plan-Then-Construct CAD Generation with Dimension-Aware Parametric Precision","primary_cat":"cs.CV","submitted_at":"2026-06-28T09:40:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Pointer-CAD v2 decouples planning from construction in LLM-based CAD generation by using a pointer mechanism to reference continuous parameters from a design plan, paired with new hierarchical accuracy metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29239","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Breaking the Rounding Trap: Securing LLMs against Quantization-Conditioned Backdoors","primary_cat":"cs.CR","submitted_at":"2026-06-28T07:06:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"QuantGuard is a pre-quantization method using differentiable rounding controls, error-guided reversal constraints, output consistency, and weight regularization on a small calibration set to suppress quantization-conditioned backdoors while preserving performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29215","ref_index":20,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Block Diffusion Language Models","primary_cat":"cs.LG","submitted_at":"2026-06-28T05:53:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MBD-LMs raise average tokens per forward pass from 3.47 to 6.19 (and to 9.34 with DMax) via multi-block teacher forcing and optimized parallel decoding while holding or slightly improving accuracy on math and code tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29194","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AI Trading's Alpha Singularity: Emergent Market Reasoning through Agent-to-Agent Self-Evolution","primary_cat":"cs.AI","submitted_at":"2026-06-28T04:41:00+00:00","verdict":"REJECT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Multi-agent LLM system Agora under Sealed Joint Search conditions produces +1.87 holdout Sharpe on CSI 1000 over a 91-day sealed period, exceeding the best baseline at +1.334 under favorable seed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29184","ref_index":62,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BaRA: Bayesian Adaptive Rank Allocation for Parameter-Efficient Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-06-28T04:08:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BaRA adds Bayesian adaptive rank allocation to LoRA fine-tuning by activating sparse instance-specific latent factors, with a generalization bound depending on learned joint effective rank rather than fixed maximum rank.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29155","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OASIF: An Efficient Obfuscation-Aware Self-Improving Framework for LLM-Based Assembly Code Instruction Following and Comprehension","primary_cat":"cs.SE","submitted_at":"2026-06-28T02:28:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OASIF improves open-source LLMs on obfuscated assembly comprehension by 5-17 percentage points on commercial VM obfuscators via a three-phase self-evolving training pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29066","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Masked Diffusion Decoding as $x$-Prediction Flow","primary_cat":"cs.CL","submitted_at":"2026-06-27T19:51:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Masked diffusion LMs can use continuous x-prediction flow with token-wise asynchronous updates and an RL policy network to reach 97% performance on HumanEval using only 25% of the usual decoding budget.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28998","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reward-Free Code Alignment from Pretrained or Fine-Tuned LLM: Unpacking the Trade-offs for Code Generation","primary_cat":"cs.SE","submitted_at":"2026-06-27T16:22:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Empirical study on five LLMs finds pretrained-to-aligned paths yield bigger gains over baseline than finetuned-to-aligned paths, though absolute accuracy remains lower for pretrained starts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28962","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FlipGuard: Defending Large Language Models Against Quantization-Conditioned Backdoor Attacks","primary_cat":"cs.CR","submitted_at":"2026-06-27T15:02:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FlipGuard perturbs LLM weights prior to quantization to neutralize quantization-conditioned backdoor attacks, evaluated via the Defense Effectiveness Ratio on multiple models and quantization schemes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28661","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When More Sampling Hurts: The Modal Ceiling and Correlation Ceiling of Test-Time Scaling","primary_cat":"cs.LG","submitted_at":"2026-06-27T00:37:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Test-time sampling improves coverage but stalls at modal and correlation ceilings for answer selection, with the effective number of samples as the practical limit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28589","ref_index":24,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Search for Truth from Reasoning: A Dynamic Representation Editing Framework for Steering LLM Trajectories","primary_cat":"cs.AI","submitted_at":"2026-06-26T20:33:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DynaSteer is a dynamic representation editing framework that uses pattern clustering, Fisher-LDA, and lookahead entropy monitoring to steer LLM reasoning trajectories toward truth on MATH and coding tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28551","ref_index":36,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DataComp-VLM: Improved Open Datasets for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-06-26T19:11:29+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"DataComp-VLM benchmark shows instruction-heavy data mixing outperforms filtering for VLM training, with DCVLM-Baseline achieving 63.6% on 33 tasks for 8B models (+5.4pp over FineVision).","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28229","ref_index":14,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Humanizing Automatically Generated Unit Test Suites with LLM-Based Refactoring","primary_cat":"cs.SE","submitted_at":"2026-06-26T16:18:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TestHumanizer uses LLMs as refactoring layers on EvoSuite suites to reach 88-98% compilation rates and better readability on 350 classes from Defects4J and SF110 while preserving coverage.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28166","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tandem Reinforcement Learning with Verifiable Rewards","primary_cat":"cs.AI","submitted_at":"2026-06-26T15:00:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TRL extends tandem training to RLVR pipelines, matching GRPO solo reasoning on Qwen3-4B math tasks while improving handoff robustness, reducing distributional drift, and increasing CoT legibility for the junior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28471","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Data and Evaluation Closed-Loop for Model Capability Enhancement","primary_cat":"cs.AI","submitted_at":"2026-06-26T14:45:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes capability slices with dual taxonomies and mapping rules to form a closed loop converting benchmark failures into targeted data interventions, validated via two opposing case studies on BBH and math reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28438","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When AI Reviews Its Own Code: Recursive Self-Training Collapse in Code LLMs","primary_cat":"cs.SE","submitted_at":"2026-06-26T07:35:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Experiments across code LLMs show no-review collapses fastest, human-gated filters slow collapse, and AI self-gates lose effect over time, degenerating to ungated self-training under self-confirming acceptance as proven via gated distributional reweighting and spectral analysis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28434","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SWE-MeM: Learning Adaptive Memory Management for Long-Horizon Coding Agents","primary_cat":"cs.SE","submitted_at":"2026-06-26T04:55:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SWE-MeM introduces adaptive memory management for coding agents via synthesized trajectories and Memory-aware GRPO, reporting 43.4% and 60.2% resolve rates on SWE-Bench Verified for 4B and 30B models while beating baselines on performance and token use.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27550","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EntMTP: Accelerating LLM Inference with Entropy Guided Multi Token Prediction","primary_cat":"cs.CL","submitted_at":"2026-06-25T20:54:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EntMTP is a training-free entropy-guided scheduler for multi-token prediction that dynamically selects from task-specific Pareto-optimal trees to accelerate LLM inference by up to 1.36x on benchmarks without quality loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27443","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Does Personality Composition Matter for Multi-Agent LLM Teams?","primary_cat":"cs.AI","submitted_at":"2026-06-25T18:13:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Empirical study finds that personality composition in multi-agent LLM teams affects performance in a task-dependent manner, with minimal impact on coding milestones but substantial degradation in collaboration and bargaining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27369","ref_index":58,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforcement Learning without Ground-Truth Solutions can Improve LLMs","primary_cat":"cs.LG","submitted_at":"2026-06-25T17:59:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RiVER applies calibrated ranking rewards from execution scores to train LLMs on score-based tasks without ground-truth, producing gains on both heuristic contests and exact-solution coding benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27359","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When are likely answers right? On Sequence Probability and Correctness in LLMs","primary_cat":"stat.ML","submitted_at":"2026-06-25T17:58:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Higher sequence probability predicts correctness across different answers in a dataset but does not reliably improve accuracy when decoding methods or hyperparameters are changed, nor does it indicate correctness for repeated responses to one prompt.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00035","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Making Failure Safe: A Constrained, Verifiable Agent Framework for Open-Web Data Collection","primary_cat":"cs.AI","submitted_at":"2026-06-25T14:05:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A constrained verifiable agent framework for open-web data collection achieves zero execution-stage LLM tokens and lowest wall-clock time on 80 verified tasks by shifting from free-form code to typed JSON configs with taxonomy and static DAG execution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26979","ref_index":15,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How Much Static Structure Do Code Agents Need? A Study of Deterministic Anchoring","primary_cat":"cs.SE","submitted_at":"2026-06-25T12:50:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"An empirical study finds that injecting call/inheritance topology as comments improves LLM code agent localization by 2.2pp, shortens trajectories by 1.6 rounds, and halves run-to-run variance on medium repositories via a deterministic anchoring effect.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26978","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"To Run or Not to Run: Analyzing the Cost-Effectiveness of Code Execution in LLM-Based Program Repair","primary_cat":"cs.SE","submitted_at":"2026-06-25T12:49:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical analysis of LLM repair agents shows execution provides concentrated benefits, with restrictions causing only a 1.25 pp non-significant drop in resolve rate while cutting token and time costs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26959","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Shift to Agentic AI: Evidence from Codex","primary_cat":"econ.GN","submitted_at":"2026-06-25T12:32:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Codex usage data from 2025-2026 show fivefold growth in active users, tenfold rise in complex-task requests, and 13-50x increases in monthly output tokens for legal and research roles.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26744","ref_index":6,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HyperDFlash: Hyper-Connection-Aligned Block Speculative Decoding with Gated Residual Reduction","primary_cat":"cs.LG","submitted_at":"2026-06-25T08:31:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"HyperDFlash improves speculative decoding for hyper-connection LLMs via pre-collapse residual conditioning and a lightweight gated reducer from the target hc_head, outperforming MTP and DFlash in draft acceptance and speedup.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27406","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Evaluation of Implicit Software World Models in Coding LLMs","primary_cat":"cs.SE","submitted_at":"2026-06-25T08:02:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces evaluation of LLMs' implicit software world models via prediction of execution resources on real software tasks, finding modest and brittle performance across models including frontier ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26671","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"NebulaExp-8B: An Empirical Post-Training Pipeline via Full-Scale Ablation Research","primary_cat":"cs.AI","submitted_at":"2026-06-25T07:03:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"NebulaExp reports an empirical post-training pipeline on Qwen3-8B that raises instruct scores from 55.01 to 61.85 and reasoning scores from 73.88 to 75.17 via curated data, SFT, GRPO RL, and OPD/MOPD distillation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26669","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SKILL-DISCO: Distilling and Compiling Agent Traces into Reusable Procedural Skills","primary_cat":"cs.AI","submitted_at":"2026-06-25T07:02:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SkillDisCo distills reusable PFSM subgraphs from successful agent traces and compiles them into callable procedural skills, improving success rates and reducing turns on ALFWorld and WebArena.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26590","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Empirical Software Engineering TerraProbe: A Layered-Oracle Framework for Detecting Deceptive Fixes in LLM-Assisted Terraform","primary_cat":"cs.LG","submitted_at":"2026-06-25T04:21:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TerraProbe shows that targeted Checkov removal overstates LLM Terraform repair success, with 71.4% of plan-compared real-world repairs being deceptive fixes that leave vulnerabilities intact.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26587","ref_index":94,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SharQ: Bridging Activation Sparsity and FP4 Quantization for LLM Inference","primary_cat":"cs.LG","submitted_at":"2026-06-25T04:19:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SharQ combines input-adaptive N:M sparsity and FP4 quantization via sparse backbone plus dense residual, recovering 43-63% of the NVFP4-to-FP16 accuracy gap on Llama and Qwen models without calibration or retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26429","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DualEval: Joint Model-Item Calibration for Unified LLM Evaluation","primary_cat":"cs.LG","submitted_at":"2026-06-24T22:40:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DualEval jointly calibrates LLM abilities and item difficulties/sharpness in a shared latent space using static labels and reward-model scores to unify benchmark and arena-style evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26383","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SOLAR: AI-Powered Speed-of-Light Performance Analysis","primary_cat":"cs.LG","submitted_at":"2026-06-24T21:09:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SOLAR automates derivation of validated speed-of-light performance bounds for deep learning models from source code via LLM-to-IR translation and analytical computation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26327","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EVOM: Agentic Meta-Evolution of Actor-Critic Architectures for Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-06-24T19:13:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EVOM is an LLM-powered agentic meta-evolution framework that discovers superior actor-critic architectures via bi-level optimization, outperforming baselines on Ant-v4 and HalfCheetah-v4.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26287","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GeMoE: Gating Entropy is All You Need for Uncertainty-aware Adaptive Routing in MoE-based Large Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-06-24T18:34:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GeMoE adaptively sets the number of experts per token via gating entropy, retaining 99.5% of static-routing performance while raising average sparsity by 36.5%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26091","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On-Policy Self-Distillation with Sampled Demonstrations Reduces Output Diversity","primary_cat":"cs.LG","submitted_at":"2026-06-24T17:59:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"On-policy self-distillation with sampled demonstrations reduces rollout diversity by amplifying existing probability gaps in the base model, unlike ideal RL which preserves ratios among correct outputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25879","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AI-Assisted Computational Reproducibility on the FABRIC Testbed","primary_cat":"cs.DC","submitted_at":"2026-06-24T14:23:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"AI coding assistants on the FABRIC testbed reproduced three experiments (congestion control, molecular dynamics, genomics) with 4-6x effort reduction but needed human help for analysis stages.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25832","ref_index":54,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MiniOpt: Reasoning to Model and Solve General Optimization Problems with Limited Resources","primary_cat":"cs.LG","submitted_at":"2026-06-24T13:48:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MiniOpt trains LLMs under 10B parameters via RL with OptReward to model and solve general optimization problems, reporting highest average solving accuracy among comparable models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25747","ref_index":7,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CodeChat-Eval: Evaluating Large Language Models in Multi-Turn Code Refinement Dialogues","primary_cat":"cs.SE","submitted_at":"2026-06-24T12:16:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CodeChat-Eval shows LLMs lose 19.2% to 69.2% functional correctness over multi-turn refinement dialogues, with largest drops on logic-level and additive changes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25561","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CrypFormBench: Benchmarking Formal Analysis Capability of Large Language Models for Cryptographic Schemes","primary_cat":"cs.CR","submitted_at":"2026-06-24T08:37:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CrypFormBench is a new benchmark jointly covering symbolic and computational security to evaluate LLMs on five formal analysis capabilities, with results showing top model Claude-3.5 scores 48.7/100 and most models struggling on generation, transformation, and correction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25451","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning with a Single Rollout via Monte Carlo Pass@k Critic","primary_cat":"cs.LG","submitted_at":"2026-06-24T06:26:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SR-PPO trains a Pass@k critic from single-rollout Monte Carlo outcomes to enable token-level advantage estimation in language model RL, yielding stable training and Pass@128 gains on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25450","ref_index":9,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Generalization Spectrum: A Chromatographic Approach to Evaluating Learning Algorithms","primary_cat":"cs.LG","submitted_at":"2026-06-24T06:26:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces the Generalization Spectrum evaluation framework to track per-example generalization across transfer distances in competitive programming tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25331","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Improved Large Language Diffusion Models","primary_cat":"cs.CL","submitted_at":"2026-06-24T02:51:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"iLLaDA is an 8B masked diffusion LM trained from scratch with bidirectional attention, reporting gains of 14-21 points on BBH, ARC, MATH and HumanEval over prior diffusion models while remaining competitive with Qwen2.5-7B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":100,"offset":0}}