{"total":11,"items":[{"citing_arxiv_id":"2607.01874","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SkillCoach: Self-Evolving Rubrics for Evaluating and Enhancing Agentic Skill-Use","primary_cat":"cs.AI","submitted_at":"2026-07-02T08:28:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SkillCoach introduces self-evolving rubrics derived from rollouts to evaluate and supervise four process dimensions of agentic skill-use separately from outcome success.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27593","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Odyssey: Constructing Verifiable Local Truth-Preserving Foundation Models","primary_cat":"cs.AI","submitted_at":"2026-06-25T22:49:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"ODYSSEY is a sheaf-theoretic framework for building verifiable foundation models as compositions of foundries via left and right Kan extensions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24311","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LemonHarness Technical Report","primary_cat":"cs.AI","submitted_at":"2026-06-23T08:44:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LemonHarness constrains LLM agent state changes to a defined workspace, supplies callable rule knowledge, and adds time awareness, yielding 84.49% and 86.52% accuracy on Terminal-Bench 2.0 with two GPT-5 backbones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20475","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Marginal Advantage Accumulation for Memory-Driven Agent Self-Evolution","primary_cat":"cs.LG","submitted_at":"2026-06-18T16:54:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MAA formalizes alignability and comparability conditions and uses differential signals, EMA accumulation, and semantic identity merging to enable cross-batch operation-level evidence accumulation, outperforming batch-level baselines in 14 of 16 settings while matching online methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20333","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SoftSkill: Behavioral Compression for Contextual Adaptation","primary_cat":"cs.AI","submitted_at":"2026-06-18T15:04:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SoftSkill compresses agent skills into length-32 continuous prefixes via next-token training of soft deltas, yielding 5.2-12.5 point gains over SkillOpt on SearchQA and LiveMath while using far fewer tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17819","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Framework for Evaluating Agentic Skills at Scale","primary_cat":"cs.SE","submitted_at":"2026-06-16T11:46:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The authors developed an evaluation framework that generates 1000 tasks from 500 real-world agent skills, applies instruction-following and goal-completion rubrics, and benchmarks 19 proprietary and open-source model configurations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11543","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SkillJuror: Measuring How Agent Skill Organization Changes Runtime Behavior","primary_cat":"cs.AI","submitted_at":"2026-06-10T01:11:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical study finds Progressive Disclosure raises distinct resources touched (1.18 to 3.85) and uptake events (1.33 to 3.92) per trajectory, adds 17 passing trials out of 410 (+4.1%), with gains task-dependent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09774","ref_index":31,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Auto-Configuring Scientific Simulators with Lightweight Coding-Agent Adapters","primary_cat":"cs.AI","submitted_at":"2026-06-08T17:35:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SIGA is a coding-agent adapter using retrieval, procedural memory, and validation gates that raises success rate on GEOS from 0.720 to 0.789 while cutting variance 16x and matching expert quality in minutes instead of hours.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05395","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VASO: Formally Verifiable Self-Evolving Skills for Physical AI Agents","primary_cat":"cs.RO","submitted_at":"2026-06-03T20:02:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VASO is a verification-guided self-evolution framework for LLM robot skill contracts that reaches 97.2% formal-specification compliance on Jackal and quadcopter tasks using under 100 samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01311","ref_index":63,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SkillAdaptor: Self-Adapting Skills for LLM Agents from Trajectories","primary_cat":"cs.CL","submitted_at":"2026-05-31T16:00:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SkillAdaptor introduces step-level failure attribution and targeted skill updates for LLM agents, yielding performance gains on WebShop, PinchBench, and Claw-Eval benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27328","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Governed Evolution of Agent Runtimes through Executable Operational Cognition","primary_cat":"cs.SE","submitted_at":"2026-05-26T17:36:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Introduces HarnessMutation as a governed mechanism for lifecycle-aware runtime adaptation in agent systems, modeling evolution as a bounded observable process over persistent operational memory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}