{"total":11,"items":[{"citing_arxiv_id":"2605.12147","ref_index":22,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"PrivacySIM: Evaluating LLM Simulation of User Privacy Behavior","primary_cat":"cs.CR","submitted_at":"2026-05-12T14:05:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PrivacySIM shows that conditioning LLMs on user personas like demographics and attitudes improves simulation of privacy choices but reaches only 40.4% accuracy against real responses from 1,000 users.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11653","ref_index":21,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Every Bit, Everywhere, All at Once: A Binomial Multibit LLM Watermark","primary_cat":"cs.CR","submitted_at":"2026-05-12T07:14:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A binomial multibit watermarking scheme encodes every payload bit at each LLM token with dynamic redirection, outperforming baselines in accuracy and robustness for large payloads.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11128","ref_index":24,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Sampling More, Getting Less: Calibration is the Diversity Bottleneck in LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-11T18:36:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Diversity collapse in LLMs arises from order and shape miscalibration in token probability distributions at inference time, not from sampling methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10325","ref_index":12,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Verifiable Process Rewards for Agentic Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-11T10:30:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Verifiable Process Rewards (VPR) converts symbolic oracles into dense turn-level supervision for reinforcement learning in agentic reasoning, outperforming outcome-only rewards and transferring to general benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10168","ref_index":43,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"ASTRA-QA: A Benchmark for Abstract Question Answering over Documents","primary_cat":"cs.CL","submitted_at":"2026-05-11T08:17:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ASTRA-QA is a benchmark for abstract document question answering that uses explicit topic sets, unsupported content annotations, and evidence alignments to enable direct scoring of coverage and hallucination.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09860","ref_index":30,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"When to Re-Commit: Temporal Abstraction Discovery for Long-Horizon Vision-Language Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-11T01:43:13+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"State-conditioned commitment depth in a vision-language policy Pareto-dominates fixed-depth baselines on Sliding Puzzle and Sokoban, raising solve rates by up to 12.5 points while using 25% fewer actions and beating larger models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09806","ref_index":38,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"LEAD: Length-Efficient Adaptive and Dynamic Reasoning for Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-10T23:05:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LEAD uses online adaptive mechanisms including Potential-Scaled Instability and symmetric efficiency rewards based on correct rollouts to achieve higher accuracy-efficiency scores with substantially shorter reasoning outputs than base models on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08755","ref_index":26,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"LAQuant: A Simple Overhead-free Large Reasoning Model Quantization by Layer-wise Lookahead Loss","primary_cat":"cs.LG","submitted_at":"2026-05-09T07:35:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LAQuant improves long-decoding accuracy on quantized reasoning models like Qwen3-4B by 15pp on AIME25 via layer-wise lookahead loss, achieving 3.42x speedup over FP16.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07985","ref_index":24,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Dooly: Configuration-Agnostic, Redundancy-Aware Profiling for LLM Inference Simulation","primary_cat":"cs.DC","submitted_at":"2026-05-08T16:44:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Dooly reduces LLM inference profiling costs by 56.4% via configuration-agnostic taint-based labeling and selective database reuse, delivering simulation accuracy within 5% MAPE for TTFT and 8% for TPOT across 12 models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07721","ref_index":51,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Memory-Efficient Looped Transformer: Decoupling Compute from Memory in Looped Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-08T13:25:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MELT decouples reasoning depth from memory in looped LLMs by sharing a single gated KV cache per layer and using two-phase chunk-wise distillation from Ouro, delivering constant memory use while matching or beating standard LLM performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07579","ref_index":14,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Your Language Model is Its Own Critic: Reinforcement Learning with Value Estimation from Actor's Internal States","primary_cat":"cs.LG","submitted_at":"2026-05-08T10:49:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"POISE trains a lightweight probe on the actor's internal states to predict expected rewards for RLVR, matching DAPO performance on math benchmarks with lower compute by avoiding extra rollouts or critic models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}