{"total":40,"items":[{"citing_arxiv_id":"2605.13292","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"IndicMedDialog: A Parallel Multi-Turn Medical Dialogue Dataset for Accessible Healthcare in Indic Languages","primary_cat":"cs.CL","submitted_at":"2026-05-13T10:06:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A parallel multi-turn medical dialogue dataset spanning English and nine Indic languages is created from synthetic consultations to enable personalized AI healthcare interactions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12022","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SAGE: Scalable Automated Robustness Augmentation for LLM Knowledge Evaluation","primary_cat":"cs.CL","submitted_at":"2026-05-12T12:09:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SAGE trains a rubric-based verifier and an RL-optimized generator on seed human data to scalably augment LLM knowledge benchmarks, matching human-annotated quality on HellaSwag at lower cost and generalizing to MMLU.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11365","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Causal Bias Detection in Generative Artifical Intelligence","primary_cat":"cs.AI","submitted_at":"2026-05-12T00:36:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A causal framework unifies fairness analysis across generative AI and standard ML by deriving decompositions that separate biases along causal pathways and differences between real-world and model mechanisms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10639","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Navigating the Sea of LLM Evaluation: Investigating Bias in Toxicity Benchmarks","primary_cat":"cs.AI","submitted_at":"2026-05-11T14:27:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Toxicity benchmarks for LLMs produce inconsistent results when task type, input domain, or model changes, revealing intrinsic evaluation biases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10577","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Training continuously-coupled reconfigurable photonic chips with quantum machine learning","primary_cat":"quant-ph","submitted_at":"2026-05-11T13:49:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A black-box machine learning technique trains continuously-coupled photonic waveguide arrays to implement target unitaries using limited single- and two-photon measurements without requiring detailed internal models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09154","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Predicting Large Model Test Losses 
with a Noisy Quadratic System","primary_cat":"cs.LG","submitted_at":"2026-05-09T20:35:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A noisy quadratic system predicts large model test losses from N, B, K and outperforms Chinchilla's model for extrapolation up to 1000x compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08615","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DSPE: An Energy-Efficient Edge Processor for DeepSeek Inference with MerkleTree-based Incremental Pruning, Multi-Stage Boothing Lookup and Dynamic Adaptive Posit Processing","primary_cat":"cs.AR","submitted_at":"2026-05-09T02:18:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DSPE is an edge processor that achieves 109.4 TFLOPS/W for DeepSeek inference using Merkle tree-based incremental pruning, multi-stage boothing lookup, and dynamic adaptive posit processing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07379","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RELO: Reinforcement Learning to Localize for Visual Object Tracking","primary_cat":"cs.CV","submitted_at":"2026-05-08T07:34:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RELO replaces handcrafted spatial priors with a reinforcement learning policy for target localization in visual tracking and reports 57.5% AUC on LaSOText without template updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06992","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Why Does Agentic Safety Fail to Generalize Across Tasks?","primary_cat":"cs.LG","submitted_at":"2026-05-07T22:16:03+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Agentic safety fails to generalize across tasks because the task-to-safe-controller mapping has a higher Lipschitz constant than the task-to-controller mapping alone, as proven in linear-quadratic control and demonstrated in quadcopter and LLM experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04920","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reinforcement Learning for Compositional Generalization with Outcome-Level Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-06T13:47:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Outcome-level RL with binary or composite rewards improves compositional generalization over supervised fine-tuning by avoiding overfitting to frequent training patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02364","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"InfoLaw: Information Scaling Laws for Large Language Models with Quality-Weighted Mixture Data and 
Repetition","primary_cat":"cs.CL","submitted_at":"2026-05-04T09:07:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InfoLaw models pretraining as information accumulation where quality sets information density and repetition causes scale-dependent diminishing returns, predicting loss with low error on unseen mixtures and larger scales up to 7B models and 425B tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00419","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking LLM Ensembling from the Perspective of Mixture Models","primary_cat":"cs.LG","submitted_at":"2026-05-01T05:31:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ME reinterprets LLM ensembling as a mixture model by sampling a single model stochastically at each token step, matching the ensemble distribution while invoking only one model per step for substantial speed gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21357","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ReaGeo: Reasoning-Enhanced End-to-End Geocoding with LLMs","primary_cat":"cs.AI","submitted_at":"2026-04-23T07:18:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReaGeo is an end-to-end LLM framework for geocoding that uses geohash text generation, Chain-of-Thought spatial reasoning, and distance-based RL to accurately predict points and regions from explicit and vague queries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18753","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Handling and Interpreting Missing Modalities in Patient Clinical Trajectories via Autoregressive Sequence Modeling","primary_cat":"cs.LG","submitted_at":"2026-04-20T18:55:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Autoregressive transformer modeling with missingness-aware contrastive pre-training outperforms baselines on MIMIC-IV and eICU benchmarks and mitigates divergent behavior from removed modalities in clinical trajectories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18264","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Universally Empowering Zeroth-Order Optimization via Adaptive Layer-wise Sampling","primary_cat":"cs.LG","submitted_at":"2026-04-20T13:37:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AdaLeZO uses a non-stationary multi-armed bandit to adaptively allocate perturbation budget across layers in zeroth-order optimization and applies inverse probability weighting to reduce variance while preserving unbiased gradients, delivering 1.7x-3.0x wall-clock speedup on LLaMA and OPT models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15280","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Why Do Vision Language Models 
Struggle To Recognize Human Emotions?","primary_cat":"cs.CV","submitted_at":"2026-04-16T17:49:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VLMs fail at dynamic facial expression recognition because web-scale pretraining exacerbates long-tailed class bias and sparse frame sampling misses micro-expressions; a multi-stage context enrichment strategy using language summaries of skipped frames is proposed to mitigate this.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15009","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Towards Faster Language Model Inference Using Mixture-of-Experts Flow Matching","primary_cat":"cs.AI","submitted_at":"2026-04-16T13:36:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mixture-of-experts flow matching enables non-autoregressive language models to achieve autoregressive-level quality in three sampling steps, delivering up to 1000x faster inference than diffusion models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13549","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reconstruction of a 3D wireframe from a single line drawing via generative depth estimation","primary_cat":"cs.CV","submitted_at":"2026-04-15T06:52:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A latent diffusion model conditioned on line drawings estimates dense depth to reconstruct 3D wireframes, reporting 5.3% average depth error after training on over one million pairs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13417","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The Cognitive Circuit Breaker: A Systems Engineering Framework for Intrinsic AI Reliability","primary_cat":"cs.SE","submitted_at":"2026-04-15T02:34:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The Cognitive Circuit Breaker detects LLM hallucinations by computing the Cognitive Dissonance Delta between semantic confidence and latent certainty from hidden states, adding negligible overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13413","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Dataset-Level Metrics Attenuate Non-Determinism: A Fine-Grained Non-Determinism Evaluation in Diffusion Language Models","primary_cat":"cs.LG","submitted_at":"2026-04-15T02:31:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dataset-level metrics in diffusion language models mask substantial sample-level non-determinism that varies with model and system factors, which a new Factor Variance Attribution metric can decompose.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09995","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentic Application in Power Grid Static Analysis: Automatic Code Generation and 
Error Correction","primary_cat":"eess.SY","submitted_at":"2026-04-11T02:56:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"An LLM agent with static pre-check, dynamic feedback, and semantic validation generates MATPOWER code from natural language for power grid analysis at 82.38% fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07891","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AFGNN: API Misuse Detection using Graph Neural Networks and Clustering","primary_cat":"cs.SE","submitted_at":"2026-04-09T07:01:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AFGNN detects API misuses in Java code more effectively than prior methods by representing usage as graphs and clustering learned embeddings from self-supervised training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06998","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Identifying Topological Invariants of Non-Hermitian Systems via Domain-Adaptive Multimodal Model for Mathematics","primary_cat":"cond-mat.other","submitted_at":"2026-04-08T12:15:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A multimodal model with Qwen Math backbone identifies topological invariants of non-Hermitian systems from eigenvalues and eigenvectors in momentum space.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07403","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RefineRAG: Word-Level Poisoning Attacks via Retriever-Guided Text Refinement","primary_cat":"cs.CR","submitted_at":"2026-04-08T10:33:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RefineRAG achieves 90% attack success on NQ by generating toxic seeds then optimizing them via retriever-in-the-loop word refinement, outperforming prior methods on effectiveness and naturalness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03298","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ENEC: A Lossless AI Model Compression Method Enabling Fast Inference on Ascend NPUs","primary_cat":"cs.AR","submitted_at":"2026-03-28T16:11:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ENEC delivers 3.43X higher throughput than DietGPU and 1.12X better compression ratio than nvCOMP for lossless model weight compression on Ascend NPUs, yielding up to 6.3X end-to-end inference speedup.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.09567","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Reasoning Era: A Survey of Long Chain-of-Thought for Reasoning Large Language Models","primary_cat":"cs.AI","submitted_at":"2025-03-12T17:35:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper unifies 
perspectives on Long CoT in reasoning LLMs by introducing a taxonomy, detailing characteristics of deep reasoning and reflection, and discussing emergence phenomena and future directions.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":",DeepSeek-R1 [227],Kimi-k1.5 [722], T1 [264],ReST-EM [674],SWE-RL [841], DeepScaleR [518],ReST-MCTS* [1032], rSTaR-Math [222], Logic-RL [886], OREAL [522], StepCoder [161], RLSP [962],Verifier [141], TS-LLM [755], STeCa [768], OREO[773], Chu et al. [137], Shen et al. [661],etc. External Exploration(§6.3) Human-drivenExploration e.g.,SPaR [118], Forest-of-thought [54],Scattered ForestSearch [448],Kang et al. [339],AlphaLLM [737],PATHFINDER [213],Least-to-Most [1117], ToT [955], TreeBoN [625], CodeTree [400], Tree-of-Code[565] TouT [556], GoT [48], GraphReason [64], Besta et al. [49], AoT [733], Chen et al. [95],etc. Model-drivenExploration e.g.,DBS [1142], Lehnert et al. [378], MindSTaR [338], Residual-EBM [901], Mulberry [952], C-MCTS[453], PPO-MCTS [478], Llama-Berry [1034], Marco-o1 [1095], AtomThink [879], Puri et al."},{"citing_arxiv_id":"2502.16982","ref_index":93,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Muon is Scalable for LLM Training","primary_cat":"cs.LG","submitted_at":"2025-02-24T09:12:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Muon optimizer with weight decay and update scaling achieves ~2x efficiency over AdamW for large LLMs, shown via the Moonlight 3B/16B MoE model trained on 5.7T tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.09992","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Large Language Diffusion Models","primary_cat":"cs.CL","submitted_at":"2025-02-14T08:23:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"LLaDA is a scalable diffusion-based language model that matches autoregressive LLMs like LLaMA3 8B on tasks and surpasses GPT-4o on reversal poem completion.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"Block diffusion: Interpolating between autoregressive and diffusion language models.arXiv preprint arXiv:2503.09573, 2025. [32] Xiao Bi, Deli Chen, Guanting Chen, Shanhuang Chen, Damai Dai, Chengqi Deng, Honghui Ding, Kai Dong, Qiushi Du, Zhe Fu, et al. Deepseek llm: Scaling open-source language models with longtermism.arXiv preprint arXiv:2401.02954, 2024. [33] Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. Mistral 7b.arXiv preprint arXiv:2310.06825, 2023. [34] Tian Ye, Zicheng Xu, Yuanzhi Li, and Zeyuan Allen-Zhu. 
Physics of Language Models: Part 2.1, Grade-School Math and the Hidden Reasoning Process."},{"citing_arxiv_id":"2501.17811","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Janus-Pro: Unified Multimodal Understanding and Generation with Data and Model Scaling","primary_cat":"cs.AI","submitted_at":"2025-01-29T18:00:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Scaling data, model size, and training optimization on the Janus architecture yields better multimodal understanding and more stable, instruction-following text-to-image generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.06525","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Autoregressive Model Beats Diffusion: Llama for Scalable Image Generation","primary_cat":"cs.CV","submitted_at":"2024-06-10T17:59:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Scaled vanilla autoregressive models based on Llama achieve 2.18 FID on ImageNet 256x256 image generation, beating popular diffusion models without visual inductive biases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.00515","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Survey on Large Language Models for Code Generation","primary_cat":"cs.CL","submitted_at":"2024-06-01T17:48:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A systematic literature review that organizes recent work on LLMs for code generation into a taxonomy covering data curation, model advances, evaluations, ethics, environmental impact, and applications, with benchmark comparisons.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2405.04434","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model","primary_cat":"cs.CL","submitted_at":"2024-05-07T15:56:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DeepSeek-V2 delivers top-tier open-source LLM performance using only 21B active parameters by compressing the KV cache 93.3% and cutting training costs 42.5% via MLA and DeepSeekMoE.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"PROMPT Problem: Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.} Solution: The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$. Final Answer: The final answer is $[2,5)$. I hope it is correct. Problem: If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$ Solution: We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$ Final Answer: The final answer is $24$. I hope it is correct. 
Problem: Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? Solution: If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{align*} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{16} \\end{align*} Final Answer: The final answer is $16$. I hope it is correct. Problem: If the system of equations \\begin{align*} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero. Solution: If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$ Final Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct. Problem: Evaluate $\\log_21$. Solution: Table 27 | An example of MATH. 45 PROMPT You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given tw"},{"citing_arxiv_id":"2404.16821","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites","primary_cat":"cs.CV","submitted_at":"2024-04-25T17:59:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"InternVL 1.5 narrows the performance gap to proprietary multimodal models via a stronger transferable vision encoder, dynamic high-resolution tiling, and curated English-Chinese training data.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[6] Yuelin Bai, Xinrun Du, Yiming Liang, Yonggang Jin, Ziqiang Liu, Junting Zhou, Tianyu Zheng, Xincheng Zhang, Nuo Ma, Zekun Wang, et al. Coig-cqia: Quality is all you need for chinese instruction fine-tuning. arXiv preprint arXiv:2403.18058, 2024. 5, 6 [7] Baichuan. Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305, 2023. 3 [8] Xiao Bi, Deli Chen, Guanting Chen, Shanhuang Chen, Damai Dai, Chengqi Deng, Honghui Ding, Kai Dong, Qiushi Du, Zhe Fu, et al. Deepseek llm: Scaling open- source language models with longtermism. arXiv preprint arXiv:2401.02954, 2024. 
3 [9] Ali Furkan Biten, Ruben Tito, Andres Mafla, Lluis Gomez, Marc ¸al Rusinol, Ernest Valveny, CV Jawahar, and Dimos-"},{"citing_arxiv_id":"2404.06395","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies","primary_cat":"cs.CL","submitted_at":"2024-04-09T15:36:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MiniCPM 1.2B and 2.4B models reach parity with 7B-13B LLMs via model wind-tunnel scaling and a WSD scheduler that yields a higher optimal data-to-model ratio than Chinchilla scaling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.20330","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Are We on the Right Way for Evaluating Large Vision-Language Models?","primary_cat":"cs.CV","submitted_at":"2024-03-29T17:59:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Current LVLM benchmarks overestimate capabilities because many questions can be answered without images due to design flaws or data leakage; MMStar is a human-curated set of 1,500 vision-indispensable samples across 6 capabilities and 18 axes with new metrics for leakage and true multi-modal gain.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.07974","ref_index":245,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","primary_cat":"cs.SE","submitted_at":"2024-03-12T17:58:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LiveCodeBench collects 400 recent contest problems to create a contamination-free benchmark evaluating LLMs on code generation and related capabilities like self-repair and execution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.05525","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeepSeek-VL: Towards Real-World Vision-Language Understanding","primary_cat":"cs.AI","submitted_at":"2024-03-08T18:46:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DeepSeek-VL develops open-source 1.3B and 7B vision-language models that achieve competitive or state-of-the-art results on real-world visual-language benchmarks through diverse data curation, a hybrid vision encoder, and pretraining that preserves language capabilities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.14196","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DeepSeek-Coder: When the Large Language Model Meets Programming -- The Rise of Code Intelligence","primary_cat":"cs.SE","submitted_at":"2024-01-25T14:17:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DeepSeek-Coder open-source models trained on 2T code tokens with fill-in-the-blank pretraining achieve SOTA results among open models and surpass closed-source Codex and GPT-3.5 on 
code benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.06066","ref_index":130,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models","primary_cat":"cs.CL","submitted_at":"2024-01-11T17:31:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DeepSeekMoE 2B matches GShard 2.9B performance and approaches a dense 2B model; the 16B version matches LLaMA2-7B at 40% compute by using fine-grained expert segmentation plus shared experts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.02385","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TinyLlama: An Open-Source Small Language Model","primary_cat":"cs.CL","submitted_at":"2024-01-04T17:54:59+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"TinyLlama is a 1.1B-parameter open-source language model pretrained on 1 trillion tokens that outperforms other open-source models of similar size on downstream tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}
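
The object above is a single paginated citation-listing response: top-level pagination fields (total, items, limit, offset) and, per item, identification (citing_arxiv_id, ref_index, ref_count, confidence), paper metadata (paper_title, primary_cat, submitted_at), review signals (verdict, verdict_confidence, novelty_score, formal_verification, one_line_summary), and optional citation-context fields (context_count, top_context_role, top_context_polarity, context_text). Below is a minimal parsing sketch, assuming the payload has been saved verbatim to a file named citations.json (the filename, and whatever endpoint produced the response, are assumptions, not part of the payload); it reads the pagination fields, tallies verdicts, and prints the entries with higher novelty scores or attached context snippets.

# Minimal sketch for consuming the citation-listing payload shown above.
# Assumes the JSON response was saved verbatim to "citations.json"
# (hypothetical filename; the originating endpoint is not specified here).
import json
from collections import Counter

with open("citations.json", encoding="utf-8") as f:
    page = json.load(f)

# Pagination fields from the top level of the response.
print(f"total={page['total']} limit={page['limit']} offset={page['offset']} "
      f"returned={len(page['items'])}")

# Tally verdicts across the returned items.
verdicts = Counter(item["verdict"] for item in page["items"])
print(dict(verdicts))

# Show higher-novelty items and any item that carries a citation-context snippet.
for item in page["items"]:
    if item["novelty_score"] >= 7.0 or item["context_count"] > 0:
        print(f'{item["citing_arxiv_id"]}  [{item["primary_cat"]}]  '
              f'novelty={item["novelty_score"]}  verdict={item["verdict"]}')
        print(f'  {item["paper_title"]}')
        if item["top_context_role"]:
            print(f'  context role: {item["top_context_role"]}')

Since total (40) does not exceed limit (50) and offset is 0, this single response already contains every item; with a larger total, a caller would be expected to re-request with an increased offset until offset plus the number of returned items reaches total.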