Globally set Torch number of threads to 1

2026-04-19 22:08:55 +00:00 · 2024-06-17 00:39:16 +02:00
parent 5b1b8d4169
commit c2aac982e4
1 changed file with 5 additions and 1 deletion
--- a/exllamav2/model.py
+++ b/exllamav2/model.py
@@ -21,12 +21,16 @@ os.environ["CUDA_MODULE_LOADING"] = "LAZY"
 #         os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"

 import torch
-
 if not (torch.version.cuda or torch.version.hip):
    print("")
    print(f" ## Warning: The installed version of PyTorch is {torch.__version__} and does not support CUDA or ROCm.")
    print("")

+# PyTorch (v2.3.1 in particular) tends to use far too many worker threads for operations on small CPU tensors,
+# which adds considerable overhead. Limit it to a single thread to avoid that. The limit is applied globally
+# because that seems to be the only way to set it.
+torch.set_num_threads(1)
+
 import math
 from exllamav2.config import ExLlamaV2Config
 from exllamav2.cache import ExLlamaV2CacheBase