diff --git a/exllamav2/model.py b/exllamav2/model.py
index 384a789..7296b6e 100644
--- a/exllamav2/model.py
+++ b/exllamav2/model.py
@@ -21,12 +21,16 @@ os.environ["CUDA_MODULE_LOADING"] = "LAZY"
 # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
 
 import torch
-
 if not (torch.version.cuda or torch.version.hip):
     print("")
     print(f" ## Warning: The installed version of PyTorch is {torch.__version__} and does not support CUDA or ROCm.")
     print("")
 
+# PyTorch, especially v2.3.1, gets confused when working with small CPU tensors and likes to use way too many worker
+# threads for small operations, adding considerable overhead. Limit it to a single thread to avoid that (globally
+# because that seems to be the only way)
+torch.set_num_threads(1)
+
 import math
 from exllamav2.config import ExLlamaV2Config
 from exllamav2.cache import ExLlamaV2CacheBase
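To illustrate the overhead the patch comment describes, here is a minimal, standalone benchmark sketch that times a small CPU matmul under the default intra-op thread pool and again after `torch.set_num_threads(1)`. It is not part of the patch: the `time_small_op` helper, the 64x64 tensor size, and the iteration counts are arbitrary illustrative choices, and actual numbers depend on the machine and PyTorch build.

```python
import time
import torch

def time_small_op(iters: int = 10_000) -> float:
    """Return average seconds per call for a small CPU matmul."""
    x = torch.randn(64, 64)
    # Warm up so one-time allocation costs don't skew the measurement
    for _ in range(100):
        x @ x
    start = time.perf_counter()
    for _ in range(iters):
        x @ x
    return (time.perf_counter() - start) / iters

default_threads = torch.get_num_threads()
t_default = time_small_op()

# The global setting applied in the patch above
torch.set_num_threads(1)
t_single = time_small_op()

print(f"{default_threads} threads: {t_default * 1e6:.2f} us/op")
print(f"1 thread:          {t_single * 1e6:.2f} us/op")
```

On machines where the default thread count is high relative to the work per op, the single-thread run should show lower or comparable per-call latency, which is the rationale for making the setting global rather than scoping it per operation (PyTorch exposes no per-call thread control for this).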