Update examples (auto GPU split)

2026-04-20 14:29:28 +00:00 · 2023-10-22 19:32:26 +02:00
parent 7f35594a54
commit 7a783b3824
5 changed files with 30 additions and 21 deletions
--- a/examples/chat.py
+++ b/examples/chat.py
@@ -78,9 +78,9 @@ if system_prompt is None: system_prompt = prompt_format.default_system_prompt()

 model_init.check_args(args)
 model_init.print_options(args)
-model, tokenizer = model_init.init(args)
+model, tokenizer = model_init.init(args, allow_auto_split = True)

-# Initialize draft model if provided
+# Initialize draft model if provided (assumed to always fit on the first device)

 draft_model = None
 draft_cache = None
@@ -119,9 +119,16 @@ if args.draft_model_dir:
 # Create cache

 if args.cache_8bit:
-    cache = ExLlamaV2Cache_8bit(model)
+    cache = ExLlamaV2Cache_8bit(model, lazy = not model.loaded)
 else:
-    cache = ExLlamaV2Cache(model)
+    cache = ExLlamaV2Cache(model, lazy = not model.loaded)
+
+# Load the model now if auto split is enabled
+
+if not model.loaded:
+
+    print(" -- Loading model...")
+    model.load_autosplit(cache)

 # Chat context

--- a/examples/inference.py
+++ b/examples/inference.py
@@ -27,14 +27,11 @@ config.prepare()
 model = ExLlamaV2(config)
 print("Loading model: " + model_directory)

-# allocate 18 GB to CUDA:0 and 24 GB to CUDA:1.
-# (Call `model.load()` if using a single GPU.)
-model.load([18, 24])
+cache = ExLlamaV2Cache(model, lazy = True)
+model.load_autosplit(cache)

 tokenizer = ExLlamaV2Tokenizer(config)

-cache = ExLlamaV2Cache(model)
-
 # Initialize generator

 generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
--- a/examples/speculative.py
+++ b/examples/speculative.py
@@ -32,13 +32,11 @@ draft_config.prepare()
 draft_config.max_seq_len = 2048

 draft = ExLlamaV2(draft_config)
-draft.load([24, 0])
-
 model = ExLlamaV2(model_config)
-model.load([14, 24])
-
-model_cache = ExLlamaV2Cache(model)
-draft_cache = ExLlamaV2Cache(draft)
+model_cache = ExLlamaV2Cache(model, lazy = True)
+draft_cache = ExLlamaV2Cache(draft, lazy = True)
+draft.load_autosplit(draft_cache)
+model.load_autosplit(model_cache)

 tokenizer = ExLlamaV2Tokenizer(model_config)

--- a/examples/streaming.py
+++ b/examples/streaming.py
@@ -26,12 +26,12 @@ config.prepare()

 model = ExLlamaV2(config)
 print("Loading model: " + model_directory)
-model.load([16, 24])
+
+cache = ExLlamaV2Cache(model, lazy = True)
+model.load_autosplit(cache)

 tokenizer = ExLlamaV2Tokenizer(config)

-cache = ExLlamaV2Cache(model)
-
 # Initialize generator

 generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
--- a/examples/ws_server.py
+++ b/examples/ws_server.py
@@ -25,11 +25,18 @@ model_init.add_args(parser)
 args = parser.parse_args()
 model_init.check_args(args)
 model_init.print_options(args)
-model, tokenizer = model_init.init(args)
+model, tokenizer = model_init.init(args, allow_auto_split = True)

-# Create cache
+# With --gpu_split auto the model is not loaded yet: create a lazy cache first, then load the model with auto split

-cache = ExLlamaV2Cache(model)
+if not model.loaded:
+    cache = ExLlamaV2Cache(model, lazy = True)
+    model.load_autosplit(cache)
+
+# Otherwise the model is already loaded, so create the cache directly
+
+else:
+    cache = ExLlamaV2Cache(model)

 # Create server