Update examples (auto GPU split)

This commit is contained in:
turboderp
2023-10-22 19:32:26 +02:00
parent 7f35594a54
commit 7a783b3824
5 changed files with 30 additions and 21 deletions

View File

@@ -78,9 +78,9 @@ if system_prompt is None: system_prompt = prompt_format.default_system_prompt()
 model_init.check_args(args)
 model_init.print_options(args)
-model, tokenizer = model_init.init(args)
+model, tokenizer = model_init.init(args, allow_auto_split = True)
-# Initialize draft model if provided
+# Initialize draft model if provided, assume it always fits on first device
 draft_model = None
 draft_cache = None
@@ -119,9 +119,16 @@ if args.draft_model_dir:
 # Create cache
 if args.cache_8bit:
-    cache = ExLlamaV2Cache_8bit(model)
+    cache = ExLlamaV2Cache_8bit(model, lazy = not model.loaded)
 else:
-    cache = ExLlamaV2Cache(model)
+    cache = ExLlamaV2Cache(model, lazy = not model.loaded)
+# Load model now if auto split enabled
+if not model.loaded:
+    print(" -- Loading model...")
+    model.load_autosplit(cache)
 # Chat context

View File

@@ -27,14 +27,11 @@ config.prepare()
 model = ExLlamaV2(config)
 print("Loading model: " + model_directory)
-# allocate 18 GB to CUDA:0 and 24 GB to CUDA:1.
-# (Call `model.load()` if using a single GPU.)
-model.load([18, 24])
+cache = ExLlamaV2Cache(model, lazy = True)
+model.load_autosplit(cache)
 tokenizer = ExLlamaV2Tokenizer(config)
-cache = ExLlamaV2Cache(model)
 # Initialize generator
 generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)

View File

@@ -32,13 +32,11 @@ draft_config.prepare()
 draft_config.max_seq_len = 2048
 draft = ExLlamaV2(draft_config)
-draft.load([24, 0])
 model = ExLlamaV2(model_config)
-model.load([14, 24])
-model_cache = ExLlamaV2Cache(model)
-draft_cache = ExLlamaV2Cache(draft)
+model_cache = ExLlamaV2Cache(model, lazy = True)
+draft_cache = ExLlamaV2Cache(draft, lazy = True)
+draft.load_autosplit(draft_cache)
+model.load_autosplit(model_cache)
 tokenizer = ExLlamaV2Tokenizer(model_config)

View File

@@ -26,12 +26,12 @@ config.prepare()
 model = ExLlamaV2(config)
 print("Loading model: " + model_directory)
-model.load([16, 24])
+cache = ExLlamaV2Cache(model, lazy = True)
+model.load_autosplit(cache)
 tokenizer = ExLlamaV2Tokenizer(config)
-cache = ExLlamaV2Cache(model)
 # Initialize generator
 generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)

View File

@@ -25,11 +25,18 @@ model_init.add_args(parser)
 args = parser.parse_args()
 model_init.check_args(args)
 model_init.print_options(args)
-model, tokenizer = model_init.init(args)
+model, tokenizer = model_init.init(args, allow_auto_split = True)
 # Create cache
-cache = ExLlamaV2Cache(model)
+# Load model after cache if --gpu_split auto
+if not model.loaded:
+    cache = ExLlamaV2Cache(model, lazy = True)
+    model.load_autosplit(cache)
+# Else create cache
+else:
+    cache = ExLlamaV2Cache(model)
 # Create server