Mirror of https://github.com/turboderp-org/exllamav2.git (synced 2026-04-20 14:29:28 +00:00)
Fast safetensors mode with direct IO and pinned buffer
This commit is contained in:
@@ -81,7 +81,7 @@ if args.stream_layers:
|
||||
|
||||
model_init.check_args(args)
|
||||
model_init.print_options(args)
|
||||
-model, tokenizer = model_init.init(args, allow_auto_split = True, skip_load = args.stream_layers)
+model, tokenizer = model_init.init(args, allow_auto_split = True, skip_load = args.stream_layers, benchmark = True)
|
||||
cache = None
|
||||
|
||||
# Auto split
|
||||
@@ -93,7 +93,10 @@ if not model.loaded and not args.stream_layers:
|
||||
|
||||
print(" -- Loading model...")
|
||||
cache = ExLlamaV2Cache(model, lazy = True)
|
||||
+t = time.time()
model.load_autosplit(cache)
+t = time.time() - t
+print(f" -- Loaded model in {t:.4f} seconds")
|
||||
|
||||
if args.stream_layers:
|
||||
|
||||
|
||||
Reference in New Issue
Block a user