mirror of
https://github.com/turboderp-org/exllamav2.git
synced 2026-04-20 14:29:28 +00:00
Added --gpu_split explanation on test_inference.py
This commit is contained in:
committed by
GitHub
parent
4ca5ca35a6
commit
2ee57974dc
@@ -54,6 +54,7 @@ parser.add_argument("-nwu", "--no_warmup", action = "store_true", help = "Skip w
|
||||
# Boolean switch (store_true): set only when -sl / --stream_layers is passed.
parser.add_argument(
    "-sl",
    "--stream_layers",
    action="store_true",
    help="Load model layer by layer (perplexity evaluation only)",
)
|
||||
# argparse rejects any value outside `choices`; "wiki2" is the only one listed.
parser.add_argument(
    "-sp",
    "--standard_perplexity",
    choices=["wiki2"],
    help="Run standard (HF) perplexity test, stride 512 (experimental)",
)
|
||||
# Received as a raw string; any parsing of the value happens outside this line.
parser.add_argument(
    "-rr",
    "--rank_reduce",
    type=str,
    help="Rank-reduction for MLP layers of model, in reverse order (for experimentation)",
)
|
||||
# Received as a raw string: per the help text, either the literal "auto" or a
# comma-separated per-device list. Interpretation happens outside this line.
parser.add_argument(
    "-gs",
    "--gpu_split",
    type=str,
    help="Split the model between multiple GPUs. Use `--gpu_split auto` for automatic handling or `--gpu_split x,y` to manually assign the VRAM on each device",
)
|
||||
|
||||
# Initialize model and tokenizer
|
||||
|
||||
|
||||
Reference in New Issue
Block a user