mirror of
https://github.com/turboderp-org/exllamav2.git
synced 2026-04-20 14:29:28 +00:00
Add RoPE arguments to quantizer script
This commit is contained in:
22
convert.py
22
convert.py
@@ -27,6 +27,8 @@ parser.add_argument("-b", "--bits", type = float, default = 4.125, help = "Targe
|
||||
parser.add_argument("-hb", "--head_bits", type = int, default = 6, help = "Target bits per weight (head layer)")
|
||||
parser.add_argument("-m", "--measurement", type = str, help = "Reuse previous measurement")
|
||||
parser.add_argument("-ss", "--shard_size", type = float, help = "Max shard size in MB (default: 8192)", default = 8192)
|
||||
parser.add_argument("-rs", "--rope_scale", type = float, default = 1.0, help = "RoPE scaling factor")
|
||||
parser.add_argument("-ra", "--rope_alpha", type = float, default = 1.0, help = "RoPE alpha value (NTK)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -77,6 +79,8 @@ output_measurement = args.output_measurement
|
||||
if output_measurement is not None:
|
||||
if os.path.isdir(output_measurement):
|
||||
output_measurement = os.path.join(output_measurement, "measurement.json")
|
||||
rope_scale = args.rope_scale
|
||||
rope_alpha = args.rope_alpha
|
||||
|
||||
compile_full = args.compile_full
|
||||
|
||||
@@ -142,7 +146,9 @@ if no_resume or not os.path.exists(job_file):
|
||||
"progress": "begin",
|
||||
"shard_size": shard_size,
|
||||
"output_measurement": output_measurement,
|
||||
"compile_full": compile_full
|
||||
"compile_full": compile_full,
|
||||
"rope_scale": rope_scale,
|
||||
"rope_alpha": rope_alpha
|
||||
}
|
||||
|
||||
if reuse_measurement is not None:
|
||||
@@ -190,6 +196,12 @@ else:
|
||||
print(f" -- Measurement will be saved to {job['output_measurement']}")
|
||||
print(f" !! Conversion script will end after measurement pass")
|
||||
|
||||
if job["rope_scale"] is not None:
|
||||
print(f" -- RoPE scale: {job['rope_scale']:.2f}")
|
||||
|
||||
if job["rope_alpha"] is not None:
|
||||
print(f" -- RoPE alpha: {job['rope_alpha']:.2f}")
|
||||
|
||||
# Make sure subfolders exist
|
||||
|
||||
if job["compile_full"] is not None:
|
||||
@@ -211,6 +223,14 @@ max_l = max(job["measurement_length"], job["length"])
|
||||
config.max_input_len = max_l
|
||||
config.max_attention_size = max_l ** 2
|
||||
|
||||
# Set scaling for input model
|
||||
|
||||
if job["rope_scale"] is not None:
|
||||
config.scale_pos_emb = job["rope_scale"]
|
||||
|
||||
if job["rope_alpha"] is not None:
|
||||
config.scale_alpha_value = job["rope_alpha"]
|
||||
|
||||
# Create model without loading weights
|
||||
|
||||
model = ExLlamaV2(config)
|
||||
|
||||
@@ -68,10 +68,18 @@ to run without swapping for smaller models and have to set **-gr** to zero for l
|
||||
- **-hb / --head_bits *int***: Number of bits for the lm_head (output) layer of the model. Default is 6, although that
|
||||
value actually results in a mixed-precision quantization of about 6.3 bits. Options are 2, 3, 4, 5, 6 and 8. (Only 6
|
||||
and 8 appear to be useful.)
|
||||
|
||||
|
||||
- **-ss / --shard_size *float***: Output shard size, in megabytes. Default is 8192. Set this to 0 to disable sharding.
|
||||
Note that writing a very large `.safetensors` file can require a lot of system RAM.
|
||||
|
||||
|
||||
- **-ra / --rope_alpha *float***: RoPE (NTK) alpha to apply to base model for calibration.
|
||||
|
||||
|
||||
- **-rs / --rope_scale *float***: RoPE scaling factor to apply to base model for calibration.
|
||||
|
||||
|
||||
### Notes
|
||||
|
||||
The converter works in two passes; first it measures how quantization impacts each matrix in the model, and then it
|
||||
|
||||
Reference in New Issue
Block a user