mirror of
https://github.com/turboderp-org/exllamav2.git
synced 2026-04-20 14:29:28 +00:00
Add RoPE arguments to quantizer script
This commit is contained in:
22
convert.py
22
convert.py
@@ -27,6 +27,8 @@ parser.add_argument("-b", "--bits", type = float, default = 4.125, help = "Targe
|
||||
parser.add_argument("-hb", "--head_bits", type = int, default = 6, help = "Target bits per weight (head layer)")
|
||||
parser.add_argument("-m", "--measurement", type = str, help = "Reuse previous measurement")
|
||||
parser.add_argument("-ss", "--shard_size", type = float, help = "Max shard size in MB (default: 8192)", default = 8192)
|
||||
parser.add_argument("-rs", "--rope_scale", type = float, default = 1.0, help = "RoPE scaling factor")
|
||||
parser.add_argument("-ra", "--rope_alpha", type = float, default = 1.0, help = "RoPE alpha value (NTK)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -77,6 +79,8 @@ output_measurement = args.output_measurement
|
||||
if output_measurement is not None:
|
||||
if os.path.isdir(output_measurement):
|
||||
output_measurement = os.path.join(output_measurement, "measurement.json")
|
||||
rope_scale = args.rope_scale
|
||||
rope_alpha = args.rope_alpha
|
||||
|
||||
compile_full = args.compile_full
|
||||
|
||||
@@ -142,7 +146,9 @@ if no_resume or not os.path.exists(job_file):
|
||||
"progress": "begin",
|
||||
"shard_size": shard_size,
|
||||
"output_measurement": output_measurement,
|
||||
"compile_full": compile_full
|
||||
"compile_full": compile_full,
|
||||
"rope_scale": rope_scale,
|
||||
"rope_alpha": rope_alpha
|
||||
}
|
||||
|
||||
if reuse_measurement is not None:
|
||||
@@ -190,6 +196,12 @@ else:
|
||||
print(f" -- Measurement will be saved to {job['output_measurement']}")
|
||||
print(f" !! Conversion script will end after measurement pass")
|
||||
|
||||
if job["rope_scale"] is not None:
|
||||
print(f" -- RoPE scale: {job['rope_scale']:.2f}")
|
||||
|
||||
if job["rope_alpha"] is not None:
|
||||
print(f" -- RoPE alpha: {job['rope_alpha']:.2f}")
|
||||
|
||||
# Make sure subfolders exist
|
||||
|
||||
if job["compile_full"] is not None:
|
||||
@@ -211,6 +223,14 @@ max_l = max(job["measurement_length"], job["length"])
|
||||
config.max_input_len = max_l
|
||||
config.max_attention_size = max_l ** 2
|
||||
|
||||
# Set scaling for input model
|
||||
|
||||
if job["rope_scale"] is not None:
|
||||
config.scale_pos_emb = job["rope_scale"]
|
||||
|
||||
if job["rope_alpha"] is not None:
|
||||
config.scale_alpha_value = job["rope_alpha"]
|
||||
|
||||
# Create model without loading weights
|
||||
|
||||
model = ExLlamaV2(config)
|
||||
|
||||
@@ -68,10 +68,18 @@ to run without swapping for smaller models and have to set **-gr** to zero for l
|
||||
- **-hb / --head_bits *int***: Number of bits for the lm_head (output) layer of the model. Default is 6, although that
|
||||
value actually results in a mixed-precision quantization of about 6.3 bits. Options are 2, 3, 4, 5, 6 and 8. (Only 6
|
||||
and 8 appear to be useful.)
|
||||
|
||||
|
||||
- **-ss / --shard_size *float***: Output shard size, in megabytes. Default is 8192. Set this to 0 to disable sharding.
|
||||
Note that writing a very large `.safetensors` file can require a lot of system RAM.
|
||||
|
||||
|
||||
- **-ra / --rope_alpha *float***: RoPE (NTK) alpha to apply to base model for calibration.
|
||||
|
||||
|
||||
- **-rs / --rope_scale *float***: RoPE scaling factor to apply to base model for calibration.
|
||||
|
||||
|
||||
### Notes
|
||||
|
||||
The converter works in two passes; first it measures how quantization impacts each matrix in the model, and then it
|
||||
|
||||
Reference in New Issue
Block a user