From fd78fe520a32ae6a0dab81727187c6a92142a9fa Mon Sep 17 00:00:00 2001
From: Jianwei Dong <dongjw24@mails.tsinghua.edu.cn>
Date: Mon, 1 Dec 2025 14:15:14 +0800
Subject: [PATCH] fix(scripts): resolve OOM when converting gpu weights and
 update README (#1640)

---
 kt-kernel/scripts/README.md              | 163 +++++++++++++++++----
 kt-kernel/scripts/convert_gpu_weights.py | 176 ++++++++++++++++-------
 2 files changed, 266 insertions(+), 73 deletions(-)

diff --git a/kt-kernel/scripts/README.md b/kt-kernel/scripts/README.md
index de3afc7..03585fc 100644
--- a/kt-kernel/scripts/README.md
+++ b/kt-kernel/scripts/README.md
@@ -3,7 +3,7 @@
 KT-Kernel provides weight conversion tools for CPU-GPU hybrid inference (e.g., integrating KTransformers with SGLang). Both tools work together to enable heterogeneous expert placement:
 
 - **CPU Weights (`convert_cpu_weights.py`)**: Quantize weights to INT4/INT8 with AMX optimization for CPU-resident "cold" experts
-- **GPU Weights (`convert_gpu_weights.py`)**: Apply GPTQ quantization (W4A16/W8A16) for GPU-resident "hot" experts
+- **GPU Weights (`convert_gpu_weights.py`)**: Apply GPTQ/RTN quantization (W4A16/W8A16) for GPU-resident "hot" experts
 
 ---
 
@@ -165,43 +165,118 @@ pip install accelerate transformers llmcompressor datasets
 **Required packages:**
 - `accelerate`: For distributed model loading and device mapping
 - `transformers`: For model and tokenizer loading
-- `llmcompressor`: For GPTQ quantization
-- `datasets`: For calibration data loading
+- `llmcompressor`: For quantization (supports GPTQ and RTN methods)
+- `datasets`: For calibration data loading (GPTQ only)
+
+**Documentation:** This tool is based on llmcompressor. For more details, see [llmcompressor quantization guide](https://docs.vllm.ai/projects/llm-compressor/en/latest/getting-started/compress/#select-a-quantization-method-and-scheme).
 
 ### Overview
 
-Apply GPTQ quantization to model weights for GPU-resident "hot" experts (frequently accessed) in CPU-GPU hybrid inference. This tool works together with `convert_cpu_weights.py` to enable heterogeneous expert placement:
+Apply weight quantization to model weights for GPU-resident "hot" experts (frequently accessed) in CPU-GPU hybrid inference. This tool works together with `convert_cpu_weights.py` to enable heterogeneous expert placement:
 
-- **GPU-resident experts** ("hot" experts) use GPTQ quantization (this tool) for efficient GPU memory usage
+- **GPU-resident experts** ("hot" experts) use GPTQ/RTN quantization (this tool) for efficient GPU memory usage
 - **CPU-resident experts** ("cold" experts) use AMX-optimized INT4/INT8 quantization (convert_cpu_weights.py)
 - **Attention layers, gates, and shared experts** remain in higher precision
 
 This approach maximizes throughput and resource utilization by intelligently distributing experts across CPUs and GPUs.
 
+### Quantization Methods
+
+#### 1. GPTQ (Calibration-based, Default)
+**Pros:**
+- Higher accuracy through calibration-based quantization
+- Recommended for production deployments
+
+**Cons:**
+- Requires calibration dataset
+- Slower quantization process
+- Higher memory requirements (needs Hessian matrix)
+
+#### 2. RTN (Round-To-Nearest)
+**Pros:**
+- Fast quantization (no calibration needed)
+- Lower memory requirements
+- Good for quick testing and prototyping
+
+**Cons:**
+- Slightly lower accuracy compared to GPTQ
+- No calibration optimization
+
 ### Quantization Types
 
-- **W4A16**: 4-bit weights, 16-bit activations (GPTQ4)
-- **W8A16**: 8-bit weights, 16-bit activations (GPTQ8)
+- **W4A16**: 4-bit weights, 16-bit activations (INT4)
+- **W8A16**: 8-bit weights, 16-bit activations (INT8)
 
 ### Basic Usage
 
+#### GPTQ Quantization (Recommended for Production)
 ```bash
 python scripts/convert_gpu_weights.py \
   --model_id /path/to/model \
   --output_dir /path/to/output \
+  --quant_method GPTQ \
   --quant_type W4A16
 ```
 
+#### RTN Quantization (Fast, for Testing)
+```bash
+python scripts/convert_gpu_weights.py \
+  --model_id /path/to/model \
+  --output_dir /path/to/output \
+  --quant_method RTN \
+  --quant_type W4A16
+```
+
+### Memory Requirements
+
+Understanding memory requirements is crucial for successful quantization. The requirements differ significantly between RTN and GPTQ methods.
+
+#### RTN Memory Requirements
+
+RTN only requires memory for quantization parameters (scales/zero-points):
+
+| Component | Requirement |
+|-----------|-------------|
+| **DRAM (CPU Memory)** | ≥ Total model parameters |
+| **VRAM (GPU Memory)** | ≥ Single layer parameters |
+
+**Example: DeepSeek-R1-0528-BF16 (684B parameters)**
+- DRAM: ~1368 GB (684B params × 2 bytes)
+- VRAM: ~22.4 GB (1 layer)
+
+#### GPTQ Memory Requirements
+
+GPTQ requires additional memory for Hessian matrices during calibration:
+
+| Component | Requirement |
+|-----------|-------------|
+| **DRAM (CPU Memory)** | ≥ Total model parameters |
+| **VRAM (GPU Memory)** | ≥ Single layer parameters × 2 |
+
+The Hessian matrix is approximately the same size as the layer weights and is used to increase accuracy recovery.
+
+**Example: DeepSeek-R1-0528-BF16 (684B parameters)**
+- DRAM: ~1368 GB (684B params × 2 bytes)
+- VRAM: ~44.8 GB (1 layer × 2 for Hessian matrix)
+
+#### Method Comparison
+
+| Method | Speed | VRAM | Accuracy | Use Case |
+|--------|-------|------|----------|----------|
+| **RTN** | Fast | Low (~22GB) | Good | Testing, prototyping |
+| **GPTQ** | Slow | High (~45GB) | Better | Production deployment |
+
 ### Advanced Options
 
-#### Calibration Configuration
+#### Calibration Configuration (GPTQ Only)
 
-Control the calibration process for better quantization quality:
+For GPTQ quantization, control the calibration process for better quantization quality:
 
 ```bash
 python scripts/convert_gpu_weights.py \
   --model_id /path/to/model \
   --output_dir /path/to/output \
+  --quant_method GPTQ \
   --quant_type W4A16 \
   --num_calibration_samples 512 \
   --max_sequence_length 2048 \
@@ -209,53 +284,91 @@ python scripts/convert_gpu_weights.py \
   --dataset_split train_sft
 ```
 
-**Options:**
+**Options (GPTQ only):**
 - `--num_calibration_samples`: Number of samples for calibration (default: 512)
 - `--max_sequence_length`: Maximum sequence length (default: 2048)
 - `--dataset`: HuggingFace dataset for calibration
 - `--dataset_split`: Dataset split to use
+- `--dampening_frac`: Dampening fraction to reduce quantization noise (default: 0.1)
 
-#### Memory Management (Avoiding OOM)
+#### Memory Management
 
-GPTQ quantization requires additional GPU memory for Hessian matrix computation beyond model weights. Use `--max_gpu_memory` to limit GPU memory usage and offload remaining layers to CPU:
+Use `--max_gpu_memory` to limit GPU memory usage and offload remaining layers to CPU:
 
 ```bash
 python scripts/convert_gpu_weights.py \
   --model_id /path/to/model \
   --output_dir /path/to/output \
+  --quant_method GPTQ \
   --quant_type W4A16 \
   --max_gpu_memory "40GiB"
 ```
 
-**Recommended settings:**
+**Recommended settings for GPTQ:**
 
-| GPU VRAM | Suggested `--max_gpu_memory` |
-|----------|------------------------------|
-| 24 GiB   | 14-16 GiB                    |
-| 48 GiB   | 30-35 GiB                    |
-| 80 GiB   | 50-60 GiB                    |
+| GPU VRAM | Suggested `--max_gpu_memory` | Notes |
+|----------|------------------------------|-------|
+| 24 GiB   | 10-12 GiB | Reserve ~50% for Hessian |
+| 48 GiB   | 24-30 GiB | Reserve ~40% for Hessian |
+| 80 GiB   | 40-50 GiB | Reserve ~40% for Hessian |
 
-Reserve 40-50% of GPU memory for GPTQ's Hessian matrix computation.
+**Recommended settings for RTN:**
+
+| GPU VRAM | Suggested `--max_gpu_memory` | Notes |
+|----------|------------------------------|-------|
+| 24 GiB   | 18-20 GiB | No Hessian needed |
+| 48 GiB   | 40-45 GiB | No Hessian needed |
+| 80 GiB   | 70-75 GiB | No Hessian needed |
 
 **Options:**
 - `--max_gpu_memory`: Maximum GPU memory for model weights per device (e.g., '40GiB')
 - `--max_cpu_memory`: Maximum CPU memory (default: 1000GiB when `--max_gpu_memory` is set)
 
 **Important:** llmcompressor does not support disk offloading. Ensure your machine has enough GPU + CPU memory to load the entire model. If you still encounter OOM:
-1. Reduce `--num_calibration_samples` (e.g., 256)
-2. Reduce `--max_sequence_length` (e.g., 1024)
-3. Use `--force_cpu` to run entirely on CPU (slower but avoids GPU OOM)
+1. Use RTN instead of GPTQ (requires less memory)
+2. Reduce `--num_calibration_samples` (GPTQ only, e.g., 256)
+3. Reduce `--max_sequence_length` (GPTQ only, e.g., 1024)
+4. Use `--force_cpu` to run entirely on CPU (slower but avoids GPU OOM)
 
 ### Examples
 
-#### Example 1: Quantize Qwen3-Next-80B for Hybrid Inference (W4A16)
+#### Example 1: GPTQ Quantization for Production (Qwen3-Next-80B, W4A16)
 
 ```bash
 python scripts/convert_gpu_weights.py \
-  --model_id /mnt/data/models/Qwen3-Next-80B-A3B-Thinking \
-  --output_dir /mnt/data/models/Qwen3-Next-80B-A3B-Thinking-GPTQ4 \
+  --model_id /mnt/data/models/Qwen3-Next-80B-A3B-Instruct \
+  --output_dir /mnt/data/models/Qwen3-Next-80B-A3B-Instruct-GPTQ-W4A16 \
+  --quant_method GPTQ \
   --quant_type W4A16 \
   --num_calibration_samples 512 \
   --max_sequence_length 2048 \
+  --max_gpu_memory "40GiB" \
+  --trust_remote_code
+```
+
+#### Example 2: RTN Quantization for Fast Testing (DeepSeek-R1, W4A16)
+
+```bash
+python scripts/convert_gpu_weights.py \
+  --model_id /mnt/data/models/DeepSeek-R1-0528-BF16 \
+  --output_dir /mnt/data/models/DeepSeek-R1-0528-RTN-W4A16 \
+  --quant_method RTN \
+  --quant_type W4A16 \
+  --max_gpu_memory "70GiB" \
+  --trust_remote_code
+```
+
+#### Example 3: GPTQ with Custom Calibration Dataset (GLM-4.5-Air, W8A16)
+
+```bash
+python scripts/convert_gpu_weights.py \
+  --model_id /mnt/data/models/GLM-4.5-Air \
+  --output_dir /mnt/data/models/GLM-4.5-Air-GPTQ-W8A16 \
+  --quant_method GPTQ \
+  --quant_type W8A16 \
+  --dataset "tatsu-lab/alpaca" \
+  --dataset_split "train" \
+  --num_calibration_samples 256 \
+  --max_gpu_memory "40GiB" \
   --trust_remote_code
 ```
diff --git a/kt-kernel/scripts/convert_gpu_weights.py b/kt-kernel/scripts/convert_gpu_weights.py
index fc695a1..d709a54 100644
--- a/kt-kernel/scripts/convert_gpu_weights.py
+++ b/kt-kernel/scripts/convert_gpu_weights.py
@@ -3,32 +3,49 @@
 GPU Weight Quantization Tool for KTransformers
 
 This script quantizes model weights for CPU-GPU hybrid inference when integrating
-KTransformers with SGLang. It applies selective quantization (GPTQ) to GPU-resident
-layers while preserving certain components (e.g., attention, gates, shared experts)
-in higher precision.
+KTransformers with SGLang. It supports multiple quantization methods (GPTQ, RTN) and
+applies selective quantization to GPU-resident layers while preserving certain
+components (e.g., attention, gates, shared experts) in higher precision.
 
 Usage:
-    python convert_gpu_weights.py --model_id /path/to/model --output_dir /path/to/output --quant_type W4A16
+    python convert_gpu_weights.py --model_id /path/to/model --output_dir /path/to/output --quant_method GPTQ --quant_type W4A16
 
-Example:
+Example (GPTQ with calibration for best accuracy):
     python convert_gpu_weights.py \
         --model_id /mnt/data2/models/Qwen3-Next-80B-A3B-Instruct \
         --output_dir /mnt/data2/models/Qwen3-Next-80B-A3B-Instruct-GPU-weight \
+        --quant_method GPTQ \
         --quant_type W4A16
+
+Example (RTN for fast quantization without calibration):
     python convert_gpu_weights.py \
         --model_id /mnt/data/models/GLM-4.5-Air \
-        --output_dir /mnt/data/models/GLM-4.5-Air-GPU-weights-test \
+        --output_dir /mnt/data/models/GLM-4.5-Air-GPU-weights-rtn \
+        --quant_method RTN \
         --quant_type W4A16
 """
 
 import os
+import sys
 import warnings
 import argparse
+
+# IMPORTANT: Parse force_cpu argument BEFORE importing torch
+# CUDA_VISIBLE_DEVICES must be set before torch initializes CUDA
+if __name__ == "__main__":
+    # Quick check for --force_cpu flag before full argument parsing
+    if "--force_cpu" in sys.argv:
+        os.environ["CUDA_VISIBLE_DEVICES"] = ""
+        warnings.filterwarnings("ignore", message="Can't initialize NVML")
+        print("🔧 Forced CPU-only mode (CUDA_VISIBLE_DEVICES set before torch import)")
+
+# Now it's safe to import torch and other GPU-dependent libraries
 import torch
 from accelerate import init_empty_weights, infer_auto_device_map
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization.gptq import GPTQModifier
+from llmcompressor.modifiers.quantization import QuantizationModifier
 from datasets import load_dataset
 
 
@@ -40,33 +57,46 @@ def parse_args():
     parser.add_argument("--output_dir", type=str, required=True, help="Path to save the quantized model")
 
     # Optional arguments
+    parser.add_argument(
+        "--quant_method",
+        type=str,
+        choices=["GPTQ", "RTN"],
+        default="GPTQ",
+        help="Quantization method: GPTQ (calibration-based) or RTN (round-to-nearest, no calibration). Default: GPTQ",
+    )
     parser.add_argument(
         "--quant_type",
         type=str,
         choices=["W4A16", "W8A16"],
         default="W8A16",
-        help="Quantization type: W4A16 (GPTQ4) or W8A16 (GPTQ8). Default: W8A16",
+        help="Quantization type: W4A16 (INT4) or W8A16 (INT8). Default: W8A16",
     )
     parser.add_argument(
-        "--num_calibration_samples", type=int, default=512, help="Number of calibration samples. Default: 512"
+        "--num_calibration_samples",
+        type=int,
+        default=512,
+        help="Number of calibration samples (GPTQ only). Default: 512",
     )
     parser.add_argument(
-        "--max_sequence_length", type=int, default=2048, help="Maximum sequence length for calibration. Default: 2048"
+        "--max_sequence_length",
+        type=int,
+        default=2048,
+        help="Maximum sequence length for calibration (GPTQ only). Default: 2048",
     )
     parser.add_argument(
         "--dampening_frac",
         type=float,
         default=0.1,
-        help="Dampening fraction to mitigate quantization noise. Default: 0.1",
+        help="Dampening fraction to mitigate quantization noise (GPTQ only). Default: 0.1",
     )
     parser.add_argument(
         "--dataset",
         type=str,
         default="HuggingFaceH4/ultrachat_200k",
-        help="Dataset for calibration. Default: HuggingFaceH4/ultrachat_200k",
+        help="Dataset for calibration (GPTQ only). Default: HuggingFaceH4/ultrachat_200k",
     )
     parser.add_argument(
-        "--dataset_split", type=str, default="train_sft", help="Dataset split to use. Default: train_sft"
+        "--dataset_split", type=str, default="train_sft", help="Dataset split to use (GPTQ only). Default: train_sft"
     )
     parser.add_argument(
         "--force_cpu", action="store_true", help="Force all computations to CPU (sets CUDA_VISIBLE_DEVICES='')"
@@ -118,15 +148,24 @@ def parse_args():
 
 def setup_environment(force_cpu=False):
     """
-    Setup environment variables and warnings.
+    Verify environment setup (actual setup happens before torch import).
 
     Args:
-        force_cpu: If True, forces all computations to CPU by hiding GPUs
+        force_cpu: If True, was requested to force CPU-only mode
+
+    Note:
+        CUDA_VISIBLE_DEVICES must be set BEFORE importing torch.
+        The actual environment setup is done at module import time.
     """
     if force_cpu:
-        os.environ["CUDA_VISIBLE_DEVICES"] = ""
-        warnings.filterwarnings("ignore", message="Can't initialize NVML")
-        print("🔧 Forced CPU-only mode")
+        # Verify the environment variable was set correctly
+        cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+        if cuda_visible != "":
+            print("⚠️  Warning: force_cpu was requested but CUDA_VISIBLE_DEVICES is not empty")
+            print(f"   Current value: '{cuda_visible}'")
+            print("   This may happen if imported as a module. Recommend running as script.")
+        else:
+            print("✅ CPU-only mode verified (CUDA_VISIBLE_DEVICES is empty)")
 
 
 def get_torch_dtype(dtype_str):
@@ -242,9 +281,21 @@ def main():
     """
     Main function for GPU weight quantization.
 
-    This performs GPTQ quantization on model weights intended for GPU execution
-    in CPU-GPU hybrid inference scenarios. The quantization is selective:
-    - Expert MLP weights are quantized to INT4/INT8 (GPTQ)
+    This performs weight quantization on model weights intended for GPU execution
+    in CPU-GPU hybrid inference scenarios. Supports two quantization methods:
+
+    1. GPTQ (default): Calibration-based quantization for better accuracy
+       - Requires calibration dataset
+       - Higher accuracy but slower
+       - Recommended for production use
+
+    2. RTN (Round-To-Nearest): Fast quantization without calibration
+       - No calibration dataset needed
+       - Faster but may have lower accuracy
+       - Good for quick testing or prototyping
+
+    The quantization is selective:
+    - Expert MLP weights are quantized to INT4/INT8
     - Attention layers, gates, and shared experts remain in original precision
     - Dense layers (if present) are excluded from quantization
 
@@ -262,9 +313,13 @@ def main():
     print(f"🚀 Starting quantization process")
     print(f"   Model: {args.model_id}")
     print(f"   Output: {args.output_dir}")
-    print(f"   Quantization: {args.quant_type}")
-    print(f"   Calibration samples: {args.num_calibration_samples}")
-    print(f"   Max sequence length: {args.max_sequence_length}")
+    print(f"   Quantization method: {args.quant_method}")
+    print(f"   Quantization type: {args.quant_type}")
+    if args.quant_method == "GPTQ":
+        print(f"   Calibration samples: {args.num_calibration_samples}")
+        print(f"   Max sequence length: {args.max_sequence_length}")
+    else:
+        print(f"   Calibration: Not required for {args.quant_method}")
 
     # --------------------------------------------------------------------
     # 0) Check for dense layers and update ignore patterns
@@ -361,24 +416,36 @@ def main():
     # --------------------------------------------------------------------
     # 3) Prepare calibration dataset
     # GPTQ needs calibration data to compute optimal quantization parameters
-    ds = load_and_prepare_dataset(
-        args.dataset,
-        args.dataset_split,
-        args.num_calibration_samples,
-        args.max_sequence_length,
-        tokenizer,
-        args.random_seed,
-    )
+    if args.quant_method == "GPTQ":
+        ds = load_and_prepare_dataset(
+            args.dataset,
+            args.dataset_split,
+            args.num_calibration_samples,
+            args.max_sequence_length,
+            tokenizer,
+            args.random_seed,
+        )
 
     # --------------------------------------------------------------------
     # 4) Create quantization recipe with selective layer exclusion
-    print(f"⚙️  Setting up {args.quant_type} quantization recipe...")
-    recipe = GPTQModifier(
-        targets="Linear",  # Target all Linear layers
-        scheme=args.quant_type,  # W4A16 or W8A16
-        ignore=updated_ignore_patterns,  # Exclude specific patterns
-        dampening_frac=args.dampening_frac,
-    )
+    print(f"⚙️  Setting up {args.quant_method} {args.quant_type} quantization recipe...")
+    if args.quant_method == "GPTQ":
+        # GPTQ: calibration-based quantization for better accuracy
+        recipe = GPTQModifier(
+            targets="Linear",  # Target all Linear layers
+            scheme=args.quant_type,  # W4A16 or W8A16
+            ignore=updated_ignore_patterns,  # Exclude specific patterns
+            dampening_frac=args.dampening_frac,
+        )
+    elif args.quant_method == "RTN":
+        # RTN (Round-To-Nearest): fast quantization without calibration
+        recipe = QuantizationModifier(
+            targets="Linear",  # Target all Linear layers
+            scheme=args.quant_type,  # W4A16 or W8A16
+            ignore=updated_ignore_patterns,  # Exclude specific patterns
+        )
+    else:
+        raise ValueError(f"Unsupported quantization method: {args.quant_method}")
 
     print("🔧 Ignoring the following patterns from quantization:")
     for i, pattern in enumerate(updated_ignore_patterns):
@@ -386,19 +453,32 @@ def main():
         print(f"   {marker} {pattern}")
 
     # --------------------------------------------------------------------
-    # 5) Perform one-shot GPTQ quantization
-    # This applies GPTQ to quantize weights while minimizing accuracy loss
+    # 5) Perform one-shot quantization
+    # GPTQ: calibration-based quantization to minimize accuracy loss
+    # RTN: fast round-to-nearest quantization without calibration
     print("🎯 Starting one-shot quantization...")
-    oneshot(
-        model=model,
-        dataset=ds,
-        recipe=recipe,
-        output_dir=args.output_dir,
-        max_seq_length=args.max_sequence_length,
-        num_calibration_samples=args.num_calibration_samples,
-    )
+    if args.quant_method == "GPTQ":
+        # GPTQ requires calibration dataset
+        oneshot(
+            model=model,
+            dataset=ds,
+            recipe=recipe,
+            output_dir=args.output_dir,
+            max_seq_length=args.max_sequence_length,
+            num_calibration_samples=args.num_calibration_samples,
+        )
+    elif args.quant_method == "RTN":
+        # RTN does not require calibration dataset
+        oneshot(
+            model=model,
+            recipe=recipe,
+            output_dir=args.output_dir,
+        )
+    else:
+        raise ValueError(f"Unsupported quantization method: {args.quant_method}")
 
     print(f"\n✅ Quantized model written to: {args.output_dir}")
+    print(f"   Quantization method: {args.quant_method}")
     print(f"   Quantization type: {args.quant_type}")
     print(f"   Ignored patterns remain in {args.torch_dtype}")
     print("🎉 Quantization completed successfully!")