Mirror of https://github.com/kvcache-ai/ktransformers.git
@@ -16,7 +16,7 @@ import numpy as np
 
 # Add parent directory to path to import kt_kernel
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-from kt_kernel import AMXMoEWrapper
+from kt_kernel import KTMoEWrapper
 
 import triton
 import triton.language as tl
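
The hunk above renames the exported wrapper class from AMXMoEWrapper to KTMoEWrapper. For scripts that must run against both pre- and post-rename builds of kt_kernel, a small import shim keeps call sites on the new name; a minimal sketch, assuming only the class name changed:

# Compatibility sketch (assumption: only the class name changed in this release).
# Prefer the new name; fall back to the old export on pre-rename builds.
try:
    from kt_kernel import KTMoEWrapper
except ImportError:  # older kt_kernel builds still export AMXMoEWrapper
    from kt_kernel import AMXMoEWrapper as KTMoEWrapper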
@@ -759,7 +759,7 @@ class OnlineQuantConverter(ConverterBase):
 
         # Create AMXMoEWrapper instance for this layer
         # num_gpu_experts=0 since we're converting all experts to CPU format
-        wrapper = AMXMoEWrapper(
+        wrapper = KTMoEWrapper(
             layer_idx=layer_idx,
             num_experts=self.num_experts,
             num_experts_per_tok=self.num_experts_per_tok,
@@ -768,10 +768,10 @@ class OnlineQuantConverter(ConverterBase):
             num_gpu_experts=0,  # All experts on CPU for conversion
             cpuinfer_threads=self.cpuinfer_threads,
             threadpool_count=self.threadpool_count,
-            amx_weight_path=self.output_path,  # Output path for quantized weights
+            weight_path=self.output_path,  # Output path for quantized weights
             chunked_prefill_size=512,  # Arbitrary value, not critical for conversion
             cpu_save=True,  # Enable saving quantized weights to output
-            amx_method=amx_method,  # Specify quantization method (AMXINT4 or AMXINT8)
+            method=amx_method,  # Specify quantization method (AMXINT4 or AMXINT8)
         )
 
         # Load and quantize weights from tensors
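
Taken together, the second and third hunks reduce to keyword renames at the call site: amx_weight_path becomes weight_path and amx_method becomes method. A hedged sketch of the updated construction follows; the values are placeholders, any arguments falling between the two hunks (old lines 766-767) are omitted, and passing the method as the string "AMXINT4" is an assumption inferred from the inline comment, not a confirmed signature:

from kt_kernel import KTMoEWrapper

# Illustrative call-site sketch after the rename. All values are placeholders;
# arguments elided between the two diff hunks are omitted here.
wrapper = KTMoEWrapper(
    layer_idx=0,
    num_experts=64,
    num_experts_per_tok=6,
    num_gpu_experts=0,          # all experts stay on CPU while converting
    cpuinfer_threads=32,
    threadpool_count=2,
    weight_path="/path/to/output",  # was: amx_weight_path
    chunked_prefill_size=512,       # arbitrary, not critical for conversion
    cpu_save=True,                  # write quantized weights to weight_path
    method="AMXINT4",               # was: amx_method (AMXINT4 or AMXINT8; string form assumed)
)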