diff --git a/convert.py b/convert.py index 9634a4d..aa65d7e 100644 --- a/convert.py +++ b/convert.py @@ -1,313 +1 @@ -from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Tokenizer -from exllamav2.architecture import RopeStyle -import argparse, os, shutil -import sys -import json -from conversion.tokenize import tokenize -from conversion.measure import embeddings, measure_quant -from conversion.quantize import quant -from conversion.optimize import optimize -from conversion.compile import compile_model -from conversion.qparams import qparams_headoptions -import torch - -parser = argparse.ArgumentParser(description = "Convert model to ExLlamaV2") -parser.add_argument("-i", "--in_dir", type = str, help = "Input directory", default = "") -parser.add_argument("-o", "--out_dir", type = str, help = "Output (working) directory") -parser.add_argument("-res", "--resume", action = "store_true", help = "Resume job from specified output directory (without specifying other options)") -parser.add_argument("-nr", "--no_resume", action = "store_true", help = "Do not resume an interrupted job (deletes all files in the output directory)") -parser.add_argument("-cf", "--compile_full", type = str, help = "Output folder for compiled model with all config/tokenizer files") -parser.add_argument("-c", "--cal_dataset", type = str, help = "Calibration dataset (.parquet file)") -parser.add_argument("-b", "--bits", type = float, default = 4.125, help = "Target bits per weight") -parser.add_argument("-ss", "--shard_size", type = float, help = "Max shard size in MB (default: 8192)", default = 8192) -parser.add_argument("-rs", "--rope_scale", type = float, help = "RoPE scaling factor") -parser.add_argument("-ra", "--rope_alpha", type = float, help = "RoPE alpha value (NTK)") -parser.add_argument("-hb", "--head_bits", type = int, default = 6, help = "Target bits per weight (head layer)") -parser.add_argument("-om", "--output_measurement", type = str, help = "Only perform measurement pass, then save measurement to the specified file") -parser.add_argument("-m", "--measurement", type = str, help = "Reuse previous measurement") -parser.add_argument("-r", "--dataset_rows", type = int, default = 100, help = "Number of rows to apply from dataset") -parser.add_argument("-mr", "--measurement_rows", type = int, default = 16, help = "Number of rows to apply from dataset when measuring") -parser.add_argument("-l", "--length", type = int, default = 2048, help = "Max no. tokens per sample") -parser.add_argument("-ml", "--measurement_length", type = int, default = 2048, help = "Max no. tokens per sample when measuring") -parser.add_argument("-so", "--status_output", action = "store_true", help = "Include machine-parseable status updates in console output") -parser.add_argument("-hsol", "--hidden_state_offload_layers", type = int, default = 0, help = "Number of hidden/target states to keep in VRAM. Speed-up but increases VRAM usage") - -args = parser.parse_args() - -torch.set_printoptions(precision = 7, sci_mode = False, linewidth = 200) - -# Check some args - -resuming = False -if args.out_dir: - if not args.no_resume: - if os.path.exists(os.path.join(args.out_dir, "job_new.json")): - resuming = True -else: - print(" ## Please specify output/working directory (-o, --out_dir)") - sys.exit() - -if not args.in_dir and not resuming: - print(" ## Please specify input model directory (-i, --in_dir)") - sys.exit() - -if args.length > 2048 or args.measurement_length > 2048: - print(" !! Warning: calibration rows > 2048 tokens may result in excessive VRAM use") - -if not args.head_bits in qparams_headoptions: - print(f" ## Error: {args.head_bits} is not a supported option for head layer bitrate") - sys.exit() - -if args.output_measurement is not None and args.compile_full is not None: - print(" ## Conflicting options: --output_measurement and --compile_full") - sys.exit() - -if args.bits < 2 or args.bits > 8: - print(f" !! Warning: target bitrate {args.bits} will likely not be attainable") - -if not os.path.exists(args.out_dir): - print(f" ## Error: Directory not found: {args.out_dir}") - sys.exit() - -# Create job - -def save_job(): - global job_file, job - with open(job_file, "w", encoding = "utf8") as f: - f.write(json.dumps(job, indent = 4)) - -job_file = os.path.join(args.out_dir, "job_new.json") - -if args.no_resume or not os.path.exists(job_file): - - print(f" -- Beginning new job") - if len(os.listdir(args.out_dir)) != 0: - print(f" !! Warning: Output directory is not empty: {args.out_dir}") - - if args.no_resume: - print(f" !! Cleaning output directory: {args.out_dir}") - for filename in os.listdir(args.out_dir): - file_path = os.path.join(args.out_dir, filename) - if os.path.isfile(file_path): - os.unlink(file_path) - elif os.path.isdir(file_path): - shutil.rmtree(file_path) - -output_measurement = args.output_measurement -if output_measurement is not None: - if os.path.isdir(output_measurement): - output_measurement = os.path.join(output_measurement, "measurement.json") - -job = {"in_dir": args.in_dir, - "out_dir": args.out_dir, - "cal_dataset": args.cal_dataset, - "bits": args.bits, - "dataset_rows": args.dataset_rows, - "measurement_rows": args.measurement_rows, - "length": args.length, - "measurement_length": args.measurement_length, - "head_bits": args.head_bits, - "shard_size": args.shard_size if args.shard_size > 0 else 1024 ** 3, # 1 PB = unlimited, - "compile_full": args.compile_full, - "rope_scale": args.rope_scale, - "rope_alpha": args.rope_alpha, - "output_measurement": output_measurement, - "progress": "begin"} - -if args.measurement is not None: - with open(args.measurement, "r", encoding = "utf8") as f: - imp_measurement = json.load(f) - job["measurement"] = imp_measurement["measurement"] - job["last_module_idx"] = imp_measurement["last_module_idx"] - job["reuse_measurement"] = args.measurement - -# Resume existing job - -if args.no_resume or not os.path.exists(job_file): - pass - -else: - print(f" -- Resuming job") - if args.in_dir: - print(f" !! Note: Overriding options with settings from existing job") - - with open(job_file, "r", encoding = "utf8") as f: - resume_job = json.load(f) - - # Override keys in existing job - del resume_job["out_dir"] - - job.update(resume_job) - if "invalid" in job: - print(" ** Error: Corrupted job") - sys.exit() - - if job["progress"] == "finished": - print(" !! Job is already finished") - sys.exit() - -# Feedback - -print(f" -- Input: {job['in_dir']}") -print(f" -- Output: {job['out_dir']}") -if job.get("cal_dataset"): - print(f" -- Calibration dataset: {job['cal_dataset']}, {job['dataset_rows']} / {job['measurement_rows']} rows, {job['length']} tokens per sample") -else: - print(f" -- Using default calibration dataset") -if job["output_measurement"] is None: - print(f" -- Target bits per weight: {job['bits']} (decoder), {job['head_bits']} (head)") - print(f" -- Max shard size: {job['shard_size']} MB") -else: - print(f" -- Measurement will be saved to {job['output_measurement']}") - print(f" !! Conversion script will end after measurement pass") - -if job['rope_scale']: print(f" -- RoPE scale: {job['rope_scale']:.2f}") -if job['rope_alpha']: print(f" -- RoPE alpha: {job['rope_alpha']:.2f}") - -# Make sure subfolders exist - -if job.get("compile_full"): - print(f" -- Full model will be compiled to: {job['compile_full']}") - if os.path.exists(job["compile_full"]): - if not os.path.isdir(job["compile_full"]): - print(f" ## Error: Output path {job['compile_full']} exists but is not a directory") - sys.exit() - if len(os.listdir(job["compile_full"])) > 0: - print(f" !! Warning: Output path {job['compile_full']} exists but is not empty") - -out_tensor_dir = os.path.join(job["out_dir"], "out_tensor") -if not os.path.exists(out_tensor_dir): - os.makedirs(out_tensor_dir) - -# Create config - -config = ExLlamaV2Config() -config.model_dir = job['in_dir'] -config.qkv_embed = False -config.prepare() - -# Tokenizer - -tokenizer = ExLlamaV2Tokenizer(config) - -# Set scaling for input model - -if job["rope_scale"] is not None: config.scale_pos_emb = job["rope_scale"] -if job["rope_alpha"] is not None: config.scale_alpha_value = job["rope_alpha"] - -# Create model without loading weights - -model = ExLlamaV2(config) -model.load(lazy = True) - -# Limit context length if necessary - -if model.config.arch.rope_style == RopeStyle.NONE: - max_ctx = model.config.max_seq_len - if job["length"] > max_ctx: - print (f" !! Warning: Reducing calibration length to model max context: {max_ctx}") - job["length"] = max_ctx - if job["measurement_length"] > max_ctx: - print (f" !! Warning: Reducing measurement calibration length to model max context: {max_ctx}") - job["measurement_length"] = max_ctx - -# Overridable settings - -job["status_output"] = args.status_output - -# Do the things - -save_job() - -while True: - - progress = job["progress"] - - if progress == "begin": - - if "reuse_measurement" in job: - - print(f" -- Reusing measurement: {job['reuse_measurement']}") - job["progress"] = "optimize" - save_job() - - else: - - print(f" -- Tokenizing samples (measurement)...") - tokenize(job, save_job, tokenizer, measure = True) - job["progress"] = "initial_embeddings" - save_job() - - if progress == "initial_embeddings": - - print(f" -- Token embeddings (measurement)...") - embeddings(job, save_job, model) - job["progress"] = "measure_quant" - save_job() - - if progress == "measure_quant": - print(f" -- Measuring quantization impact...") - - model.unload() - config.max_output_len = 16 - model = ExLlamaV2(config) - model.load(lazy = True) - - status = measure_quant(job, save_job, model, args.hidden_state_offload_layers) # capturing the graceful exits - if status == "interrupted": - print("Process interrupted. Exiting gracefully.") - save_job() - sys.exit(1) - if job["output_measurement"] is None: - job["progress"] = "optimize" - else: - job["progress"] = "finished" - save_job() - - model.unload() - config.max_output_len = None - model = ExLlamaV2(config) - model.load(lazy = True) - - if progress == "optimize": - - print(f" -- Optimizing...") - optimize(job, save_job, model) - job["progress"] = "tokens_cal" - save_job() - - if progress == "tokens_cal": - - print(f" -- Tokenizing samples...") - tokenize(job, save_job, tokenizer) - job["progress"] = "embeddings" - save_job() - - if progress == "embeddings": - print(f" -- Token embeddings again...") - embeddings(job, save_job, model) - job["progress"] = "quant" - save_job() - - if progress == "quant": - - print(f" -- Quantizing...") - quant(job, save_job, model) - job["progress"] = "compile" - save_job() - - if progress == "compile": - - print(f" -- Compiling output file...") - compile_model(job, save_job, model) - job["progress"] = "finished" - save_job() - - if progress == "finished": break - -print(f" -- Finished") - - - - - +import exllamav2.conversion.convert_exl2 diff --git a/conversion/adaptivegptq.py b/exllamav2/conversion/adaptivegptq.py similarity index 100% rename from conversion/adaptivegptq.py rename to exllamav2/conversion/adaptivegptq.py diff --git a/conversion/bot_status.py b/exllamav2/conversion/bot_status.py similarity index 100% rename from conversion/bot_status.py rename to exllamav2/conversion/bot_status.py diff --git a/conversion/compile.py b/exllamav2/conversion/compile.py similarity index 99% rename from conversion/compile.py rename to exllamav2/conversion/compile.py index 92d496f..2273396 100644 --- a/conversion/compile.py +++ b/exllamav2/conversion/compile.py @@ -17,7 +17,7 @@ import torch import os, glob, shutil, json from safetensors import safe_open from safetensors.torch import save_file -from conversion.bot_status import print_stage +from exllamav2.conversion.bot_status import print_stage def _tsize(t): diff --git a/exllamav2/conversion/convert_exl2.py b/exllamav2/conversion/convert_exl2.py new file mode 100644 index 0000000..8a1f194 --- /dev/null +++ b/exllamav2/conversion/convert_exl2.py @@ -0,0 +1,313 @@ +from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Tokenizer +from exllamav2.architecture import RopeStyle +import argparse, os, shutil +import sys +import json +from exllamav2.conversion.tokenize import tokenize +from exllamav2.conversion.measure import embeddings, measure_quant +from exllamav2.conversion.quantize import quant +from exllamav2.conversion.optimize import optimize +from exllamav2.conversion.compile import compile_model +from exllamav2.conversion.qparams import qparams_headoptions +import torch + +parser = argparse.ArgumentParser(description = "Convert model to ExLlamaV2") +parser.add_argument("-i", "--in_dir", type = str, help = "Input directory", default = "") +parser.add_argument("-o", "--out_dir", type = str, help = "Output (working) directory") +parser.add_argument("-res", "--resume", action = "store_true", help = "Resume job from specified output directory (without specifying other options)") +parser.add_argument("-nr", "--no_resume", action = "store_true", help = "Do not resume an interrupted job (deletes all files in the output directory)") +parser.add_argument("-cf", "--compile_full", type = str, help = "Output folder for compiled model with all config/tokenizer files") +parser.add_argument("-c", "--cal_dataset", type = str, help = "Calibration dataset (.parquet file)") +parser.add_argument("-b", "--bits", type = float, default = 4.125, help = "Target bits per weight") +parser.add_argument("-ss", "--shard_size", type = float, help = "Max shard size in MB (default: 8192)", default = 8192) +parser.add_argument("-rs", "--rope_scale", type = float, help = "RoPE scaling factor") +parser.add_argument("-ra", "--rope_alpha", type = float, help = "RoPE alpha value (NTK)") +parser.add_argument("-hb", "--head_bits", type = int, default = 6, help = "Target bits per weight (head layer)") +parser.add_argument("-om", "--output_measurement", type = str, help = "Only perform measurement pass, then save measurement to the specified file") +parser.add_argument("-m", "--measurement", type = str, help = "Reuse previous measurement") +parser.add_argument("-r", "--dataset_rows", type = int, default = 100, help = "Number of rows to apply from dataset") +parser.add_argument("-mr", "--measurement_rows", type = int, default = 16, help = "Number of rows to apply from dataset when measuring") +parser.add_argument("-l", "--length", type = int, default = 2048, help = "Max no. tokens per sample") +parser.add_argument("-ml", "--measurement_length", type = int, default = 2048, help = "Max no. tokens per sample when measuring") +parser.add_argument("-so", "--status_output", action = "store_true", help = "Include machine-parseable status updates in console output") +parser.add_argument("-hsol", "--hidden_state_offload_layers", type = int, default = 0, help = "Number of hidden/target states to keep in VRAM. Speed-up but increases VRAM usage") + +args = parser.parse_args() + +torch.set_printoptions(precision = 7, sci_mode = False, linewidth = 200) + +# Check some args + +resuming = False +if args.out_dir: + if not args.no_resume: + if os.path.exists(os.path.join(args.out_dir, "job_new.json")): + resuming = True +else: + print(" ## Please specify output/working directory (-o, --out_dir)") + sys.exit() + +if not args.in_dir and not resuming: + print(" ## Please specify input model directory (-i, --in_dir)") + sys.exit() + +if args.length > 2048 or args.measurement_length > 2048: + print(" !! Warning: calibration rows > 2048 tokens may result in excessive VRAM use") + +if not args.head_bits in qparams_headoptions: + print(f" ## Error: {args.head_bits} is not a supported option for head layer bitrate") + sys.exit() + +if args.output_measurement is not None and args.compile_full is not None: + print(" ## Conflicting options: --output_measurement and --compile_full") + sys.exit() + +if args.bits < 2 or args.bits > 8: + print(f" !! Warning: target bitrate {args.bits} will likely not be attainable") + +if not os.path.exists(args.out_dir): + print(f" ## Error: Directory not found: {args.out_dir}") + sys.exit() + +# Create job + +def save_job(): + global job_file, job + with open(job_file, "w", encoding = "utf8") as f: + f.write(json.dumps(job, indent = 4)) + +job_file = os.path.join(args.out_dir, "job_new.json") + +if args.no_resume or not os.path.exists(job_file): + + print(f" -- Beginning new job") + if len(os.listdir(args.out_dir)) != 0: + print(f" !! Warning: Output directory is not empty: {args.out_dir}") + + if args.no_resume: + print(f" !! Cleaning output directory: {args.out_dir}") + for filename in os.listdir(args.out_dir): + file_path = os.path.join(args.out_dir, filename) + if os.path.isfile(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + +output_measurement = args.output_measurement +if output_measurement is not None: + if os.path.isdir(output_measurement): + output_measurement = os.path.join(output_measurement, "measurement.json") + +job = {"in_dir": args.in_dir, + "out_dir": args.out_dir, + "cal_dataset": args.cal_dataset, + "bits": args.bits, + "dataset_rows": args.dataset_rows, + "measurement_rows": args.measurement_rows, + "length": args.length, + "measurement_length": args.measurement_length, + "head_bits": args.head_bits, + "shard_size": args.shard_size if args.shard_size > 0 else 1024 ** 3, # 1 PB = unlimited, + "compile_full": args.compile_full, + "rope_scale": args.rope_scale, + "rope_alpha": args.rope_alpha, + "output_measurement": output_measurement, + "progress": "begin"} + +if args.measurement is not None: + with open(args.measurement, "r", encoding = "utf8") as f: + imp_measurement = json.load(f) + job["measurement"] = imp_measurement["measurement"] + job["last_module_idx"] = imp_measurement["last_module_idx"] + job["reuse_measurement"] = args.measurement + +# Resume existing job + +if args.no_resume or not os.path.exists(job_file): + pass + +else: + print(f" -- Resuming job") + if args.in_dir: + print(f" !! Note: Overriding options with settings from existing job") + + with open(job_file, "r", encoding = "utf8") as f: + resume_job = json.load(f) + + # Override keys in existing job + del resume_job["out_dir"] + + job.update(resume_job) + if "invalid" in job: + print(" ** Error: Corrupted job") + sys.exit() + + if job["progress"] == "finished": + print(" !! Job is already finished") + sys.exit() + +# Feedback + +print(f" -- Input: {job['in_dir']}") +print(f" -- Output: {job['out_dir']}") +if job.get("cal_dataset"): + print(f" -- Calibration dataset: {job['cal_dataset']}, {job['dataset_rows']} / {job['measurement_rows']} rows, {job['length']} tokens per sample") +else: + print(f" -- Using default calibration dataset") +if job["output_measurement"] is None: + print(f" -- Target bits per weight: {job['bits']} (decoder), {job['head_bits']} (head)") + print(f" -- Max shard size: {job['shard_size']} MB") +else: + print(f" -- Measurement will be saved to {job['output_measurement']}") + print(f" !! Conversion script will end after measurement pass") + +if job['rope_scale']: print(f" -- RoPE scale: {job['rope_scale']:.2f}") +if job['rope_alpha']: print(f" -- RoPE alpha: {job['rope_alpha']:.2f}") + +# Make sure subfolders exist + +if job.get("compile_full"): + print(f" -- Full model will be compiled to: {job['compile_full']}") + if os.path.exists(job["compile_full"]): + if not os.path.isdir(job["compile_full"]): + print(f" ## Error: Output path {job['compile_full']} exists but is not a directory") + sys.exit() + if len(os.listdir(job["compile_full"])) > 0: + print(f" !! Warning: Output path {job['compile_full']} exists but is not empty") + +out_tensor_dir = os.path.join(job["out_dir"], "out_tensor") +if not os.path.exists(out_tensor_dir): + os.makedirs(out_tensor_dir) + +# Create config + +config = ExLlamaV2Config() +config.model_dir = job['in_dir'] +config.qkv_embed = False +config.prepare() + +# Tokenizer + +tokenizer = ExLlamaV2Tokenizer(config) + +# Set scaling for input model + +if job["rope_scale"] is not None: config.scale_pos_emb = job["rope_scale"] +if job["rope_alpha"] is not None: config.scale_alpha_value = job["rope_alpha"] + +# Create model without loading weights + +model = ExLlamaV2(config) +model.load(lazy = True) + +# Limit context length if necessary + +if model.config.arch.rope_style == RopeStyle.NONE: + max_ctx = model.config.max_seq_len + if job["length"] > max_ctx: + print (f" !! Warning: Reducing calibration length to model max context: {max_ctx}") + job["length"] = max_ctx + if job["measurement_length"] > max_ctx: + print (f" !! Warning: Reducing measurement calibration length to model max context: {max_ctx}") + job["measurement_length"] = max_ctx + +# Overridable settings + +job["status_output"] = args.status_output + +# Do the things + +save_job() + +while True: + + progress = job["progress"] + + if progress == "begin": + + if "reuse_measurement" in job: + + print(f" -- Reusing measurement: {job['reuse_measurement']}") + job["progress"] = "optimize" + save_job() + + else: + + print(f" -- Tokenizing samples (measurement)...") + tokenize(job, save_job, tokenizer, measure = True) + job["progress"] = "initial_embeddings" + save_job() + + if progress == "initial_embeddings": + + print(f" -- Token embeddings (measurement)...") + embeddings(job, save_job, model) + job["progress"] = "measure_quant" + save_job() + + if progress == "measure_quant": + print(f" -- Measuring quantization impact...") + + model.unload() + config.max_output_len = 16 + model = ExLlamaV2(config) + model.load(lazy = True) + + status = measure_quant(job, save_job, model, args.hidden_state_offload_layers) # capturing the graceful exits + if status == "interrupted": + print("Process interrupted. Exiting gracefully.") + save_job() + sys.exit(1) + if job["output_measurement"] is None: + job["progress"] = "optimize" + else: + job["progress"] = "finished" + save_job() + + model.unload() + config.max_output_len = None + model = ExLlamaV2(config) + model.load(lazy = True) + + if progress == "optimize": + + print(f" -- Optimizing...") + optimize(job, save_job, model) + job["progress"] = "tokens_cal" + save_job() + + if progress == "tokens_cal": + + print(f" -- Tokenizing samples...") + tokenize(job, save_job, tokenizer) + job["progress"] = "embeddings" + save_job() + + if progress == "embeddings": + print(f" -- Token embeddings again...") + embeddings(job, save_job, model) + job["progress"] = "quant" + save_job() + + if progress == "quant": + + print(f" -- Quantizing...") + quant(job, save_job, model) + job["progress"] = "compile" + save_job() + + if progress == "compile": + + print(f" -- Compiling output file...") + compile_model(job, save_job, model) + job["progress"] = "finished" + save_job() + + if progress == "finished": break + +print(f" -- Finished") + + + + + diff --git a/conversion/measure.py b/exllamav2/conversion/measure.py similarity index 99% rename from conversion/measure.py rename to exllamav2/conversion/measure.py index 37b994e..f42b670 100644 --- a/conversion/measure.py +++ b/exllamav2/conversion/measure.py @@ -13,14 +13,14 @@ from exllamav2.model import \ from safetensors import safe_open from safetensors.torch import save_file -from conversion.qparams import QParams, qparams_headoptions, qparams_attn, qparams_mlp, get_qparams_reduced -from conversion.adaptivegptq import AdaptiveGPTQ +from exllamav2.conversion.qparams import QParams, qparams_headoptions, qparams_attn, qparams_mlp, get_qparams_reduced +from exllamav2.conversion.adaptivegptq import AdaptiveGPTQ import torch from torch import nn import os, time, math, json import torch.nn.functional as F import gc -from conversion.bot_status import print_stage +from exllamav2.conversion.bot_status import print_stage # graceful exiting import signal diff --git a/conversion/optimize.py b/exllamav2/conversion/optimize.py similarity index 98% rename from conversion/optimize.py rename to exllamav2/conversion/optimize.py index 03c1410..a4eef21 100644 --- a/conversion/optimize.py +++ b/exllamav2/conversion/optimize.py @@ -1,9 +1,9 @@ -from conversion.qparams import QParams +from exllamav2.conversion.qparams import QParams from exllamav2.ext import exllamav2_ext as ext_c, none_tensor import math import itertools import time -from conversion.bot_status import print_stage +from exllamav2.conversion.bot_status import print_stage def optimize(job, save_fn, model): diff --git a/conversion/qparams.py b/exllamav2/conversion/qparams.py similarity index 100% rename from conversion/qparams.py rename to exllamav2/conversion/qparams.py diff --git a/conversion/quantize.py b/exllamav2/conversion/quantize.py similarity index 98% rename from conversion/quantize.py rename to exllamav2/conversion/quantize.py index 59c557b..c6dd3ab 100644 --- a/conversion/quantize.py +++ b/exllamav2/conversion/quantize.py @@ -13,14 +13,14 @@ from exllamav2.model import \ from safetensors import safe_open from safetensors.torch import save_file -from conversion.qparams import QParams, qparams_headoptions, qparams_attn, qparams_mlp, get_qparams_reduced -from conversion.adaptivegptq import AdaptiveGPTQ +from exllamav2.conversion.qparams import QParams, qparams_headoptions, qparams_attn, qparams_mlp, get_qparams_reduced +from exllamav2.conversion.adaptivegptq import AdaptiveGPTQ import torch from torch import nn import os, time, math, json import torch.nn.functional as F import gc -from conversion.bot_status import print_stage +from exllamav2.conversion.bot_status import print_stage def list_live_tensors(): diff --git a/conversion/standard_cal_data/c4.utf8 b/exllamav2/conversion/standard_cal_data/c4.utf8 similarity index 100% rename from conversion/standard_cal_data/c4.utf8 rename to exllamav2/conversion/standard_cal_data/c4.utf8 diff --git a/conversion/standard_cal_data/code.utf8 b/exllamav2/conversion/standard_cal_data/code.utf8 similarity index 100% rename from conversion/standard_cal_data/code.utf8 rename to exllamav2/conversion/standard_cal_data/code.utf8 diff --git a/conversion/standard_cal_data/multilingual.utf8 b/exllamav2/conversion/standard_cal_data/multilingual.utf8 similarity index 100% rename from conversion/standard_cal_data/multilingual.utf8 rename to exllamav2/conversion/standard_cal_data/multilingual.utf8 diff --git a/conversion/standard_cal_data/technical.utf8 b/exllamav2/conversion/standard_cal_data/technical.utf8 similarity index 100% rename from conversion/standard_cal_data/technical.utf8 rename to exllamav2/conversion/standard_cal_data/technical.utf8 diff --git a/conversion/standard_cal_data/tiny.utf8 b/exllamav2/conversion/standard_cal_data/tiny.utf8 similarity index 100% rename from conversion/standard_cal_data/tiny.utf8 rename to exllamav2/conversion/standard_cal_data/tiny.utf8 diff --git a/conversion/standard_cal_data/wiki.utf8 b/exllamav2/conversion/standard_cal_data/wiki.utf8 similarity index 100% rename from conversion/standard_cal_data/wiki.utf8 rename to exllamav2/conversion/standard_cal_data/wiki.utf8 diff --git a/conversion/tokenize.py b/exllamav2/conversion/tokenize.py similarity index 99% rename from conversion/tokenize.py rename to exllamav2/conversion/tokenize.py index 6ffa204..ac7abe8 100644 --- a/conversion/tokenize.py +++ b/exllamav2/conversion/tokenize.py @@ -3,7 +3,7 @@ import pandas, fastparquet import os from safetensors.torch import save_file import random -from conversion.bot_status import print_stage +from exllamav2.conversion.bot_status import print_stage def get_tokens(num_rows, length, filename, tokenizer):