Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-05-01 03:41:53 +00:00)
llama-quantize: --dry-run option (#1309)
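The change is summarized below as a cleaned-up diff. For orientation, a dry run can be requested either via the new --dry-run flag of llama-quantize or programmatically through the quantization params struct. The following is a minimal sketch of the programmatic route, assuming the fork keeps the upstream llama_model_quantize() C entry point and the LLAMA_FTYPE_MOSTLY_Q4_K_M enumerator; the .gguf file names are placeholders, not files from this commit.

    #include "llama.h"

    #include <cstdio>

    int main() {
        // Start from the library defaults, then ask for a dry run: tensor sizes are
        // computed and reported, but no output file is written (see the hunks below).
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type (assumed available)
        params.dry_run = true;                      // new field added by this commit

        // File names are placeholders for illustration only.
        const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
        if (rc != 0) {
            fprintf(stderr, "dry-run quantization failed\n");
            return 1;
        }
        return 0;
    }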
@@ -355,6 +355,8 @@ int main(int argc, char ** argv) {
             params.quantize_output_tensor = false;
         } else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) {
             params.ignore_imatrix_rules = true;
+        } else if (strcmp(argv[arg_idx], "--dry-run") == 0) {
+            params.dry_run = true;
         } else if (strcmp(argv[arg_idx], "--repack") == 0) {
             params.only_repack = true;
         } else if (strcmp(argv[arg_idx], "--repack-pattern") == 0) {
@@ -490,6 +490,7 @@ extern "C" {
         bool keep_split;            // quantize to the same number of shards
         bool ignore_imatrix_rules;  // If set to true, the built-in rules for refusing to quantize into certain quants without imatrix are ignored
         bool only_repack;           // Only repack tensors
+        bool dry_run;               //
         void * imatrix;             // pointer to importance matrix data
         void * kv_overrides;        // pointer to vector containing overrides
         void * custom_quants;       // pointer to vector containing custom quantization rules
@@ -1190,6 +1190,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     };
     auto new_ofstream = [&](int index) {
+        if (params->dry_run) {
+            return;
+        }
         cur_split = index;
         GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
         std::string fname = fname_out;
@@ -1267,8 +1270,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
         enum ggml_type new_type;
-        void * new_data;
-        size_t new_size;
+        void * new_data = nullptr;
+        size_t new_size = 0;
 
         if (params->only_repack) {
             ggml_type repacked_type = (ggml_type)iqk_repacked_type(tensor);
@@ -1289,19 +1292,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (modify || repacked_type != tensor->type) {
                 new_type = repacked_type;
                 new_size = ggml_nbytes(tensor);
+                if (!params->dry_run) {
                 if ((int)work.size() < new_size) work.resize(new_size);
                 new_data = work.data();
 
                 auto aux_tensor = *tensor;
                 aux_tensor.data = work.data();
                 std::memcpy(aux_tensor.data, tensor->data, new_size);
 
                 if (repacked_type != tensor->type) {
                     iqk_repack_tensor(&aux_tensor);
                     GGML_ASSERT(aux_tensor.type == repacked_type);
                 } else {
                     bool did_modify = iqk_modify_tensor(&aux_tensor);
                     GGML_ASSERT(did_modify);
                 }
+                }
             }
             else {
@@ -1448,17 +1453,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
         }
 
-        float * f32_data;
-
-        if (tensor->type == GGML_TYPE_F32) {
-            f32_data = (float *) tensor->data;
-        } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
-            throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
-        } else {
-            llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
-            f32_data = (float *) f32_conv_buf.data();
-        }
-
         int chunk_size_multiplier = 1;
         auto [working_type, num_rows] = interleaved_properties(new_type);
         if (tensor->ne[1] % num_rows != 0) {
@@ -1470,30 +1464,45 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
         fflush(stdout);
 
-        if (work.size() < (size_t)nelements * 4) {
-            work.resize(nelements * 4); // upper bound on size
-        }
-        new_data = work.data();
-
-        const int64_t n_per_row = tensor->ne[0];
-        const int64_t nrows = tensor->ne[1];
-
-        static const int64_t min_chunk_size = 32 * 512;
-        const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
-                                   chunk_size_multiplier;
-
-        const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
-        const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
-        const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
-
-        // quantize each expert separately since they have different importance matrices
-        new_size = 0;
-        for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
-            const float * f32_data_03 = f32_data + i03 * nelements_matrix;
-            void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
-            const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
-
-            new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
-        }
+        if (params->dry_run) {
+            new_size = tensor->ne[2] * tensor->ne[1] * ggml_row_size(new_type, tensor->ne[0]);
+        } else {
+            float * f32_data;
+
+            if (tensor->type == GGML_TYPE_F32) {
+                f32_data = (float *) tensor->data;
+            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
+            } else {
+                llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+                f32_data = (float *) f32_conv_buf.data();
+            }
+
+            if (work.size() < (size_t)nelements * 4) {
+                work.resize(nelements * 4); // upper bound on size
+            }
+            new_data = work.data();
+
+            const int64_t n_per_row = tensor->ne[0];
+            const int64_t nrows = tensor->ne[1];
+
+            static const int64_t min_chunk_size = 32 * 512;
+            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
+                                       chunk_size_multiplier;
+
+            const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+            const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+            const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
+
+            // quantize each expert separately since they have different importance matrices
+            new_size = 0;
+            for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
+                const float * f32_data_03 = f32_data + i03 * nelements_matrix;
+                void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
+                const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
+
+                new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+            }
+        }
         LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
     }
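A note on the dry-run branch in the hunk above: instead of dequantizing and running the quantization kernels, it estimates the output size as ne[2] * ne[1] * ggml_row_size(new_type, ne[0]). A small worked sketch of that arithmetic, using a made-up 4096 x 4096 tensor and Q4_K as the target type (the shape and numbers are illustrative, not taken from the commit):

    #include "ggml.h"

    #include <cstdio>

    int main() {
        // Hypothetical tensor shape: ne[0] = 4096 (row length), ne[1] = 4096 rows, ne[2] = 1 expert.
        const int64_t ne0 = 4096, ne1 = 4096, ne2 = 1;

        // Same formula as the dry-run path: ne[2] * ne[1] * ggml_row_size(new_type, ne[0]).
        // For Q4_K (4.5 bits per weight) this gives 4096 * 4096 * 4.5 / 8 = 9437184 bytes.
        const size_t estimated = ne2 * ne1 * ggml_row_size(GGML_TYPE_Q4_K, ne0);

        printf("estimated size = %8.2f MiB\n", estimated / 1024.0 / 1024.0); // 9.00 MiB
        return 0;
    }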
@@ -1502,13 +1511,15 @@ QuantizationDone:;
         total_size_org += ggml_nbytes(tensor);
         total_size_new += new_size;
 
+        if (!params->dry_run) {
         // update the gguf meta data as we go
         gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
         gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
 
         // write tensor data + padding
         fout.write((const char *) new_data, new_size);
         zeros(fout, GGML_PAD(new_size, align) - new_size);
+        }
     }
     close_ofstream();
     for (auto & c:ctx_outs) {
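Only the gguf metadata update and the writes to fout are gated by the new check in the hunk above; the size accounting just before it still runs, so a dry run prints the usual per-tensor and total size report. For reference, a tiny sketch of the MiB arithmetic behind that report, with made-up byte counts:

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Hypothetical totals, only to illustrate the conversion used by the log line.
        const size_t total_size_org = 13000000000ull; // input bytes (e.g. an F16 model)
        const size_t total_size_new =  3900000000ull; // estimated quantized bytes
        printf("size = %8.2f MiB -> %8.2f MiB\n",
               total_size_org / 1024.0 / 1024.0,  // ~12397.77 MiB
               total_size_new / 1024.0 / 1024.0); // ~3719.33 MiB
        return 0;
    }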
@@ -4412,6 +4412,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.keep_split =*/ false,
         /*.ignore_imatrix_rules =*/ false,
         /*.only_repack =*/ false,
+        /*.dry_run =*/ false,
         /*.imatrix =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
         /*.custom_quants =*/ nullptr,