mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-24 23:24:13 +00:00
WIP
This commit is contained in:
@@ -1756,6 +1756,15 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
||||
return type_traits[type];
|
||||
}
|
||||
|
||||
static inline int ggml_packed_rows(enum ggml_type type) {
|
||||
return type == GGML_TYPE_BF16_R16 ? 16
|
||||
: type == GGML_TYPE_Q8_K_R8 || type == GGML_TYPE_Q8_KV_R8 ||
|
||||
type == GGML_TYPE_Q8_0_R8 || type == GGML_TYPE_Q4_0_R8 ||
|
||||
type == GGML_TYPE_IQ4_XS_R8 ? 8
|
||||
: type >= GGML_TYPE_Q4_0_R8 && type <= GGML_TYPE_Q8_K_R8 ? 4
|
||||
: 1;
|
||||
}
|
||||
|
||||
//
|
||||
// simd mappings
|
||||
//
|
||||
@@ -10441,10 +10450,15 @@ static void ggml_compute_forward_dup_bytes(
|
||||
|
||||
// parallelize by rows
|
||||
const int nr = ne01;
|
||||
const int n_packed = ggml_packed_rows(dst->type);
|
||||
GGML_ASSERT(nr%n_packed == 0);
|
||||
const int nrp = nr/n_packed;
|
||||
// number of rows per thread
|
||||
const int dr = (nr + nth - 1) / nth;
|
||||
const int drp = (nrp + nth - 1) / nth;
|
||||
const int dr = drp*n_packed;
|
||||
// row range for this thread
|
||||
const int ir0 = dr * ith;
|
||||
if (ir0 >= nr) return;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
if (src0->type == dst->type &&
|
||||
@@ -10569,10 +10583,18 @@ static void ggml_compute_forward_dup_q(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
GGML_ASSERT(ggml_is_quantized(dst->src[0]->type));
|
||||
|
||||
int64_t nrows = ggml_nrows(dst);
|
||||
int ith = params->ith;
|
||||
int nth = params->nth;
|
||||
|
||||
if (dst->src[0]->type == dst->type &&
|
||||
dst->nb[0] == ggml_type_size(dst->type)) {
|
||||
ggml_compute_forward_dup_bytes(params, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (dst->type == GGML_TYPE_Q8_0 && dst->src[0]->type == GGML_TYPE_Q8_0 &&
|
||||
ggml_are_same_shape(dst, dst->src[0])) {
|
||||
|
||||
@@ -10637,12 +10659,15 @@ static void ggml_compute_forward_dup_q(
|
||||
ggml_to_float_t to_float = type_traits[src0->type].to_float;
|
||||
GGML_ASSERT(to_float != NULL);
|
||||
|
||||
int64_t n_per_thread = (nrows + nth - 1)/nth;
|
||||
int n_packed = ggml_packed_rows(src0->type);
|
||||
GGML_ASSERT(src0->ne[1] % n_packed == 0);
|
||||
|
||||
int64_t n_per_thread = n_packed*((nrows/n_packed + nth - 1)/nth);
|
||||
int64_t first_row = ith*n_per_thread;
|
||||
if (first_row >= nrows) return;
|
||||
int64_t last_row = MIN(first_row + n_per_thread, nrows);
|
||||
|
||||
for (int64_t ir = first_row; ir < last_row; ++ir) {
|
||||
for (int64_t ir = first_row; ir < last_row; ir += n_packed) {
|
||||
int64_t i03 = ir/(src0->ne[1]*src0->ne[2]);
|
||||
int64_t i02 = (ir - i03*src0->ne[1]*src0->ne[2])/src0->ne[1];
|
||||
int64_t i01 = ir - i03*src0->ne[1]*src0->ne[2] - i02*src0->ne[1];
|
||||
@@ -10653,7 +10678,7 @@ static void ggml_compute_forward_dup_q(
|
||||
const char * q = (const char *)src0->data + i03*src0->nb[3] + i02*src0->nb[2] + i01*src0->nb[1];
|
||||
char * f = ( char *)dst->data + i3* dst->nb[3] + i2* dst->nb[2] + i1* dst->nb[1];
|
||||
|
||||
to_float((const void *)q, (float *)f, src0->ne[0]);
|
||||
to_float((const void *)q, (float *)f, src0->ne[0]*n_packed);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -8218,6 +8218,9 @@ static bool llm_load_tensors(
|
||||
ggml_set_name(l.computed_wk_b.get(), name.c_str());
|
||||
ggml_backend_buffer_set_usage(l.computed_wk_b->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
ggml_backend_tensor_set(l.computed_wk_b.get(), wk_b->data, 0, ggml_nbytes(wk_b));
|
||||
if (ggml_backend_buffer_is_host(l.computed_wk_b->buffer)) {
|
||||
iqk_modify_tensor(l.computed_wk_b.get());
|
||||
}
|
||||
|
||||
l.wk_b = l.computed_wk_b.get();
|
||||
|
||||
@@ -8243,6 +8246,9 @@ static bool llm_load_tensors(
|
||||
ggml_set_name(l.computed_wv_b.get(), name.c_str());
|
||||
ggml_backend_buffer_set_usage(l.computed_wv_b->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
ggml_backend_tensor_set(l.computed_wv_b.get(), wv_b->data, 0, ggml_nbytes(wv_b));
|
||||
if (ggml_backend_buffer_is_host(l.computed_wv_b->buffer)) {
|
||||
iqk_modify_tensor(l.computed_wv_b.get());
|
||||
}
|
||||
|
||||
l.wv_b = l.computed_wv_b.get();
|
||||
|
||||
@@ -17315,7 +17321,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
// copy the KV pairs from the input file
|
||||
gguf_set_kv (ctx_out, ml.meta);
|
||||
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
|
||||
gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
|
||||
|
||||
// Remove split metadata
|
||||
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
|
||||
@@ -17370,9 +17375,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
printf("=========================== %s: nothing to do for only_repack option\n", __func__);
|
||||
return;
|
||||
}
|
||||
ftype = repacked_ftype(ftype);
|
||||
ftype = repacked_ftype(model.ftype);
|
||||
printf("===================== Model ftype: %s: Repacked ftype: %s\n", llama_model_ftype_name(model.ftype).c_str(),
|
||||
llama_model_ftype_name(ftype).c_str());
|
||||
}
|
||||
|
||||
gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
|
||||
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
||||
|
||||
// sanity checks
|
||||
|
||||
Reference in New Issue
Block a user