Be able to repack tensors at run time (#147)

* Be able to repack tensors at run time

* Repack: also add bf16 as repackable type

* Repack: make sure number of rows is a multiple of the packing

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2024-12-17 14:16:34 +01:00
committed by GitHub
parent 4ade4c568c
commit 514ae08620
8 changed files with 146 additions and 6 deletions

View File

@@ -21,6 +21,9 @@
#include <algorithm>
#include <cstring>
#include <mutex>
#include <thread>
#include <atomic>
#include <unordered_map>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -5054,3 +5057,79 @@ void vec_dot_iq2_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
GGML_UNUSED(by);
}
namespace {
// Describes how one ggml quantization type is converted to its interleaved
// ("repacked") counterpart. NOTE: member order matters — instances of this
// struct are aggregate-initialized as { new_type, num_rows, repack } in the
// type map inside iqk_repack_tensor().
struct Repack {
// Converts nrows x n_per_row elements from src (row-major layout) into dst
// (interleaved layout). src and dst must not alias.
using repack_func = void (*) (int nrows, int n_per_row, const char * src, char * dst);
ggml_type new_type; // interleaved type the tensor will have after repacking (e.g. *_R4)
int num_rows; // how many consecutive rows get interleaved together (4, 8, or 16)
repack_func repack; // the conversion routine for this type pair
};
}
// Repacks a tensor's data in place into the corresponding interleaved-row
// type (e.g. Q4_K -> Q4_K_R4) and updates tensor->type accordingly.
// Work is split into chunks of kChunk row groups and distributed over up to
// half the hardware threads.
// Silently does nothing when repacking is not possible or not useful:
//  - null or non-contiguous tensor
//  - the token embedding tensor (used for row lookups, not matmuls)
//  - tensors with more than 2 dimensions
//  - row counts that are not a multiple of 4 / of the target packing
//  - types that have no repacked counterpart in the map below
void iqk_repack_tensor(struct ggml_tensor * tensor) {
    constexpr int kChunk = 8; // row groups per unit of work handed to a thread
    if (!tensor) return;
    if (!ggml_is_contiguous(tensor)) return;
    if (strncmp(tensor->name, "token_embd.weight", GGML_MAX_NAME) == 0) return;
    if (tensor->ne[1] % 4 || tensor->ne[2]*tensor->ne[3] > 1) return;
    // Maps each repackable quantization type to its interleaved counterpart.
    static const std::unordered_map<ggml_type, Repack> k_map = {
        { GGML_TYPE_IQ2_K,  { GGML_TYPE_IQ2_K_R4,  4,  (Repack::repack_func)repack_iq2_k}  },
        { GGML_TYPE_IQ3_K,  { GGML_TYPE_IQ3_K_R4,  4,  (Repack::repack_func)repack_iq3_k}  },
        { GGML_TYPE_IQ4_K,  { GGML_TYPE_IQ4_K_R4,  4,  (Repack::repack_func)repack_iq4_k}  },
        { GGML_TYPE_IQ4_XS, { GGML_TYPE_IQ4_XS_R4, 4,  (Repack::repack_func)repack_iq4_xs} },
        { GGML_TYPE_IQ4_NL, { GGML_TYPE_IQ4_NL_R4, 4,  (Repack::repack_func)repack_iq4_nl} },
        { GGML_TYPE_IQ2_BN, { GGML_TYPE_IQ2_BN_R4, 4,  (Repack::repack_func)repack_iq2_bn} },
        { GGML_TYPE_Q2_K,   { GGML_TYPE_Q2_K_R4,   4,  (Repack::repack_func)repack_q2_k}   },
        { GGML_TYPE_Q3_K,   { GGML_TYPE_Q3_K_R4,   4,  (Repack::repack_func)repack_q3_k}   },
        { GGML_TYPE_Q4_K,   { GGML_TYPE_Q4_K_R4,   4,  (Repack::repack_func)repack_q4_k}   },
        { GGML_TYPE_Q5_K,   { GGML_TYPE_Q5_K_R4,   4,  (Repack::repack_func)repack_q5_k}   },
        { GGML_TYPE_Q6_K,   { GGML_TYPE_Q6_K_R4,   4,  (Repack::repack_func)repack_q6_k}   },
        { GGML_TYPE_Q4_0,   { GGML_TYPE_Q4_0_R4,   4,  (Repack::repack_func)repack_q4_0}   },
        { GGML_TYPE_Q5_0,   { GGML_TYPE_Q5_0_R4,   4,  (Repack::repack_func)repack_q5_0}   },
        { GGML_TYPE_Q6_0,   { GGML_TYPE_Q6_0_R4,   4,  (Repack::repack_func)repack_q6_0}   },
        { GGML_TYPE_Q8_0,   { GGML_TYPE_Q8_0_R4,   4,  (Repack::repack_func)repack_q8_0}   },
        { GGML_TYPE_Q8_K,   { GGML_TYPE_Q8_K_R8,   8,  (Repack::repack_func)repack_q8_k}   },
#ifdef __AVX512BF16__
        { GGML_TYPE_BF16,   { GGML_TYPE_BF16_R16,  16, (Repack::repack_func)repack_bf16<ggml_bf16_t>} },
#endif
    };
    auto it = k_map.find(tensor->type);
    if (it == k_map.end()) return;
    const auto& r = it->second;
    // The packed layout interleaves r.num_rows rows; a partial group cannot be formed.
    if (tensor->ne[1] % r.num_rows) return;
    // hardware_concurrency() may return 0, hence the max with 1.
    int max_thread = std::max(1, int(std::thread::hardware_concurrency()/2));
    int num_chunks = (tensor->ne[1] + kChunk*r.num_rows - 1)/(kChunk*r.num_rows);
    int nthread = std::min(num_chunks, max_thread);
    std::atomic<int> counter(0); // index of the next chunk to be claimed
    auto compute = [&counter, &r, tensor, num_chunks] () {
        int nrows = tensor->ne[1];
        int n_per_row = tensor->ne[0];
        auto row_size = ggml_row_size(tensor->type, n_per_row);
        // Repacking happens in place, so each row group is first copied to this
        // scratch buffer; the packed result is then written back over the source.
        std::vector<char> qtmp(r.num_rows*row_size);
        auto data = (char *)tensor->data;
        while (true) {
            int chunk = counter.fetch_add(1);
            if (chunk >= num_chunks) break;
            int first_row = chunk*kChunk*r.num_rows;
            int last_row = std::min(first_row + kChunk*r.num_rows, nrows);
            for (int row = first_row; row < last_row; row += r.num_rows) {
                std::memcpy(qtmp.data(), data + row*row_size, r.num_rows*row_size);
                r.repack(r.num_rows, n_per_row, qtmp.data(), data + row*row_size);
            }
        }
    };
    // The calling thread participates too, so only nthread-1 extra workers are spawned.
    std::vector<std::thread> workers(nthread-1);
    for (auto& w : workers) w = std::thread(compute);
    compute();
    for (auto& w : workers) w.join();
    // Only after all rows are converted is the type flipped to the packed variant.
    tensor->type = r.new_type;
}

View File

@@ -173,6 +173,8 @@ void quantize_row_q8_KR8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
// Repack nrows x n_per_row f32 values from src into the interleaved bf16_r16
// layout in dst (presumably 16-row interleaving, per the _r16 suffix — see the
// implementation for the exact layout).
void repack_f32_bf16_r16 (const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
// Same as above, but the source values are already bf16.
void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
// In-place repack of a whole tensor to its interleaved-row counterpart; updates
// tensor->type on success, no-op when the tensor is not repackable.
void iqk_repack_tensor(struct ggml_tensor * tensor);
#ifdef __cplusplus
}
#endif