mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 19:31:48 +00:00
Be able to repack tensors at run time (#147)
* Be able to repack tensors at run time
* Repack: also add bf16 as repackable type
* Repack: make sure the number of rows is a multiple of the packing

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -21,6 +21,9 @@
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <atomic>
|
||||
#include <unordered_map>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
@@ -5054,3 +5057,79 @@ void vec_dot_iq2_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
|
||||
GGML_UNUSED(by);
|
||||
}
|
||||
|
||||
namespace {
// Run-time repacking descriptor: maps a quantized ggml type to its
// row-interleaved counterpart and the routine that performs the conversion.
struct Repack {
    // Converts `nrows` rows of `n_per_row` elements from `src` (original
    // layout) into `dst` (interleaved layout). `src` and `dst` must not alias.
    using repack_func = void (*) (int nrows, int n_per_row, const char * src, char * dst);
    ggml_type new_type;   // interleaved type the tensor becomes (e.g. *_R4)
    int num_rows;         // number of rows interleaved per pack (4, 8 or 16)
    repack_func repack;   // conversion routine for one pack of rows
};
}
|
||||
|
||||
void iqk_repack_tensor(struct ggml_tensor * tensor) {
|
||||
constexpr int kChunk = 8;
|
||||
if (!tensor) return;
|
||||
if (!ggml_is_contiguous(tensor)) return;
|
||||
if (strncmp(tensor->name, "token_embd.weight", GGML_MAX_NAME) == 0) return;
|
||||
if (tensor->ne[1] % 4 || tensor->ne[2]*tensor->ne[3] > 1) return;
|
||||
static const std::unordered_map<ggml_type, Repack> k_map = {
|
||||
{ GGML_TYPE_IQ2_K, { GGML_TYPE_IQ2_K_R4, 4, (Repack::repack_func)repack_iq2_k} },
|
||||
{ GGML_TYPE_IQ3_K, { GGML_TYPE_IQ3_K_R4, 4, (Repack::repack_func)repack_iq3_k} },
|
||||
{ GGML_TYPE_IQ4_K, { GGML_TYPE_IQ4_K_R4, 4, (Repack::repack_func)repack_iq4_k} },
|
||||
{ GGML_TYPE_IQ4_XS, { GGML_TYPE_IQ4_XS_R4, 4, (Repack::repack_func)repack_iq4_xs} },
|
||||
{ GGML_TYPE_IQ4_NL, { GGML_TYPE_IQ4_NL_R4, 4, (Repack::repack_func)repack_iq4_nl} },
|
||||
{ GGML_TYPE_IQ2_BN, { GGML_TYPE_IQ2_BN_R4, 4, (Repack::repack_func)repack_iq2_bn} },
|
||||
{ GGML_TYPE_Q2_K, { GGML_TYPE_Q2_K_R4, 4, (Repack::repack_func)repack_q2_k} },
|
||||
{ GGML_TYPE_Q3_K, { GGML_TYPE_Q3_K_R4, 4, (Repack::repack_func)repack_q3_k} },
|
||||
{ GGML_TYPE_Q4_K, { GGML_TYPE_Q4_K_R4, 4, (Repack::repack_func)repack_q4_k} },
|
||||
{ GGML_TYPE_Q5_K, { GGML_TYPE_Q5_K_R4, 4, (Repack::repack_func)repack_q5_k} },
|
||||
{ GGML_TYPE_Q6_K, { GGML_TYPE_Q6_K_R4, 4, (Repack::repack_func)repack_q6_k} },
|
||||
{ GGML_TYPE_Q4_0, { GGML_TYPE_Q4_0_R4, 4, (Repack::repack_func)repack_q4_0} },
|
||||
{ GGML_TYPE_Q5_0, { GGML_TYPE_Q5_0_R4, 4, (Repack::repack_func)repack_q5_0} },
|
||||
{ GGML_TYPE_Q6_0, { GGML_TYPE_Q6_0_R4, 4, (Repack::repack_func)repack_q6_0} },
|
||||
{ GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R4, 4, (Repack::repack_func)repack_q8_0} },
|
||||
{ GGML_TYPE_Q8_K, { GGML_TYPE_Q8_K_R8, 8, (Repack::repack_func)repack_q8_k} },
|
||||
#ifdef __AVX512BF16__
|
||||
{ GGML_TYPE_BF16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_bf16_t>} },
|
||||
#endif
|
||||
};
|
||||
|
||||
auto it = k_map.find(tensor->type);
|
||||
if (it == k_map.end()) return;
|
||||
if (tensor->ne[1] % it->second.num_rows) return;
|
||||
|
||||
auto& r = it->second;
|
||||
|
||||
int max_thread = std::max(1, int(std::thread::hardware_concurrency()/2));
|
||||
int num_chunks = (tensor->ne[1] + kChunk*r.num_rows - 1)/(kChunk*r.num_rows);
|
||||
int nthread = std::min(num_chunks, max_thread);
|
||||
|
||||
//printf("%s(%s): %s -> %s. %d rows, %d chunks, %d threads\n", __func__, tensor->name, ggml_type_name(tensor->type), ggml_type_name(r.new_type),
|
||||
// int(tensor->ne[1]), num_chunks, nthread);
|
||||
|
||||
std::atomic<int> counter(0);;
|
||||
auto compute = [&counter, &r, tensor, num_chunks] () {
|
||||
int nrows = tensor->ne[1];
|
||||
int n_per_row = tensor->ne[0];
|
||||
auto row_size = ggml_row_size(tensor->type, n_per_row);
|
||||
std::vector<char> qtmp(r.num_rows*row_size);
|
||||
auto data = (char *)tensor->data;
|
||||
while (true) {
|
||||
int chunk = counter.fetch_add(1);
|
||||
if (chunk >= num_chunks) break;
|
||||
int first_row = chunk*kChunk*r.num_rows;
|
||||
int last_row = std::min(first_row + kChunk*r.num_rows, nrows);
|
||||
for (int row = first_row; row < last_row; row += r.num_rows) {
|
||||
std::memcpy(qtmp.data(), data + row*row_size, r.num_rows*row_size);
|
||||
r.repack(r.num_rows, n_per_row, qtmp.data(), data + row*row_size);
|
||||
}
|
||||
}
|
||||
};
|
||||
std::vector<std::thread> workers(nthread-1);
|
||||
for (auto& w : workers) w = std::thread(compute);
|
||||
compute();
|
||||
for (auto& w : workers) w.join();
|
||||
|
||||
tensor->type = r.new_type;
|
||||
}
|
||||
|
||||
|
||||
@@ -173,6 +173,8 @@ void quantize_row_q8_KR8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
||||
void repack_f32_bf16_r16 (const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
|
||||
void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row);
|
||||
|
||||
void iqk_repack_tensor(struct ggml_tensor * tensor);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user