mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-09 16:00:12 +00:00
CUDA: muh faster prompt processing for MoE models and small u-batch sizes (#728)
* WIP: adding mainline mmq_id implementation * This seems to work * Now also -fmoe works * WIP * WIP * WIP * This works for mainline supported quants * mmq_id: add iq2_k, iq2_k_r4 * mmiq_id: don't assume row size is multiple of type size (per row scales) * mmiq_id: don't assume row size is multiple of type size * mmq_id: add iq2_ks So we are sure it works with per row scales * mmq_id: add iq2_kl * mmq_id: add iq3_ks * mmq_id: adding iq3_k, iq3_k_r4 * mmq_id: add iq4_kss, iq4_ks, iq4_ks_r4 * mmq_id: adding iq4_k, iq4_k_r4 * mmq_id: adding iq5_ks, iq5_ks_r4 * mmq_id: adding iq5_k, iq5_k_r4, q6_0 * mmq_id: adding iq6_k * mmq_id: add iq1_s_r4 * mmq_id: adding iq1_kt, iq2_kt * mmq_id: add iq3_kt, iq4_kt * Add CUDA fp8 header --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -39,6 +39,8 @@
|
||||
#include "ggml-cuda/conv-transpose-1d.cuh"
|
||||
#include "ggml-cuda/add-id.cuh"
|
||||
#include "ggml-cuda/graph.cuh"
|
||||
#include "ggml-cuda/mmq_id.cuh"
|
||||
#include "ggml-cuda/quantize_id.cuh"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
@@ -2316,7 +2318,7 @@ static inline bool prepare_row_mappigs(ggml_backend_cuda_context& ctx, int64_t n
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(dev_row_mapping.get(), rmapping.data(),
|
||||
cum_moe_counts[n_as]*sizeof(mmid_row_mapping), cudaMemcpyHostToDevice, stream));
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
//CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
return is_ser;
|
||||
}
|
||||
@@ -2392,6 +2394,11 @@ static bool ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
}
|
||||
}
|
||||
|
||||
if (ggml_is_quantized(src0->type) && ggml_cuda_can_use_mmq_id(src0->type, ggml_cuda_info().devices[ctx.device].cc, src1->ne[2])) {
|
||||
ggml_cuda_mul_mat_q_id(ctx, src0, src1, ids, dst, nullptr, nullptr);
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
|
||||
@@ -2662,16 +2669,85 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
}
|
||||
}
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_1->buffer) && "mul_mat_id does not support split buffers");
|
||||
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_2->buffer) && "mul_mat_id does not support split buffers");
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int64_t n_as = ne02;
|
||||
const int64_t n_ids = ids->ne[0];
|
||||
|
||||
ggml_tensor dst_row = *dst;
|
||||
|
||||
if (src1->ne[2] <= 2048 && // TODO: this depends on number of total vs number of active experts -> need to find optimum threshod
|
||||
ggml_is_quantized(src0_1->type) && src0_1->type == src0_2->type && src1->ne[1] == 1 && src1->ne[3] == 1 &&
|
||||
ggml_cuda_can_use_mmq_id(src0_1->type, ggml_cuda_info().devices[ctx.device].cc, src1->ne[2])) {
|
||||
|
||||
const int64_t ne_get_rows = ne12 * n_ids;
|
||||
ggml_cuda_pool_alloc<int32_t> ids_device(ctx.pool(), ne_get_rows + ne_get_rows + n_as + 1);
|
||||
auto ids_src1 = ids_device.get();
|
||||
auto ids_dst = ids_src1 + ne_get_rows;
|
||||
auto expert_bounds = ids_dst + ne_get_rows;
|
||||
|
||||
compute_row_ids((const int32_t *)ids->data, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_ids, ne11, nb11, nb12, ids->nb[1], stream);
|
||||
|
||||
const int64_t ne11_flat = ne12*n_ids;
|
||||
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
size_t nbytes_src1_q8_1 = ne11_flat*ne10_padded * sizeof(block_q8_1)/QK8_1 +
|
||||
get_mmq_x_max_host(ggml_cuda_info().devices[ctx.device].cc)*sizeof(block_q8_1_mmq);
|
||||
ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool(), nbytes_src1_q8_1);
|
||||
|
||||
size_t ts_src1 = ggml_type_size(src1->type);
|
||||
quantize_mmq_q8_1_cuda_id((const float *)src1->data, ids_src1, src1_quantized.get(),
|
||||
src0_1->type, ne10, src1->nb[1] / ts_src1, src1->nb[2] / ts_src1, src1->nb[2] / ts_src1,
|
||||
ne10_padded, ne11_flat, 1, 1, stream);
|
||||
|
||||
ggml_cuda_pool_alloc<char> dst_up_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
|
||||
ggml_cuda_pool_alloc<char> dst_gate_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
|
||||
|
||||
dst_row.data = dst_up_contiguous.get();
|
||||
ggml_cuda_mul_mat_q_id(ctx, src0_1, src1, ids, &dst_row, (char *)ids_device.get(), src1_quantized.get());
|
||||
if (dst->src[4]) {
|
||||
ggml_cuda_add_id((const float *)dst_row.data, (const float *)dst->src[4]->data, (const int32_t *)ids->data,
|
||||
(float *)dst_row.data, dst_row.ne[0], dst_row.ne[1], dst_row.ne[2], dst_row.ne[0], dst_row.ne[1],
|
||||
dst_row.nb[1], dst_row.nb[2], dst->src[4]->nb[1], ids->nb[1], stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
dst_row.data = dst_gate_contiguous.get();
|
||||
ggml_cuda_mul_mat_q_id(ctx, src0_2, src1, ids, &dst_row, (char *)ids_device.get(), src1_quantized.get());
|
||||
if (dst->src[5]) {
|
||||
ggml_cuda_add_id((const float *)dst_row.data, (const float *)dst->src[5]->data, (const int32_t *)ids->data,
|
||||
(float *)dst_row.data, dst_row.ne[0], dst_row.ne[1], dst_row.ne[2], dst_row.ne[0], dst_row.ne[1],
|
||||
dst_row.nb[1], dst_row.nb[2], dst->src[4]->nb[1], ids->nb[1], stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
auto unary_op = (ggml_unary_op)dst->op_params[0];
|
||||
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
|
||||
ggml_swiglu_oai_cuda_f32((const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
|
||||
(float *)dst->data, ggml_nelements(dst), dst_row.ne[0], dst_row.ne[0], dst_row.ne[0],
|
||||
1.702f, 7.0f, stream);
|
||||
} else {
|
||||
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row),
|
||||
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
|
||||
(float *)dst->data);
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (next && next->op == GGML_OP_MUL_MAT_ID && ggml_is_quantized(next->src[0]->type) &&
|
||||
ggml_cuda_should_use_mmq(next->src[0]->type, ggml_cuda_info().devices[ctx.device].cc, src1->ne[2])) {
|
||||
//ggml_cuda_mul_mat_q_id(ctx, next->src[0], dst, ids, next, (char *)ids_device.get(), nullptr);
|
||||
ggml_cuda_mul_mat_q_id(ctx, next->src[0], dst, ids, next, nullptr, nullptr);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<char> ids_host(ggml_nbytes(ids));
|
||||
const char * ids_dev = (const char *) ids->data;
|
||||
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
|
||||
@@ -2680,7 +2756,6 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
ggml_tensor src0_1_row = *src0_1;
|
||||
ggml_tensor src0_2_row = *src0_2;
|
||||
ggml_tensor src1_row = *src1;
|
||||
ggml_tensor dst_row = *dst;
|
||||
ggml_tensor final_dst;
|
||||
ggml_tensor final_src;
|
||||
|
||||
@@ -2723,222 +2798,170 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
final_src.nb[3] = final_src.nb[2];
|
||||
}
|
||||
|
||||
if (false && ne12 == 1) {
|
||||
ggml_cuda_pool_alloc<char> dst_up_contiguous(ctx.pool(), sizeof(float)*dst_row.ne[0]);
|
||||
ggml_cuda_pool_alloc<char> dst_gate_contiguous(ctx.pool(), sizeof(float)*dst_row.ne[0]);
|
||||
if (fuse_down) {
|
||||
final_dst.src[1] = &dst_row;
|
||||
ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool());
|
||||
bool use_quantized_src1 = false;
|
||||
int64_t src1_padded_num_cols = 0, src1_padded_row_size = 0, src1_quantized_size = 0;
|
||||
if (ggml_is_quantized(src0_1->type) && src0_1->type == src0_2->type && src1->ne[1] == 1 && src1->ne[3] == 1) {
|
||||
if (ggml_cuda_should_use_mmq(src0_1->type, ggml_cuda_info().devices[ctx.device].cc, src1->ne[2])) {
|
||||
src1_padded_num_cols = GGML_PAD(src1->ne[0], MATRIX_ROW_PADDING);
|
||||
src1_padded_row_size = src1_padded_num_cols/ggml_blck_size(GGML_TYPE_Q8_1)*ggml_type_size(GGML_TYPE_Q8_1);
|
||||
src1_quantized_size = src1_padded_row_size*src1->ne[2] + get_mmq_x_max_host(ggml_cuda_info().devices[ctx.device].cc)*sizeof(block_q8_1_mmq);
|
||||
src1_quantized.alloc(src1_quantized_size);
|
||||
use_quantized_src1 = true;
|
||||
}
|
||||
for (int64_t id = 0; id < n_ids; id++) {
|
||||
const int32_t i02 = *(const int32_t *) (ids_host.data() + id*ids->nb[0]);
|
||||
}
|
||||
ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool());
|
||||
if (!use_quantized_src1) {
|
||||
src1_contiguous.alloc(sizeof(float)*ggml_nelements(src1));
|
||||
}
|
||||
ggml_cuda_pool_alloc<char> dst_up_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
|
||||
ggml_cuda_pool_alloc<char> dst_gate_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
|
||||
ggml_cuda_pool_alloc<char> final_dst_contiguous(ctx.pool());
|
||||
if (fuse_down) {
|
||||
final_dst.data = final_dst_contiguous.alloc(ggml_nelements(next));
|
||||
final_dst.src[1] = &dst_row;
|
||||
}
|
||||
|
||||
if (i02 < 0 || i02 >= n_as) continue;
|
||||
//GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
||||
src1_row.data = src1_contiguous.get();
|
||||
|
||||
const int64_t i11 = id % ne11;
|
||||
const int64_t i12 = 0;
|
||||
bool first = false; //true;
|
||||
|
||||
const int64_t i1 = id;
|
||||
const int64_t i2 = i12;
|
||||
ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool());
|
||||
std::vector<int> moe_counts, cum_moe_counts;
|
||||
|
||||
src0_1_row.data = src0_1_original + i02*nb02;
|
||||
src0_2_row.data = src0_2_original + i02*nb02;
|
||||
src1_row.data = src1_original + i11*nb11 + i12*nb12;
|
||||
//dst_row.data = dst_original + i1*nb1 + i2*nb2;
|
||||
bool is_ser = prepare_row_mappigs(ctx, n_as, n_ids, ids, moe_counts, cum_moe_counts, dev_row_mapping);
|
||||
if (is_ser) {
|
||||
if (fuse_down) {
|
||||
CUDA_CHECK(cudaMemsetAsync(next->data, 0, ggml_nbytes(next), stream));
|
||||
} else {
|
||||
CUDA_CHECK(cudaMemsetAsync(dst->data, 0, ggml_nbytes(dst), stream));
|
||||
}
|
||||
}
|
||||
|
||||
dst_row.data = dst_up_contiguous.get();
|
||||
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
||||
int64_t num_src1_rows = moe_counts[i02];
|
||||
|
||||
if (num_src1_rows == 0) continue;
|
||||
size_t mapping_offset = cum_moe_counts[i02];
|
||||
|
||||
if (use_quantized_src1) {
|
||||
quantize_mmq_q8_1_id_cuda((const float *)src1->data, src1_quantized.get(), (const char *)(dev_row_mapping.get() + mapping_offset),
|
||||
src1->ne[0], num_src1_rows, src1_padded_num_cols, src0_1->type, stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
src1_row.data = src1_quantized.get();
|
||||
}
|
||||
else {
|
||||
dim3 block_dims(std::min((unsigned int)ne10, 768u));
|
||||
dim3 grid_dims(num_src1_rows);
|
||||
k_copy_src_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
|
||||
src1_original, src1_contiguous.get(), dev_row_mapping.get() + mapping_offset, ne10, ne11, nb11, nb12);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
src1_row.data = src1_contiguous.get();
|
||||
}
|
||||
|
||||
src0_1_row.data = src0_1_original + i02*nb02;
|
||||
src0_2_row.data = src0_2_original + i02*nb02;
|
||||
|
||||
GGML_ASSERT(nb11 == sizeof(float)*ne10);
|
||||
GGML_ASSERT(nb1 == sizeof(float)*ne0);
|
||||
|
||||
src1_row.ne[1] = num_src1_rows;
|
||||
src1_row.nb[1] = use_quantized_src1 ? src1_padded_row_size : nb11;
|
||||
src1_row.nb[2] = num_src1_rows*src1_row.nb[1];
|
||||
src1_row.nb[3] = num_src1_rows*src1_row.nb[1];
|
||||
|
||||
dst_row.ne[1] = num_src1_rows;
|
||||
dst_row.nb[1] = nb1;
|
||||
dst_row.nb[2] = num_src1_rows*nb1;
|
||||
dst_row.nb[3] = num_src1_rows*nb1;
|
||||
|
||||
dst_row.data = dst_up_contiguous.get();
|
||||
if (use_quantized_src1) {
|
||||
ggml_cuda_op_mul_mat_q(ctx, &src0_1_row, &src1_row, &dst_row, (const char *)src0_1_row.data, nullptr, src1_quantized.get(), (float *)dst_row.data,
|
||||
0, src0_1_row.ne[1], num_src1_rows, src1_padded_num_cols, stream);
|
||||
} else {
|
||||
ggml_cuda_mul_mat(ctx, &src0_1_row, &src1_row, &dst_row);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
dst_row.data = dst_gate_contiguous.get();
|
||||
if (dst->src[4]) {
|
||||
dim3 block_dims(std::min(uint32_t(dst_row.ne[0]), 768u));
|
||||
dim3 grid_dims(num_src1_rows);
|
||||
k_quick_add<<<grid_dims, block_dims, 0, stream>>>(dst_row.ne[0], (const float *)dst_row.data,
|
||||
(const float *)((const char *)dst->src[4]->data + i02*dst->src[4]->nb[1]), (float *)dst_row.data);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
dst_row.data = dst_gate_contiguous.get();
|
||||
if (use_quantized_src1) {
|
||||
ggml_cuda_op_mul_mat_q(ctx, &src0_2_row, &src1_row, &dst_row, (const char *)src0_2_row.data, nullptr, src1_quantized.get(), (float *)dst_row.data,
|
||||
0, src0_2_row.ne[1], num_src1_rows, src1_padded_num_cols, stream);
|
||||
} else {
|
||||
ggml_cuda_mul_mat(ctx, &src0_2_row, &src1_row, &dst_row);
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (dst->src[5]) {
|
||||
dim3 block_dims(std::min(uint32_t(dst_row.ne[0]), 768u));
|
||||
dim3 grid_dims(num_src1_rows);
|
||||
k_quick_add<<<grid_dims, block_dims, 0, stream>>>(dst_row.ne[0], (const float *)dst_row.data,
|
||||
(const float *)((const char *)dst->src[5]->data + i02*dst->src[5]->nb[1]), (float *)dst_row.data);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (fuse_down) {
|
||||
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], dst_row.ne[0],
|
||||
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(), (float *)dst_gate_contiguous.get());
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
final_src.data = (char *)next->src[0]->data + i02*next->src[0]->nb[2];
|
||||
final_dst.data = (char *)next->data + i1*next->nb[1] + i2*next->nb[2];
|
||||
ggml_cuda_mul_mat(ctx, &final_src, &dst_row, &final_dst);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
} else {
|
||||
|
||||
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], dst_row.ne[0],
|
||||
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(), (float *)(dst_original + i1*nb1 + i2*nb2));
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//printf("ne10 = %ld, ne11 = %ld, ne12 = %ld, nb10 = %zu nb11 = %zu nb12 = %zu\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], src1->nb[1], src1->nb[2]);
|
||||
ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool());
|
||||
bool use_quantized_src1 = false;
|
||||
int64_t src1_padded_num_cols = 0, src1_padded_row_size = 0, src1_quantized_size = 0;
|
||||
if (ggml_is_quantized(src0_1->type) && src0_1->type == src0_2->type && src1->ne[1] == 1 && src1->ne[3] == 1) {
|
||||
if (ggml_cuda_should_use_mmq(src0_1->type, ggml_cuda_info().devices[ctx.device].cc, src1->ne[2])) {
|
||||
src1_padded_num_cols = GGML_PAD(src1->ne[0], MATRIX_ROW_PADDING);
|
||||
src1_padded_row_size = src1_padded_num_cols/ggml_blck_size(GGML_TYPE_Q8_1)*ggml_type_size(GGML_TYPE_Q8_1);
|
||||
src1_quantized_size = src1_padded_row_size*src1->ne[2] + get_mmq_x_max_host(ggml_cuda_info().devices[ctx.device].cc)*sizeof(block_q8_1_mmq);
|
||||
src1_quantized.alloc(src1_quantized_size);
|
||||
use_quantized_src1 = true;
|
||||
}
|
||||
|
||||
auto unary_op = (ggml_unary_op)dst->op_params[0];
|
||||
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
|
||||
ggml_swiglu_oai_cuda_f32((const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
|
||||
(float *)dst_gate_contiguous.get(), ggml_nelements(&dst_row), dst_row.ne[0], dst_row.ne[0], dst_row.ne[0],
|
||||
1.702f, 7.0f, stream);
|
||||
} else {
|
||||
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row),
|
||||
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
|
||||
(float *)dst_gate_contiguous.get());
|
||||
}
|
||||
ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool());
|
||||
if (!use_quantized_src1) {
|
||||
src1_contiguous.alloc(sizeof(float)*ggml_nelements(src1));
|
||||
}
|
||||
ggml_cuda_pool_alloc<char> dst_up_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
|
||||
ggml_cuda_pool_alloc<char> dst_gate_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
|
||||
ggml_cuda_pool_alloc<char> final_dst_contiguous(ctx.pool());
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (fuse_down) {
|
||||
final_dst.data = final_dst_contiguous.alloc(ggml_nelements(next));
|
||||
final_dst.src[1] = &dst_row;
|
||||
|
||||
final_dst.ne[1] = num_src1_rows;
|
||||
final_dst.nb[1] = final_dst.ne[0]*sizeof(float);
|
||||
final_dst.nb[2] = final_dst.nb[3] = num_src1_rows*final_dst.nb[1];
|
||||
final_src.data = (char *)next->src[0]->data + i02*next->src[0]->nb[2];
|
||||
if (first) {
|
||||
printf("Fusing down for %d rows: (%d x %d x %d x %d) = (%d x %d x %d x %d) * (%d x %d x %d x %d)\n", (int)num_src1_rows,
|
||||
(int)next->ne[0], (int)next->ne[1], (int)next->ne[2], (int)next->ne[3],
|
||||
(int)next->src[0]->ne[0], (int)next->src[0]->ne[1], (int)next->src[0]->ne[2], (int)next->src[0]->ne[3],
|
||||
(int)next->src[1]->ne[0], (int)next->src[1]->ne[1], (int)next->src[1]->ne[2], (int)next->src[1]->ne[3]);
|
||||
printf(" using (%d x %d x %d x %d) = (%d x %d x %d x %d) * (%d x %d x %d x %d)\n",
|
||||
(int)final_dst.ne[0], (int)final_dst.ne[1], (int)final_dst.ne[2], (int)final_dst.ne[3],
|
||||
(int)final_src.ne[0], (int)final_src.ne[1], (int)final_src.ne[2], (int)final_src.ne[3],
|
||||
(int)dst_row.ne[0], (int)dst_row.ne[1], (int)dst_row.ne[2], (int)dst_row.ne[3]);
|
||||
first = false;
|
||||
}
|
||||
ggml_cuda_mul_mat(ctx, &final_src, &dst_row, &final_dst);
|
||||
//ggml_cuda_mul_mat(ctx, next->src[0], &dst_row, &final_dst);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
dim3 block_dims(std::min((unsigned int)next->ne[0], 768u));
|
||||
dim3 grid_dims(num_src1_rows);
|
||||
k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
|
||||
(char *)next->data, final_dst_contiguous.get(),
|
||||
dev_row_mapping.get() + mapping_offset,
|
||||
next->ne[0],
|
||||
next->nb[1], next->nb[2]);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
}
|
||||
else {
|
||||
|
||||
src1_row.data = src1_contiguous.get();
|
||||
|
||||
bool first = false; //true;
|
||||
|
||||
ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool());
|
||||
std::vector<int> moe_counts, cum_moe_counts;
|
||||
|
||||
bool is_ser = prepare_row_mappigs(ctx, n_as, n_ids, ids, moe_counts, cum_moe_counts, dev_row_mapping);
|
||||
if (is_ser) {
|
||||
if (fuse_down) {
|
||||
CUDA_CHECK(cudaMemsetAsync(next->data, 0, ggml_nbytes(next), stream));
|
||||
} else {
|
||||
CUDA_CHECK(cudaMemsetAsync(dst->data, 0, ggml_nbytes(dst), stream));
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
||||
int64_t num_src1_rows = moe_counts[i02];
|
||||
|
||||
if (num_src1_rows == 0) continue;
|
||||
size_t mapping_offset = cum_moe_counts[i02];
|
||||
|
||||
if (use_quantized_src1) {
|
||||
quantize_mmq_q8_1_id_cuda((const float *)src1->data, src1_quantized.get(), (const char *)(dev_row_mapping.get() + mapping_offset),
|
||||
src1->ne[0], num_src1_rows, src1_padded_num_cols, src0_1->type, stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
src1_row.data = src1_quantized.get();
|
||||
}
|
||||
else {
|
||||
dim3 block_dims(std::min((unsigned int)ne10, 768u));
|
||||
dim3 grid_dims(num_src1_rows);
|
||||
k_copy_src_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
|
||||
src1_original, src1_contiguous.get(), dev_row_mapping.get() + mapping_offset, ne10, ne11, nb11, nb12);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
src1_row.data = src1_contiguous.get();
|
||||
}
|
||||
|
||||
src0_1_row.data = src0_1_original + i02*nb02;
|
||||
src0_2_row.data = src0_2_original + i02*nb02;
|
||||
|
||||
GGML_ASSERT(nb11 == sizeof(float)*ne10);
|
||||
GGML_ASSERT(nb1 == sizeof(float)*ne0);
|
||||
|
||||
src1_row.ne[1] = num_src1_rows;
|
||||
src1_row.nb[1] = use_quantized_src1 ? src1_padded_row_size : nb11;
|
||||
src1_row.nb[2] = num_src1_rows*src1_row.nb[1];
|
||||
src1_row.nb[3] = num_src1_rows*src1_row.nb[1];
|
||||
|
||||
dst_row.ne[1] = num_src1_rows;
|
||||
dst_row.nb[1] = nb1;
|
||||
dst_row.nb[2] = num_src1_rows*nb1;
|
||||
dst_row.nb[3] = num_src1_rows*nb1;
|
||||
|
||||
dst_row.data = dst_up_contiguous.get();
|
||||
if (use_quantized_src1) {
|
||||
ggml_cuda_op_mul_mat_q(ctx, &src0_1_row, &src1_row, &dst_row, (const char *)src0_1_row.data, nullptr, src1_quantized.get(), (float *)dst_row.data,
|
||||
0, src0_1_row.ne[1], num_src1_rows, src1_padded_num_cols, stream);
|
||||
} else {
|
||||
ggml_cuda_mul_mat(ctx, &src0_1_row, &src1_row, &dst_row);
|
||||
}
|
||||
dim3 block_dims(std::min((unsigned int)ne0, 768u));
|
||||
dim3 grid_dims(num_src1_rows);
|
||||
k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
|
||||
dst_original, dst_gate_contiguous.get(),
|
||||
dev_row_mapping.get() + mapping_offset,
|
||||
ne0,
|
||||
nb1, nb2);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (dst->src[4]) {
|
||||
dim3 block_dims(std::min(uint32_t(dst_row.ne[0]), 768u));
|
||||
dim3 grid_dims(num_src1_rows);
|
||||
k_quick_add<<<grid_dims, block_dims, 0, stream>>>(dst_row.ne[0], (const float *)dst_row.data,
|
||||
(const float *)((const char *)dst->src[4]->data + i02*dst->src[4]->nb[1]), (float *)dst_row.data);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
dst_row.data = dst_gate_contiguous.get();
|
||||
if (use_quantized_src1) {
|
||||
ggml_cuda_op_mul_mat_q(ctx, &src0_2_row, &src1_row, &dst_row, (const char *)src0_2_row.data, nullptr, src1_quantized.get(), (float *)dst_row.data,
|
||||
0, src0_2_row.ne[1], num_src1_rows, src1_padded_num_cols, stream);
|
||||
} else {
|
||||
ggml_cuda_mul_mat(ctx, &src0_2_row, &src1_row, &dst_row);
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (dst->src[5]) {
|
||||
dim3 block_dims(std::min(uint32_t(dst_row.ne[0]), 768u));
|
||||
dim3 grid_dims(num_src1_rows);
|
||||
k_quick_add<<<grid_dims, block_dims, 0, stream>>>(dst_row.ne[0], (const float *)dst_row.data,
|
||||
(const float *)((const char *)dst->src[5]->data + i02*dst->src[5]->nb[1]), (float *)dst_row.data);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
auto unary_op = (ggml_unary_op)dst->op_params[0];
|
||||
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
|
||||
ggml_swiglu_oai_cuda_f32((const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
|
||||
(float *)dst_gate_contiguous.get(), ggml_nelements(&dst_row), dst_row.ne[0], dst_row.ne[0], dst_row.ne[0],
|
||||
1.702f, 7.0f, stream);
|
||||
} else {
|
||||
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row),
|
||||
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
|
||||
(float *)dst_gate_contiguous.get());
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (fuse_down) {
|
||||
|
||||
final_dst.ne[1] = num_src1_rows;
|
||||
final_dst.nb[1] = final_dst.ne[0]*sizeof(float);
|
||||
final_dst.nb[2] = final_dst.nb[3] = num_src1_rows*final_dst.nb[1];
|
||||
final_src.data = (char *)next->src[0]->data + i02*next->src[0]->nb[2];
|
||||
if (first) {
|
||||
printf("Fusing down for %d rows: (%d x %d x %d x %d) = (%d x %d x %d x %d) * (%d x %d x %d x %d)\n", (int)num_src1_rows,
|
||||
(int)next->ne[0], (int)next->ne[1], (int)next->ne[2], (int)next->ne[3],
|
||||
(int)next->src[0]->ne[0], (int)next->src[0]->ne[1], (int)next->src[0]->ne[2], (int)next->src[0]->ne[3],
|
||||
(int)next->src[1]->ne[0], (int)next->src[1]->ne[1], (int)next->src[1]->ne[2], (int)next->src[1]->ne[3]);
|
||||
printf(" using (%d x %d x %d x %d) = (%d x %d x %d x %d) * (%d x %d x %d x %d)\n",
|
||||
(int)final_dst.ne[0], (int)final_dst.ne[1], (int)final_dst.ne[2], (int)final_dst.ne[3],
|
||||
(int)final_src.ne[0], (int)final_src.ne[1], (int)final_src.ne[2], (int)final_src.ne[3],
|
||||
(int)dst_row.ne[0], (int)dst_row.ne[1], (int)dst_row.ne[2], (int)dst_row.ne[3]);
|
||||
first = false;
|
||||
}
|
||||
ggml_cuda_mul_mat(ctx, &final_src, &dst_row, &final_dst);
|
||||
//ggml_cuda_mul_mat(ctx, next->src[0], &dst_row, &final_dst);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
dim3 block_dims(std::min((unsigned int)next->ne[0], 768u));
|
||||
dim3 grid_dims(num_src1_rows);
|
||||
k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
|
||||
(char *)next->data, final_dst_contiguous.get(),
|
||||
dev_row_mapping.get() + mapping_offset,
|
||||
next->ne[0],
|
||||
next->nb[1], next->nb[2]);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
}
|
||||
else {
|
||||
|
||||
dim3 block_dims(std::min((unsigned int)ne0, 768u));
|
||||
dim3 grid_dims(num_src1_rows);
|
||||
k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
|
||||
dst_original, dst_gate_contiguous.get(),
|
||||
dev_row_mapping.get() + mapping_offset,
|
||||
ne0,
|
||||
nb1, nb2);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
564
ggml/src/ggml-cuda/mmq_id.cu
Normal file
564
ggml/src/ggml-cuda/mmq_id.cu
Normal file
@@ -0,0 +1,564 @@
|
||||
#include "mmq_id_common.cuh"
|
||||
#include "mmq_id.cuh"
|
||||
#include "quantize_id.cuh"
|
||||
|
||||
#include <vector>
|
||||
#include <climits>
|
||||
#include <cstdint>
|
||||
|
||||
// To reduce shared memory use, store "it" and "iex_used" with 22/10 bits each.
|
||||
struct mmq_ids_helper_store {
|
||||
uint32_t data;
|
||||
|
||||
__device__ mmq_ids_helper_store(const uint32_t it, const uint32_t iex_used) {
|
||||
data = (it & 0x003FFFFF) | (iex_used << 22);
|
||||
}
|
||||
|
||||
__device__ uint32_t it() const {
|
||||
return data & 0x003FFFFF;
|
||||
}
|
||||
|
||||
__device__ uint32_t iex_used() const {
|
||||
return data >> 22;
|
||||
}
|
||||
};
|
||||
static_assert(sizeof(mmq_ids_helper_store) == 4, "unexpected size for mmq_ids_helper_store");
|
||||
|
||||
// Helper function for mul_mat_id, converts ids to a more convenient format.
|
||||
// ids_src1 describes how to permute the flattened column indices of src1 in order to get a compact src1 tensor sorted by expert.
|
||||
// ids_dst describes the same mapping but for the dst tensor.
|
||||
// The upper and lower bounds for the ith expert in the compact src1 tensor are stored in expert_bounds[i:i+1].
|
||||
template <int n_expert_used_template>
|
||||
__launch_bounds__(ggml_cuda_get_physical_warp_size(), 1)
|
||||
static __global__ void mmq_ids_helper(
|
||||
const int32_t * __restrict__ ids, int32_t * __restrict__ ids_src1, int32_t * __restrict__ ids_dst, int32_t * __restrict__ expert_bounds,
|
||||
const int n_tokens, const int n_expert_used_var, const int nchannels_y, const int si1, const int sis1) {
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
const int n_expert_used = n_expert_used_template == 0 ? n_expert_used_var : n_expert_used_template;
|
||||
const int expert = blockIdx.x;
|
||||
|
||||
extern __shared__ char data_mmq_ids_helper[];
|
||||
mmq_ids_helper_store * store = (mmq_ids_helper_store *) data_mmq_ids_helper;
|
||||
|
||||
int nex_prev = 0; // Number of columns for experts with a lower index.
|
||||
int it_compact = 0; // Running index for the compact slice of this expert.
|
||||
|
||||
if constexpr (n_expert_used_template == 0) {
|
||||
// Generic implementation:
|
||||
for (int it = 0; it < n_tokens; ++it) {
|
||||
int iex_used = -1; // The index at which the expert is used, if any.
|
||||
for (int iex = threadIdx.x; iex < n_expert_used; iex += warp_size) {
|
||||
const int expert_used = ids[it*si1 + iex];
|
||||
nex_prev += expert_used < expert;
|
||||
if (expert_used == expert) {
|
||||
iex_used = iex;
|
||||
}
|
||||
}
|
||||
|
||||
if (iex_used != -1) {
|
||||
store[it_compact] = mmq_ids_helper_store(it, iex_used);
|
||||
}
|
||||
|
||||
if (warp_reduce_any<warp_size>(iex_used != -1)) {
|
||||
it_compact++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Implementation optimized for specific numbers of experts used:
|
||||
static_assert(n_expert_used == 6 || warp_size % n_expert_used == 0, "bad n_expert_used");
|
||||
const int neu_padded = n_expert_used == 6 ? 8 : n_expert_used; // Padded to next higher power of 2.
|
||||
for (int it0 = 0; it0 < n_tokens; it0 += warp_size/neu_padded) {
|
||||
const int it = it0 + threadIdx.x / neu_padded;
|
||||
|
||||
const int iex = threadIdx.x % neu_padded; // The index at which the expert is used, if any.
|
||||
const int expert_used = (neu_padded == n_expert_used || iex < n_expert_used) && it < n_tokens ?
|
||||
ids[it*si1 + iex] : INT_MAX;
|
||||
const int iex_used = expert_used == expert ? iex : -1;
|
||||
nex_prev += expert_used < expert;
|
||||
|
||||
// Whether the threads at this token position have used the expert:
|
||||
const int it_compact_add_self = warp_reduce_any<neu_padded>(iex_used != -1);
|
||||
|
||||
// Do a scan over threads at lower token positions in warp to get the correct index for writing data:
|
||||
int it_compact_add_lower = 0;
|
||||
#pragma unroll
|
||||
for (int offset = neu_padded; offset < warp_size; offset += neu_padded) {
|
||||
const int tmp = __shfl_up_sync(0xFFFFFFFF, it_compact_add_self, offset, warp_size);
|
||||
if (threadIdx.x >= offset) {
|
||||
it_compact_add_lower += tmp;
|
||||
}
|
||||
}
|
||||
|
||||
if (iex_used != -1) {
|
||||
store[it_compact + it_compact_add_lower] = mmq_ids_helper_store(it, iex_used);
|
||||
}
|
||||
|
||||
// The thread with the highest index in the warp always has the sum over the whole warp, use it to increment all threads:
|
||||
it_compact += __shfl_sync(0xFFFFFFFF, it_compact_add_lower + it_compact_add_self, warp_size - 1, warp_size);
|
||||
}
|
||||
}
|
||||
nex_prev = warp_reduce_sum<warp_size>(nex_prev);
|
||||
|
||||
for (int itc = threadIdx.x; itc < it_compact; itc += warp_size) {
|
||||
const mmq_ids_helper_store store_it = store[itc];
|
||||
const int it = store_it.it();
|
||||
const int iex_used = store_it.iex_used();
|
||||
ids_src1[nex_prev + itc] = it*sis1 + iex_used % nchannels_y;
|
||||
ids_dst [nex_prev + itc] = it*n_expert_used + iex_used;
|
||||
}
|
||||
|
||||
if (threadIdx.x != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
expert_bounds[expert] = nex_prev;
|
||||
|
||||
if (expert < gridDim.x - 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
expert_bounds[gridDim.x] = nex_prev + it_compact;
|
||||
}
|
||||
|
||||
template <int n_expert_used_template>
|
||||
static void launch_mmq_ids_helper(
|
||||
const int32_t * __restrict__ ids, int32_t * __restrict__ ids_src1, int32_t * __restrict__ ids_dst, int32_t * __restrict__ expert_bounds,
|
||||
const int n_experts, const int n_tokens, const int n_expert_used_var, const int nchannels_y, const int si1, const int sis1, cudaStream_t stream) {
|
||||
GGML_ASSERT(n_tokens < (1 << 22) && "too few bits in mmq_ids_helper_store");
|
||||
GGML_ASSERT(n_expert_used_var < (1 << 10) && "too few bits in mmq_ids_helper_store");
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
const int warp_size = ggml_cuda_get_physical_warp_size_host(); //ggml_cuda_info().devices[id].warp_size;
|
||||
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT(mmq_ids_helper<n_expert_used_template>, smpbo);
|
||||
|
||||
const dim3 num_blocks(n_experts, 1, 1);
|
||||
const dim3 block_size(warp_size, 1, 1);
|
||||
const size_t nbytes_shared = n_tokens*sizeof(mmq_ids_helper_store);
|
||||
mmq_ids_helper<n_expert_used_template><<<num_blocks, block_size, nbytes_shared, stream>>>
|
||||
(ids, ids_src1, ids_dst, expert_bounds, n_tokens, n_expert_used_var, nchannels_y, si1, sis1);
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_q_switch_type_id(ggml_backend_cuda_context & ctx, const mmq_args_id & args, cudaStream_t stream) {
|
||||
switch (args.type_x) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
mul_mat_q_case_id<GGML_TYPE_Q4_0>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
mul_mat_q_case_id<GGML_TYPE_Q4_1>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
mul_mat_q_case_id<GGML_TYPE_Q5_0>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
mul_mat_q_case_id<GGML_TYPE_Q5_1>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q6_0:
|
||||
mul_mat_q_case_id<GGML_TYPE_Q6_0>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
mul_mat_q_case_id<GGML_TYPE_Q8_0>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_MXFP4:
|
||||
mul_mat_q_case_id<GGML_TYPE_MXFP4>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_q_case_id<GGML_TYPE_Q2_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
mul_mat_q_case_id<GGML_TYPE_Q3_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
mul_mat_q_case_id<GGML_TYPE_Q4_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
mul_mat_q_case_id<GGML_TYPE_Q5_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
mul_mat_q_case_id<GGML_TYPE_Q6_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ2_XS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ2_S>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ3_S>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ1_S>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ1_S_R4:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ1_S_R4>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ4_XS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ4_NL>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_KS:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ2_KS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_KL:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ2_KL>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_K:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ2_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_K_R4:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ2_K_R4>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_K:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ3_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_K_R4:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ3_K_R4>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_KS:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ3_KS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_KSS:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ4_KSS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_KS:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ4_KS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_KS_R4:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ4_KS_R4>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_K:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ4_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_K_R4:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ4_K_R4>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ5_KS:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ5_KS>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ5_KS_R4:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ5_KS_R4>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ5_K:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ5_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ5_K_R4:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ5_K_R4>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ6_K:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ6_K>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ1_KT:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ1_KT>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_KT:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ2_KT>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_KT:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ3_KT>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_KT:
|
||||
mul_mat_q_case_id<GGML_TYPE_IQ4_KT>(ctx, args, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void compute_row_ids(const int32_t * ids, int32_t * ids_src1, int32_t * ids_dst, int32_t * expert_bounds,
|
||||
int64_t ne02, int64_t ne12, int64_t n_expert_used, int64_t ne11, int64_t nb11, int64_t nb12, int64_t nb21,
|
||||
cudaStream_t stream) {
|
||||
|
||||
const int si1 = nb21 / sizeof(int);
|
||||
const int sis1 = nb12 / nb11;
|
||||
|
||||
switch (n_expert_used) {
|
||||
case 2:
|
||||
launch_mmq_ids_helper< 2> (ids, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
case 4:
|
||||
launch_mmq_ids_helper< 4> (ids, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
case 6:
|
||||
launch_mmq_ids_helper< 6> (ids, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
case 8:
|
||||
launch_mmq_ids_helper< 8> (ids, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
case 16:
|
||||
launch_mmq_ids_helper<16> (ids, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
case 32:
|
||||
launch_mmq_ids_helper<32> (ids, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
default:
|
||||
launch_mmq_ids_helper< 0> (ids, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
void ggml_cuda_mul_mat_q_id(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
|
||||
const ggml_tensor * ids_tensor, ggml_tensor * dst, char * ids_data, char * src1_quantized_data) {
|
||||
GGML_ASSERT( src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ids_tensor->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
|
||||
//const size_t ts_src0 = ggml_type_size(src0->type);
|
||||
const size_t ts_src1 = ggml_type_size(src1->type);
|
||||
const size_t ts_dst = ggml_type_size(dst->type);
|
||||
|
||||
//GGML_ASSERT( nb00 == ts_src0);
|
||||
GGML_ASSERT( nb10 == ts_src1);
|
||||
GGML_ASSERT( nb0 == ts_dst);
|
||||
GGML_ASSERT(ids_tensor->nb[0] == ggml_type_size(ids_tensor->type));
|
||||
|
||||
GGML_ASSERT(ne13 == 1);
|
||||
GGML_ASSERT(nb12 % nb11 == 0);
|
||||
GGML_ASSERT(nb2 % nb1 == 0);
|
||||
|
||||
const char * src0_d = (const char *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
// If src0 is a temporary compute buffer, clear any potential padding.
|
||||
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
const size_t size_data = ggml_nbytes(src0);
|
||||
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
|
||||
if (size_alloc > size_data) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
|
||||
const int64_t s01 = src0->nb[1];// / ts_src0;
|
||||
const int64_t s1 = dst->nb[1] / ts_dst;
|
||||
const int64_t s02 = src0->nb[2];// / ts_src0;
|
||||
const int64_t s2 = dst->nb[2] / ts_dst;
|
||||
const int64_t s03 = src0->nb[3];// / ts_src0;
|
||||
const int64_t s3 = dst->nb[3] / ts_dst;
|
||||
|
||||
const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
|
||||
|| GGML_CUDA_CC_IS_CDNA(cc);
|
||||
|
||||
const int64_t n_expert_used = ids_tensor->ne[0];
|
||||
const int64_t ne_get_rows = ne12 * n_expert_used;
|
||||
GGML_ASSERT(ne1 == n_expert_used);
|
||||
|
||||
ggml_cuda_pool_alloc<int32_t> ids_src1_local(ctx.pool());
|
||||
ggml_cuda_pool_alloc<int32_t> ids_dst_local(ctx.pool());
|
||||
ggml_cuda_pool_alloc<int32_t> expert_bounds_local(ctx.pool());
|
||||
|
||||
int32_t * ids_src1, *ids_dst, *expert_bounds;
|
||||
if (ids_data) {
|
||||
ids_src1 = (int32_t *)ids_data;
|
||||
ids_dst = ids_src1 + ne_get_rows;
|
||||
expert_bounds = ids_dst + ne_get_rows;
|
||||
}
|
||||
else {
|
||||
GGML_ASSERT(ids_tensor->nb[0] == ggml_element_size(ids_tensor));
|
||||
|
||||
ids_src1_local.alloc(ne_get_rows);
|
||||
ids_dst_local.alloc(ne_get_rows);
|
||||
expert_bounds_local.alloc(ne02 + 1);
|
||||
|
||||
ids_src1 = ids_src1_local.get();
|
||||
ids_dst = ids_dst_local.get();
|
||||
expert_bounds = expert_bounds_local.get();
|
||||
|
||||
const int si1 = ids_tensor->nb[1] / ggml_element_size(ids_tensor);
|
||||
const int sis1 = nb12 / nb11;
|
||||
|
||||
switch (n_expert_used) {
|
||||
case 2:
|
||||
launch_mmq_ids_helper< 2> ((const int32_t *) ids_tensor->data, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
case 4:
|
||||
launch_mmq_ids_helper< 4> ((const int32_t *) ids_tensor->data, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
case 6:
|
||||
launch_mmq_ids_helper< 6> ((const int32_t *) ids_tensor->data, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
case 8:
|
||||
launch_mmq_ids_helper< 8> ((const int32_t *) ids_tensor->data, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
case 16:
|
||||
launch_mmq_ids_helper<16> ((const int32_t *) ids_tensor->data, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
case 32:
|
||||
launch_mmq_ids_helper<32> ((const int32_t *) ids_tensor->data, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
default:
|
||||
launch_mmq_ids_helper< 0> ((const int32_t *) ids_tensor->data, ids_src1, ids_dst, expert_bounds,
|
||||
ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
|
||||
break;
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
const int64_t ne11_flat = ne12*n_expert_used;
|
||||
const int64_t ne12_flat = 1;
|
||||
const int64_t ne13_flat = 1;
|
||||
|
||||
const size_t nbytes_src1_q8_1 = ne11_flat*ne10_padded * sizeof(block_q8_1)/QK8_1 +
|
||||
get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
|
||||
|
||||
ggml_cuda_pool_alloc<char> src1_q8_1_local(ctx.pool());
|
||||
|
||||
char * src1_q8_1;
|
||||
|
||||
if (src1_quantized_data) {
|
||||
src1_q8_1 = src1_quantized_data;
|
||||
} else {
|
||||
|
||||
src1_q8_1_local.alloc(nbytes_src1_q8_1);
|
||||
src1_q8_1 = src1_q8_1_local.get();
|
||||
|
||||
const int64_t s11 = src1->nb[1] / ts_src1;
|
||||
const int64_t s12 = src1->nb[2] / ts_src1;
|
||||
const int64_t s13 = src1->nb[2] / ts_src1;
|
||||
quantize_mmq_q8_1_cuda_id(src1_d, ids_src1, src1_q8_1, src0->type,
|
||||
ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
|
||||
const int64_t s13 = ne12*s12;
|
||||
|
||||
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
|
||||
const mmq_args_id args = {
|
||||
src0_d, src0->type, (const int *) src1_q8_1, ids_dst, expert_bounds, dst_d,
|
||||
ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
|
||||
ne02, ne02, s02, s12, s2,
|
||||
ne03, ne13, s03, s13, s3,
|
||||
use_stream_k, ne12};
|
||||
|
||||
//printf("ne00 = %ld, ne01 = %ld, ne_get_rows = %ld, s01 = %ld, s1 = %ld\n", ne00, ne01, ne_get_rows, s01, s1);
|
||||
//printf("ne02 = %ld, s02 = %ld, s12 = %ld, s2 = %ld\n", ne02, s02, s12, s2);
|
||||
//printf("ne03 = %ld, s03 = %ld, s13 = %ld, s3 = %ld\n", ne03, s03, s13, s3);
|
||||
|
||||
ggml_cuda_mul_mat_q_switch_type_id(ctx, args, stream);
|
||||
}
|
||||
|
||||
bool ggml_cuda_can_use_mmq_id(enum ggml_type type, int cc, int64_t ne11) {
|
||||
bool mmq_supported;
|
||||
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q6_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ1_S_R4:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ2_KS:
|
||||
case GGML_TYPE_IQ2_KL:
|
||||
case GGML_TYPE_IQ2_K:
|
||||
case GGML_TYPE_IQ2_K_R4:
|
||||
case GGML_TYPE_IQ3_KS:
|
||||
case GGML_TYPE_IQ3_K:
|
||||
case GGML_TYPE_IQ3_K_R4:
|
||||
case GGML_TYPE_IQ4_KSS:
|
||||
case GGML_TYPE_IQ4_KS:
|
||||
case GGML_TYPE_IQ4_KS_R4:
|
||||
case GGML_TYPE_IQ4_K:
|
||||
case GGML_TYPE_IQ4_K_R4:
|
||||
case GGML_TYPE_IQ5_KS:
|
||||
case GGML_TYPE_IQ5_KS_R4:
|
||||
case GGML_TYPE_IQ5_K:
|
||||
case GGML_TYPE_IQ5_K_R4:
|
||||
case GGML_TYPE_IQ6_K:
|
||||
case GGML_TYPE_IQ1_KT:
|
||||
case GGML_TYPE_IQ2_KT:
|
||||
case GGML_TYPE_IQ3_KT:
|
||||
case GGML_TYPE_IQ4_KT:
|
||||
mmq_supported = true;
|
||||
break;
|
||||
default:
|
||||
mmq_supported = false;
|
||||
break;
|
||||
}
|
||||
|
||||
if (!mmq_supported) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (turing_mma_available(cc)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_DP4A) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
return true;
|
||||
#endif //GGML_CUDA_FORCE_MMQ
|
||||
|
||||
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
|
||||
return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
if (amd_mfma_available(cc)) {
|
||||
// As of ROCM 7.0 rocblas/tensile performs very poorly on CDNA3 and hipblaslt (via ROCBLAS_USE_HIPBLASLT)
|
||||
// performs better but is currently suffering from a crash on this architecture.
|
||||
// TODO: Revisit when hipblaslt is fixed on CDNA3
|
||||
if (GGML_CUDA_CC_IS_CDNA3(cc)) {
|
||||
return true;
|
||||
}
|
||||
if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0
|
||||
|| type == GGML_TYPE_Q5_1 || type == GGML_TYPE_Q6_0) {
|
||||
return true;
|
||||
}
|
||||
if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
|
||||
}
|
||||
12
ggml/src/ggml-cuda/mmq_id.cuh
Normal file
12
ggml/src/ggml-cuda/mmq_id.cuh
Normal file
@@ -0,0 +1,12 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_mul_mat_q_id(
|
||||
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids,
|
||||
ggml_tensor * dst, char * ids_data, char * src1_quantized_data);
|
||||
|
||||
void compute_row_ids(const int32_t * ids, int32_t * ids_src1, int32_t * ids_dst, int32_t * expert_bounds,
|
||||
int64_t ne02, int64_t ne12, int64_t n_expert_used, int64_t ne11, int64_t nb11, int64_t nb12, int64_t nb21, cudaStream_t stream);
|
||||
|
||||
bool ggml_cuda_can_use_mmq_id(enum ggml_type type, int cc, int64_t ne11);
|
||||
4183
ggml/src/ggml-cuda/mmq_id_common.cuh
Normal file
4183
ggml/src/ggml-cuda/mmq_id_common.cuh
Normal file
File diff suppressed because it is too large
Load Diff
132
ggml/src/ggml-cuda/quantize_id.cu
Normal file
132
ggml/src/ggml-cuda/quantize_id.cu
Normal file
@@ -0,0 +1,132 @@
|
||||
#include "quantize_id.cuh"
|
||||
#include "mmq.cuh"
|
||||
#include <cstdint>
|
||||
|
||||
template <mmq_q8_1_ds_layout ds_layout>
|
||||
static __global__ void quantize_mmq_q8_1(
|
||||
const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
|
||||
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t ne0, const int ne1, const int ne2) {
|
||||
|
||||
constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
|
||||
constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;
|
||||
|
||||
const int64_t i0 = ((int64_t)blockDim.x*blockIdx.y + threadIdx.x)*4;
|
||||
|
||||
if (i0 >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i1 = blockIdx.x;
|
||||
const int64_t i2 = blockIdx.z % ne2;
|
||||
const int64_t i3 = blockIdx.z / ne2;
|
||||
|
||||
const int64_t i00 = i0;
|
||||
const int64_t i01 = ids ? ids[i1] : i1;
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i03 = i3;
|
||||
|
||||
const float4 * x4 = (const float4 *) x;
|
||||
|
||||
block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
|
||||
|
||||
const int64_t ib0 = blockIdx.z*((int64_t)gridDim.x*gridDim.y*blockDim.x/QK8_1); // first block of channel
|
||||
const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.x; // block index in channel
|
||||
const int64_t iqs = i0 % (4*QK8_1); // quant index in block
|
||||
|
||||
// Load 4 floats per thread and calculate max. abs. value between them:
|
||||
const float4 xi = i0 < ne00 ? x4[(i03*s03 + i02*s02 + i01*s01 + i00)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
float amax = fabsf(xi.x);
|
||||
amax = fmaxf(amax, fabsf(xi.y));
|
||||
amax = fmaxf(amax, fabsf(xi.z));
|
||||
amax = fmaxf(amax, fabsf(xi.w));
|
||||
|
||||
// Exchange max. abs. value between vals_per_scale/4 threads.
|
||||
#pragma unroll
|
||||
for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) {
|
||||
amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, offset, WARP_SIZE));
|
||||
}
|
||||
|
||||
float sum;
|
||||
if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) {
|
||||
sum = xi.x + xi.y + xi.z + xi.w;
|
||||
|
||||
// Calculate sums across vals_per_sum/4 threads.
|
||||
#pragma unroll
|
||||
for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
|
||||
sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
const float d_inv = 127.0f / amax;
|
||||
char4 q;
|
||||
q.x = roundf(xi.x*d_inv);
|
||||
q.y = roundf(xi.y*d_inv);
|
||||
q.z = roundf(xi.z*d_inv);
|
||||
q.w = roundf(xi.w*d_inv);
|
||||
|
||||
// Write back 4 int8 values as a single 32 bit value for better memroy bandwidth:
|
||||
char4 * yqs4 = (char4 *) y[ib].qs;
|
||||
yqs4[iqs/4] = q;
|
||||
|
||||
if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6) {
|
||||
if (iqs % 16 != 0 || iqs >= 96) {
|
||||
return;
|
||||
}
|
||||
|
||||
y[ib].d2s6[2 + iqs/16] = sum;
|
||||
|
||||
if (iqs % 64 != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float d = 1.0f / d_inv;
|
||||
|
||||
y[ib].d2s6[iqs/64] = d;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (iqs % 32 != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float d = 1.0f / d_inv;
|
||||
|
||||
if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) {
|
||||
y[ib].ds4[iqs/32] = make_half2(d, sum);
|
||||
} else {
|
||||
y[ib].d4[iqs/32] = d;
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_mmq_q8_1_cuda_id(
|
||||
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
|
||||
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
GGML_ASSERT(ne0 % (4*QK8_1) == 0);
|
||||
GGML_ASSERT(ids);
|
||||
|
||||
// ne1 tends to assume the highest values, therefore use it as the "x" dimension of the CUDA grid:
|
||||
const int64_t block_num_y = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
|
||||
const dim3 num_blocks(ne1, block_num_y, ne2*ne3);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
|
||||
switch (mmq_get_q8_1_ds_layout(type_src0)) {
|
||||
case MMQ_Q8_1_DS_LAYOUT_D4:
|
||||
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D4>
|
||||
<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
|
||||
break;
|
||||
case MMQ_Q8_1_DS_LAYOUT_DS4:
|
||||
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_DS4>
|
||||
<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
|
||||
break;
|
||||
case MMQ_Q8_1_DS_LAYOUT_D2S6:
|
||||
quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D2S6>
|
||||
<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
16
ggml/src/ggml-cuda/quantize_id.cuh
Normal file
16
ggml/src/ggml-cuda/quantize_id.cuh
Normal file
@@ -0,0 +1,16 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
||||
#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128
|
||||
|
||||
//static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access.");
|
||||
//static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
|
||||
|
||||
void quantize_mmq_q8_1_cuda_id(
|
||||
const float * x, const int32_t * ids, void * vy,
|
||||
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
|
||||
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
|
||||
@@ -0,0 +1,83 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq1_kt(
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

constexpr int nwarps = mmq_get_nwarps_device();

constexpr uint32_t ka = 0xCBAC1FED;
constexpr uint32_t km = 0x3f3f3f3f;

#ifdef INT8_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y);
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + txs.qs);
#endif // INT8_MMA_AVAILABLE

const int kqsx = threadIdx.x;

#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
int i = i0 + threadIdx.y;

if (need_check) {
i = min(i, i_max);
}

const block_iq1_kt * bxi = (const block_iq1_kt *)(x + i*stride + sizeof(float)) + kbx0;

int ib32 = kqsx/4;
int j = kqsx%4;
uint32_t val = bxi->ql[kqsx] + ((bxi->qh[kqsx%16] << (8 - 4*(kqsx/16))) & 0xf00) + ((bxi->sh[kqsx/4] << (8 - (kqsx%4))) & 0x1000) + 4096;
int2 v = {0, 0};
for (int k = 0; k < 4; ++k) {
val *= ka;
v.x |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
}
for (int k = 0; k < 4; ++k) {
val *= ka;
v.y |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
}
#ifdef INT8_MMA_AVAILABLE
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*ib32 + 2*j + 0] = v.x;
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*ib32 + 2*j + 1] = v.y;
#else
x_qs[i*(2*WARP_SIZE + 1) + 8*ib32 + 2*j + 0] = v.x;
x_qs[i*(2*WARP_SIZE + 1) + 8*ib32 + 2*j + 1] = v.y;
#endif // INT8_MMA_AVAILABLE
}

#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
int i = i0 + threadIdx.y * 4 + threadIdx.x / (WARP_SIZE/4);

if (need_check) {
i = min(i, i_max);
}

const float * dptr = (const float *)(x + i*stride);
const float d = dptr[0];
const block_iq1_kt * bxi = (const block_iq1_kt *)(dptr + 1) + kbx0;
const int ls = iq4k_values[bxi->sh[threadIdx.x % 8] & 0xf];

#ifdef INT8_MMA_AVAILABLE
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + threadIdx.x % 8] = d * ls;
#else
x_df[i*(WARP_SIZE/4) + i/4 + threadIdx.x % 8] = d * ls;
#endif // INT8_MMA_AVAILABLE
}
}

template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ1_KT> {
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq1_kt<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
};

DECL_MMQ_CASE(GGML_TYPE_IQ1_KT);
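Note on the trellis ("KT") loaders: the quantized data stores small integer seeds, and the dequantized values are generated on the fly by repeatedly multiplying the seed by 0xCBAC1FED and summing the four 6-bit byte fields of the result, which is what ggml_cuda_dp4a(val & 0x3f3f3f3f, 0x01010101, -126) computes per step. A host-side emulation of that expansion, assuming one seed yields eight int8 values as in load_tiles_iq1_kt above (decode_kt_seed is an illustrative name):

// Host-side emulation of the on-the-fly trellis expansion used by the *_KT
// loaders: one seed -> 8 int8 values.
#include <cstdint>

static void decode_kt_seed(uint32_t seed, int8_t out[8]) {
    constexpr uint32_t ka = 0xCBAC1FED;   // multiplier of the recurrence
    constexpr uint32_t km = 0x3f3f3f3f;   // keep 6 bits of every byte
    uint32_t val = seed;
    for (int k = 0; k < 8; ++k) {
        val *= ka;                        // advance the generator
        const uint32_t m = val & km;      // four 6-bit "digits"
        const int sum = (int)((m >> 0) & 0xff) + (int)((m >> 8) & 0xff)
                      + (int)((m >> 16) & 0xff) + (int)((m >> 24) & 0xff);
        out[k] = (int8_t)(sum - 126);     // same value as dp4a(m, 0x01010101, -126)
    }
}

The individual KT types differ only in how the seed is assembled (iq1_kt adds 4096 plus bits gathered from ql/qh/sh, as above) and in how signs and scales are applied afterwards.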
@@ -0,0 +1,6 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
DECL_MMQ_CASE(GGML_TYPE_IQ1_S_R4);
200
ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_k_id.cu
Normal file
@@ -0,0 +1,200 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmq_id_common.cuh"
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_k(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
//constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
constexpr int qstep = 8;
|
||||
const int kqsx = threadIdx.x % qstep;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/qstep) {
|
||||
int i = i0 + threadIdx.y*(WARP_SIZE/qstep) + threadIdx.x/qstep;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_iq2_k * bxi = (const block_iq2_k *)(x + i*stride) + kbx0;
|
||||
|
||||
const float d = bxi->d;
|
||||
uint16_t extra = bxi->extra >> (kqsx/4);
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
uint32_t extra32[2] = { uint32_t(extra & 0xff) * 0x01010101, uint32_t(extra >> 8) * 0x01010101 };
|
||||
#pragma unroll
|
||||
for (int l = 0; l < qstep/4; ++l) {
|
||||
const int ql = get_int_b4(bxi->qs, kqsx + qstep*l);
|
||||
uint32_t val1 = ((ql >> 0) & 0x33333333) | ((extra32[l] << 2) & 0x44444444);
|
||||
uint32_t val2 = ((ql >> 2) & 0x33333333) | ((extra32[l] << 0) & 0x44444444);
|
||||
int2 v1 = get_int_from_table_8(val1, iq2nl_values);
|
||||
int2 v2 = get_int_from_table_8(val2, iq2nl_values);
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 32*l + 0] = v1.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 32*l + 8] = v2.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 32*l + 16] = v1.y;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 32*l + 24] = v2.y;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 0] = v1.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 8] = v2.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 16] = v1.y;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 24] = v2.y;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
auto all_values = (const int *)iq2k_table;
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < qstep/4; ++l) {
|
||||
|
||||
const int ql = get_int_b4(bxi->qs, kqsx + qstep*l);
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 32*l + 0] = int_from_table_4((ql >> 0) & 0x03030303, all_values + ((extra & 0x01) << 8));
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 32*l + 8] = int_from_table_4((ql >> 2) & 0x03030303, all_values + ((extra & 0x04) << 6));
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 32*l + 16] = int_from_table_4((ql >> 4) & 0x03030303, all_values + ((extra & 0x10) << 4));
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 32*l + 24] = int_from_table_4((ql >> 6) & 0x03030303, all_values + ((extra & 0x40) << 2));
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 0] = int_from_table_4((ql >> 0) & 0x03030303, all_values + ((extra & 0x01) << 8));
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 8] = int_from_table_4((ql >> 2) & 0x03030303, all_values + ((extra & 0x04) << 6));
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 16] = int_from_table_4((ql >> 4) & 0x03030303, all_values + ((extra & 0x10) << 4));
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 24] = int_from_table_4((ql >> 6) & 0x03030303, all_values + ((extra & 0x40) << 2));
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
extra >>= 8;
|
||||
}
|
||||
#endif // __CUDA_ARCH__
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = d * (((bxi->scales[kqsx] >> 0) & 0xf) - 8);
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = d * (((bxi->scales[kqsx] >> 4) & 0xf) - 8);
|
||||
#else
|
||||
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = d * (((bxi->scales[kqsx] >> 0) & 0xf) - 8);
|
||||
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = d * (((bxi->scales[kqsx] >> 4) & 0xf) - 8);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_k_r4(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
const int kqsx = threadIdx.x/4; // 0...7 -> block of 32
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += 4*nwarps) {
|
||||
int i = i0 + 4*threadIdx.y + threadIdx.x%4;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
int i4 = i/4;
|
||||
int ir = i%4;
|
||||
|
||||
const block_iq2_k_r4 * bxi = (const block_iq2_k_r4 *)(x + 4*i4*stride) + kbx0;
|
||||
|
||||
const float d = __half2float(bxi->d[ir]);
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#pragma unroll
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
|
||||
uint32_t extra = uint32_t((bxi->extra[ir+4*l] >> kqsx) & 1) * 0x04040404;
|
||||
extra = extra | (extra << 4);
|
||||
|
||||
const int ql = get_int_b4(bxi->qs, 8*kqsx + ir + 4*l);
|
||||
uint32_t val1 = ((ql >> 0) & 0x33333333) | extra;
|
||||
uint32_t val2 = ((ql >> 2) & 0x33333333) | extra;
|
||||
int2 v1 = get_int_from_table_8(val1, iq2nl_values);
|
||||
int2 v2 = get_int_from_table_8(val2, iq2nl_values);
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 0] = v1.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 1] = v2.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 2] = v1.y;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 3] = v2.y;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 0] = v1.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 1] = v2.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 2] = v1.y;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 3] = v2.y;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
|
||||
auto values_l = (const int *)iq2k_table + (((bxi->extra[ir+4*l] >> kqsx) & 1) << 8);
|
||||
|
||||
const int ql = get_int_b4(bxi->qs, 8*kqsx + ir + 4*l);
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 0] = int_from_table_4((ql >> 0) & 0x03030303, values_l);
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 1] = int_from_table_4((ql >> 2) & 0x03030303, values_l);
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 2] = int_from_table_4((ql >> 4) & 0x03030303, values_l);
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 3] = int_from_table_4((ql >> 6) & 0x03030303, values_l);
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 0] = int_from_table_4((ql >> 0) & 0x03030303, values_l);
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 1] = int_from_table_4((ql >> 2) & 0x03030303, values_l);
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 2] = int_from_table_4((ql >> 4) & 0x03030303, values_l);
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 3] = int_from_table_4((ql >> 6) & 0x03030303, values_l);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
#endif // __CUDA_ARCH__
|
||||
|
||||
int is = 8*kqsx + ir;
|
||||
float dl1 = d * (((bxi->scales[is%32] >> 4*(is/32)) & 0xf) - 8);
|
||||
is += 4;
|
||||
float dl2 = d * (((bxi->scales[is%32] >> 4*(is/32)) & 0xf) - 8);
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = dl1;
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = dl2;
|
||||
#else
|
||||
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = dl1;
|
||||
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = dl2;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ2_K> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq2_k<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ2_K_R4> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq2_k_r4<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ2_K);
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ2_K_R4);
|
||||
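Note on the IQ2_K bit gymnastics above: each 32-bit word of qs holds sixteen 2-bit quants; masking with 0x33333333 keeps one 2-bit quant in each nibble lane, and the broadcast "extra" byte contributes bit 2 of every nibble, so each nibble becomes a 3-bit index into an 8-entry non-linear codebook (resolved by get_int_from_table_8 / iq2nl_values). A CPU sketch of that decode, with kCodebook as a placeholder for the real table and the second argument being the broadcast extra byte (uint32_t(extra & 0xff) * 0x01010101):

// Illustrative CPU decode of one 32-bit word produced by the masking above:
// eight 3-bit indices, one per nibble, looked up in an 8-entry codebook.
#include <cstdint>

static void decode_iq2_k_word(uint32_t ql, uint32_t extra_broadcast, int8_t out[8]) {
    static const int8_t kCodebook[8] = { -7, -3, 1, 5, -6, -2, 2, 6 }; // placeholder, not iq2nl_values
    const uint32_t packed = ((ql >> 0) & 0x33333333)            // 2-bit quant in every nibble
                          | ((extra_broadcast << 2) & 0x44444444); // extra bit -> bit 2 of every nibble
    for (int k = 0; k < 8; ++k) {
        out[k] = kCodebook[(packed >> 4*k) & 0x7];
    }
}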
@@ -0,0 +1,72 @@
|
||||
#include "../mmq_id_common.cuh"
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_kl(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
const int kqsx = threadIdx.x/4;
|
||||
|
||||
uint32_t aux32[2];
|
||||
const uint8_t * a8 = (const uint8_t *)aux32;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += 4*nwarps) {
|
||||
int i = i0 + 4*threadIdx.y + threadIdx.x%4;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const half * dptr = (const half *)(x + i*stride);
|
||||
const float d = *dptr;
|
||||
const block_iq2_kl * bxi = (const block_iq2_kl *)(dptr + 1) + kbx0;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
auto ql = get_int_b2(bxi->qs, 4*(kqsx/2) + 2*(kqsx%2) + j);
|
||||
auto qh = get_int_b2(bxi->qh, 2*(kqsx%2) + j) >> 2*(kqsx/2);
|
||||
aux32[0] = ((ql >> 0) & 0x0f0f0f0f) | ((qh << 4) & 0x10101010);
|
||||
aux32[1] = ((ql >> 4) & 0x0f0f0f0f) | ((qh << 3) & 0x10101010);
|
||||
#pragma unroll
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
int val1 = iq2kl_values[a8[2*l+0]] | (iq2kl_values[a8[2*l+1]] << 16);
|
||||
int val2 = iq2kl_values[a8[2*l+4]] | (iq2kl_values[a8[2*l+5]] << 16);
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 16*(kqsx/2) + 4*(kqsx%2) + 2*j + l + 0] = val1;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 16*(kqsx/2) + 4*(kqsx%2) + 2*j + l + 8] = val2;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 16*(kqsx/2) + 4*(kqsx%2) + 2*j + l + 0] = val1;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 16*(kqsx/2) + 4*(kqsx%2) + 2*j + l + 8] = val2;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
int ls = int(((bxi->scales_l[kqsx%4] >> 4*(kqsx/4)) & 0xf) | (((bxi->scales_h >> 2*kqsx) & 3) << 4)) - 32;
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = d * ls;
|
||||
#else
|
||||
x_df[i*(WARP_SIZE/4) + i/4 + kqsx] = d * ls;
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ2_KL> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq2_kl<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ2_KL);
|
||||
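Note on the IQ2_KL loader above: the masks rebuild a 5-bit index per byte (4 bits from qs, 1 bit from qh), and each index selects a pre-packed pair of int8 codebook values from iq2kl_values, which is why two table entries are OR-ed into a single 32-bit lane. A rough CPU sketch of the index assembly; kPairs stands in for the real packed-pair table:

// Illustrative assembly of iq2_kl indices and pair lookup, mirroring the
// aux32[0]/aux32[1] and "iq2kl_values[a8[0]] | (iq2kl_values[a8[1]] << 16)"
// expressions above. kPairs is a placeholder table.
#include <cstdint>

static const uint16_t kPairs[32] = {0};  // placeholder: each entry packs two int8 codebook values

static void decode_iq2_kl_group(uint32_t ql, uint32_t qh, uint32_t out[2]) {
    const uint32_t lo = ((ql >> 0) & 0x0f0f0f0f) | ((qh << 4) & 0x10101010); // 5-bit indices, low nibbles
    const uint32_t hi = ((ql >> 4) & 0x0f0f0f0f) | ((qh << 3) & 0x10101010); // 5-bit indices, high nibbles
    out[0] = (uint32_t)kPairs[(lo >> 0) & 0x1f] | ((uint32_t)kPairs[(lo >> 8) & 0x1f] << 16);
    out[1] = (uint32_t)kPairs[(hi >> 0) & 0x1f] | ((uint32_t)kPairs[(hi >> 8) & 0x1f] << 16);
}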
114
ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_ks_id.cu
Normal file
@@ -0,0 +1,114 @@
|
||||
#include "../mmq_id_common.cuh"
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_ks(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
const int kqsx = threadIdx.x%16;
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += 2*nwarps) {
|
||||
int i = i0 + 2*threadIdx.y + threadIdx.x/16;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_iq2_ks * bxi = (const block_iq2_ks *)(x + i*stride + sizeof(half)) + kbx0;
|
||||
|
||||
uint16_t extra = bxi->extra >> 4*(kqsx/8);
|
||||
int q2 = get_int_b2(bxi->qs, kqsx);
|
||||
|
||||
uint32_t extra32 = uint32_t(extra & 0xf) * 0x01010101;
|
||||
uint32_t val1 = ((q2 >> 0) & 0x33333333) | ((extra32 << 2) & 0x04040404) | ((extra32 << 4) & 0x40404040);
|
||||
uint32_t val2 = ((q2 >> 2) & 0x33333333) | ((extra32 << 1) & 0x04040404) | ((extra32 << 3) & 0x40404040);
|
||||
int2 v1 = get_int_from_table_8(val1, iq2nl_values);
|
||||
int2 v2 = get_int_from_table_8(val2, iq2nl_values);
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx%8 + 32*(kqsx/8) + 0] = v1.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx%8 + 32*(kqsx/8) + 8] = v2.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx%8 + 32*(kqsx/8) + 16] = v1.y;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx%8 + 32*(kqsx/8) + 24] = v2.y;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx%8 + 32*(kqsx/8) + 0] = v1.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx%8 + 32*(kqsx/8) + 8] = v2.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx%8 + 32*(kqsx/8) + 16] = v1.y;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx%8 + 32*(kqsx/8) + 24] = v2.y;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
#else // __CUDA_ARCH__
|
||||
|
||||
|
||||
const int * all_values = (const int *)iq2k_table;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += 2*nwarps) {
|
||||
int i = i0 + 2*threadIdx.y + threadIdx.x/16;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_iq2_ks * bxi = (const block_iq2_ks *)(x + i*stride + sizeof(half)) + kbx0;
|
||||
|
||||
uint16_t extra = bxi->extra >> 4*(kqsx/8);
|
||||
int q2 = get_int_b2(bxi->qs, kqsx);
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx%8 + 32*(kqsx/8) + 0] = int_from_table_4((q2 >> 0) & 0x03030303, all_values + ((extra & 1) << 8));
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx%8 + 32*(kqsx/8) + 8] = int_from_table_4((q2 >> 2) & 0x03030303, all_values + ((extra & 2) << 7));
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx%8 + 32*(kqsx/8) + 16] = int_from_table_4((q2 >> 4) & 0x03030303, all_values + ((extra & 4) << 6));
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx%8 + 32*(kqsx/8) + 24] = int_from_table_4((q2 >> 6) & 0x03030303, all_values + ((extra & 8) << 5));
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx%8 + 32*(kqsx/8) + 0] = int_from_table_4((q2 >> 0) & 0x03030303, all_values + ((extra & 1) << 8));
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx%8 + 32*(kqsx/8) + 8] = int_from_table_4((q2 >> 2) & 0x03030303, all_values + ((extra & 2) << 7));
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx%8 + 32*(kqsx/8) + 16] = int_from_table_4((q2 >> 4) & 0x03030303, all_values + ((extra & 4) << 6));
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx%8 + 32*(kqsx/8) + 24] = int_from_table_4((q2 >> 6) & 0x03030303, all_values + ((extra & 8) << 5));
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
#endif // __CUDA_ARCH__
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
||||
int i = i0 + threadIdx.y * 8 + threadIdx.x / 4;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const half * dptr = (const half *)(x + i*stride);
|
||||
const float d = dptr[0];
|
||||
const block_iq2_ks * bxi = (const block_iq2_ks *)(dptr + 1) + kbx0;
|
||||
const int ls1 = ((bxi->scales[threadIdx.x % 4] >> 0) & 0xf) | ((bxi->extra >> (4 + 2*(threadIdx.x % 4))) & 0x10);
|
||||
const int ls2 = ((bxi->scales[threadIdx.x % 4] >> 4) & 0xf) | ((bxi->extra >> (5 + 2*(threadIdx.x % 4))) & 0x10);
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + 2*(threadIdx.x % 4) + 0] = d * (ls1 - 16);
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + 2*(threadIdx.x % 4) + 1] = d * (ls2 - 16);
|
||||
#else
|
||||
x_df[i*(WARP_SIZE/4) + i/4 + 2*(threadIdx.x % 4) + 0] = d * (ls1 - 16);
|
||||
x_df[i*(WARP_SIZE/4) + i/4 + 2*(threadIdx.x % 4) + 1] = d * (ls2 - 16);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ2_KS> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq2_ks<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ2_KS);
|
||||
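Note on the IQ2_KS scale path above: block scales are 4 bits each, extended by a fifth bit taken from the upper half of "extra", re-centred at 16, and multiplied by the per-row fp16 scale stored in front of the block data. A small CPU restatement of the ls1/ls2 expressions (decode_iq2_ks_scales is a made-up helper name):

// Illustrative reconstruction of one pair of iq2_ks block scales.
#include <cstdint>

static void decode_iq2_ks_scales(float d, const uint8_t scales[4], uint16_t extra,
                                 int pair, float & dl1, float & dl2) {
    // pair in 0..3 selects one packed scale byte (two 4-bit scales)
    const int ls1 = ((scales[pair] >> 0) & 0xf) | ((extra >> (4 + 2*pair)) & 0x10);
    const int ls2 = ((scales[pair] >> 4) & 0xf) | ((extra >> (5 + 2*pair)) & 0x10);
    dl1 = d * (ls1 - 16);
    dl2 = d * (ls2 - 16);
}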
@@ -0,0 +1,85 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmq_id_common.cuh"
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_kt(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
constexpr uint32_t ka = 0xCBAC1FED;
|
||||
constexpr uint32_t km = 0x3f3f3f3f;
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
const int kqsx = threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
||||
int i = i0 + threadIdx.y;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_iq2_kt * bxi = (const block_iq2_kt *)(x + i*stride + sizeof(float)) + kbx0;
|
||||
|
||||
int ib32 = kqsx/4;
|
||||
int j = kqsx%4;
|
||||
const auto ql = (const uint16_t *)bxi->ql;
|
||||
uint32_t val = ql[4*ib32+j] + 4096;
|
||||
int2 v = {0, 0};
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
val *= ka;
|
||||
v.x |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
|
||||
}
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
val *= ka;
|
||||
v.y |= (ggml_cuda_dp4a(val & km, 0x01010101, -126) & 0xff) << 8*k;
|
||||
}
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*ib32 + 2*j + 0] = v.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*ib32 + 2*j + 1] = v.y;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*ib32 + 2*j + 0] = v.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*ib32 + 2*j + 1] = v.y;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
|
||||
int i = i0 + threadIdx.y * 4 + threadIdx.x / (WARP_SIZE/4);
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const float * dptr = (const float *)(x + i*stride);
|
||||
const float d = dptr[0] * 1.05f;
|
||||
const block_iq2_kt * bxi = (const block_iq2_kt *)(dptr + 1) + kbx0;
|
||||
int ib32 = threadIdx.x % 8;
|
||||
const int ls = iq4k_values[(bxi->scales[ib32%4] >> 4*(ib32/4)) & 0xf];
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + threadIdx.x % 8] = d * ls;
|
||||
#else
|
||||
x_df[i*(WARP_SIZE/4) + i/4 + threadIdx.x % 8] = d * ls;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ2_KT> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq2_kt<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ2_KT);
|
||||
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
164
ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_k_id.cu
Normal file
@@ -0,0 +1,164 @@
|
||||
#include "../mmq_id_common.cuh"
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_k(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
constexpr int qstep = 8;
|
||||
const int kqsx = threadIdx.x % qstep;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/qstep) {
|
||||
int i = i0 + threadIdx.y*(WARP_SIZE/qstep) + threadIdx.x/qstep;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_iq3_k * bxi = (const block_iq3_k *)(x + i*stride) + kbx0;
|
||||
|
||||
const float d = bxi->d;
|
||||
|
||||
uint16_t extra = bxi->extra >> (kqsx/4);
|
||||
uint32_t extra32[2] = { uint32_t(extra & 0xff) * 0x01010101, uint32_t(extra >> 8) * 0x01010101 };
|
||||
int qh = get_int_b2(bxi->qh, kqsx);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < qstep/4; ++l) {
|
||||
|
||||
//extra << 3, extra << 1, extra >> 1, extra >> 3
|
||||
const int ql = get_int_b2(bxi->qs, kqsx + qstep*l);
|
||||
uint32_t val1 = ((ql >> 0) & 0x33333333) | ((extra32[l] << 3) & 0x88888888)
|
||||
| ((qh << 2) & 0x04040404) | ((qh << 4) & 0x40404040);
|
||||
uint32_t val2 = ((ql >> 2) & 0x33333333) | ((extra32[l] << 1) & 0x88888888)
|
||||
| ((qh << 1) & 0x04040404) | ((qh << 3) & 0x40404040);
|
||||
int2 v1 = get_int_from_table_16(val1, iq3nl_values);
|
||||
int2 v2 = get_int_from_table_16(val2, iq3nl_values);
|
||||
|
||||
qh >>= 4;
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 32*l + 0] = v1.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 32*l + 8] = v2.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 32*l + 16] = v1.y;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 32*l + 24] = v2.y;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 0] = v1.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 8] = v2.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 16] = v1.y;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 24] = v2.y;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
uint16_t sh = bxi->scales_h >> 2*kqsx;
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = d * ((2*(bxi->scales_l[kqsx] & 0xf) + 1) * (sh & 1 ? -1 : 1));
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = d * ((2*(bxi->scales_l[kqsx] >> 4) + 1) * (sh & 2 ? -1 : 1));
|
||||
#else
|
||||
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = d * ((2*(bxi->scales_l[kqsx] & 0xf) + 1) * (sh & 1 ? -1 : 1));
|
||||
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = d * ((2*(bxi->scales_l[kqsx] >> 4) + 1) * (sh & 2 ? -1 : 1));
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_k_r4(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
const int kqsx = threadIdx.x/4; // 0...7 -> block of 32
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += 4*nwarps) {
|
||||
int i = i0 + 4*threadIdx.y + threadIdx.x%4;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
int i4 = i/4;
|
||||
int ir = i%4;
|
||||
|
||||
const block_iq3_k_r4 * bxi = (const block_iq3_k_r4 *)(x + 4*i4*stride) + kbx0;
|
||||
|
||||
const float d = __half2float(bxi->d[ir]);
|
||||
|
||||
int qh = get_int_b4(bxi->qh, 4*kqsx+ir);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
|
||||
//auto values_l = iq3k_table + (((bxi->extra[ir+4*l] >> kqsx) & 1) << 6);
|
||||
uint32_t extra32 = uint32_t((bxi->extra[ir+4*l] >> kqsx) & 1) * 0x88888888;
|
||||
|
||||
const int ql = get_int_b4(bxi->qs, 8*kqsx + ir + 4*l);
|
||||
uint32_t val1 = ((ql >> 0) & 0x33333333) | extra32 | ((qh << 2) & 0x04040404) | ((qh << 4) & 0x40404040);
|
||||
uint32_t val2 = ((ql >> 2) & 0x33333333) | extra32 | ((qh << 1) & 0x04040404) | ((qh << 3) & 0x40404040);
|
||||
int2 v1 = get_int_from_table_16(val1, iq3nl_values);
|
||||
int2 v2 = get_int_from_table_16(val2, iq3nl_values);
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 0] = v1.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 1] = v2.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 2] = v1.y;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 3] = v2.y;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 0] = v1.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 1] = v2.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 2] = v1.y;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 3] = v2.y;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
qh >>= 4;
|
||||
}
|
||||
|
||||
int is = 8*kqsx + ir;
|
||||
float dl1 = d * (2*((bxi->scales_l[is%32] >> 4*(is/32)) & 0xf) + 1) * ((bxi->scales_h[is%8] >> (is/8)) & 1 ? -1 : 1);
|
||||
is += 4;
|
||||
float dl2 = d * (2*((bxi->scales_l[is%32] >> 4*(is/32)) & 0xf) + 1) * ((bxi->scales_h[is%8] >> (is/8)) & 1 ? -1 : 1);
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = dl1;
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = dl2;
|
||||
#else
|
||||
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = dl1;
|
||||
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = dl2;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ3_K> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq3_k<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ3_K_R4> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq3_k_r4<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ3_K);
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ3_K_R4);
|
||||
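Note on the IQ3_K scales above: the 4-bit magnitude is mapped to the odd values 1, 3, ..., 31 via 2*m + 1, and a separate bit from scales_h carries the sign, so a block scale can be reconstructed as in this small sketch (illustrative helper name):

// Illustrative reconstruction of one iq3_k block scale, following the x_df
// expressions above.
#include <cstdint>

static float iq3_k_block_scale(float d, uint8_t scale_l_nibble, bool sign_bit) {
    const int mag = 2*(scale_l_nibble & 0xf) + 1;   // odd magnitudes 1, 3, ..., 31
    return sign_bit ? -d*mag : d*mag;
}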
@@ -0,0 +1,79 @@
|
||||
#include "../mmq_id_common.cuh"
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_ks(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
constexpr int qstep = 8;
|
||||
const int kqsx = threadIdx.x % qstep;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/qstep) {
|
||||
int i = i0 + threadIdx.y*(WARP_SIZE/qstep) + threadIdx.x/qstep;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const half * dptr = (const half *)(x + i*stride);
|
||||
const float d = __half2float(dptr[0]);
|
||||
const block_iq3_ks * bxi = (const block_iq3_ks *)(dptr + 1) + kbx0;
|
||||
|
||||
//uint16_t extra = bxi->extra >> 8;
|
||||
int qh = get_int_b2(bxi->qh, kqsx);
|
||||
|
||||
uint32_t extra32 = uint32_t(bxi->extra >> 8) * 0x01010101;
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < qstep/4; ++l) {
|
||||
|
||||
const int ql = get_int_b2(bxi->qs, kqsx + qstep*l);
|
||||
uint32_t val1 = ((ql >> 0) & 0x33333333) | ((qh << 2) & 0x04040404) | ((extra32 << 3) & 0x08080808)
|
||||
| ((qh << 4) & 0x40404040) | ((extra32 << 5) & 0x80808080);
|
||||
uint32_t val2 = ((ql >> 2) & 0x33333333) | ((qh << 1) & 0x04040404) | ((extra32 << 2) & 0x08080808)
|
||||
| ((qh << 3) & 0x40404040) | ((extra32 << 4) & 0x80808080);
|
||||
int2 v1 = get_int_from_table_16(val1, iq3nl_values);
|
||||
int2 v2 = get_int_from_table_16(val2, iq3nl_values);
|
||||
|
||||
extra32 >>= 4;
|
||||
qh >>= 4;
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx + 32*l + 0] = v1.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx + 32*l + 8] = v2.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx + 32*l + 16] = v1.y;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx + 32*l + 24] = v2.y;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 0] = v1.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 8] = v2.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 16] = v1.y;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 32*l + 24] = v2.y;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = d * (int(((bxi->scales[kqsx%4] >> 4*(kqsx/4)) & 0xf) | (((bxi->extra >> kqsx) & 1) << 4)) - 16);
|
||||
#else
|
||||
x_df[i*(WARP_SIZE/4) + i/4 + kqsx] = d * (int(((bxi->scales[kqsx%4] >> 4*(kqsx/4)) & 0xf) | (((bxi->extra >> kqsx) & 1) << 4)) - 16);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ3_KS> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq3_ks<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ3_KS);
|
||||
@@ -0,0 +1,91 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmq_id_common.cuh"
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_kt(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
constexpr uint32_t ka = 0xCBAC1FED;
|
||||
constexpr uint32_t km = 0x3f3f3f3f;
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
const int kqsx = threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
||||
int i = i0 + threadIdx.y;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_iq3_kt * bxi = (const block_iq3_kt *)(x + i*stride + sizeof(float)) + kbx0;
|
||||
|
||||
int ib32 = kqsx/4;
|
||||
int j = kqsx%4;
|
||||
const auto ql = (const uint16_t *)bxi->ql;
|
||||
const auto qh = (const uint32_t *)bxi->qh;
|
||||
uint32_t mask = 0x01010101 << ib32;
|
||||
uint32_t val = ql[4*ib32+j] + 4096;
|
||||
int2 v = {0, 0};
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
val *= ka;
|
||||
v.x |= std::abs(ggml_cuda_dp4a(val & km, 0x01010101, -126)) << 8*k;
|
||||
}
|
||||
auto signs = __vcmpne4(qh[2*j+0] & mask, 0);
|
||||
v.x = __vsub4(v.x ^ signs, signs);
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
val *= ka;
|
||||
v.y |= std::abs(ggml_cuda_dp4a(val & km, 0x01010101, -126)) << 8*k;
|
||||
}
|
||||
signs = __vcmpne4(qh[2*j+1] & mask, 0);
|
||||
v.y = __vsub4(v.y ^ signs, signs);
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*ib32 + 2*j + 0] = v.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*ib32 + 2*j + 1] = v.y;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*ib32 + 2*j + 0] = v.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*ib32 + 2*j + 1] = v.y;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
|
||||
int i = i0 + threadIdx.y * 4 + threadIdx.x / (WARP_SIZE/4);
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const float * dptr = (const float *)(x + i*stride);
|
||||
const float d = dptr[0] * 1.01f;
|
||||
const block_iq3_kt * bxi = (const block_iq3_kt *)(dptr + 1) + kbx0;
|
||||
int ib32 = threadIdx.x % 8;
|
||||
const int ls = (bxi->scales[ib32%4] >> 4*(ib32/4)) & 0xf;
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + threadIdx.x % 8] = d * ls;
|
||||
#else
|
||||
x_df[i*(WARP_SIZE/4) + i/4 + threadIdx.x % 8] = d * ls;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ3_KT> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq3_kt<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ3_KT);
|
||||
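Note on the IQ3_KT sign handling above: the trellis produces magnitudes (note the std::abs around the dp4a), and the sign of every byte comes from qh; __vcmpne4 turns the selected bits into a per-byte 0x00/0xFF mask and __vsub4(v ^ signs, signs) negates exactly the masked bytes. A portable emulation of that step:

// Portable emulation of the packed sign application used above:
// (v ^ 0xFF) - 0xFF per byte is the two's-complement negation of that byte.
#include <cstdint>

static uint32_t apply_packed_signs(uint32_t v, uint32_t qh, uint32_t mask) {
    uint32_t out = 0;
    for (int k = 0; k < 4; ++k) {
        int8_t  b   = (int8_t)((v >> 8*k) & 0xff);
        uint8_t bit = ((qh & mask) >> 8*k) & 0xff;   // non-zero -> negate this byte
        if (bit != 0) b = (int8_t)(-b);
        out |= (uint32_t)(uint8_t)b << 8*k;
    }
    return out;
}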
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
162
ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_k_id.cu
Normal file
@@ -0,0 +1,162 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmq_id_common.cuh"
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_k(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
constexpr int qstep = 8;
|
||||
const int kqsx = threadIdx.x % qstep;
|
||||
|
||||
uint32_t aux32[2];
|
||||
const uint8_t * aux8 = (const uint8_t *)aux32;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/qstep) {
|
||||
int i = i0 + threadIdx.y*(WARP_SIZE/qstep) + threadIdx.x/qstep;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_iq4_k * bxi = (const block_iq4_k *)(x + i*stride) + kbx0;
|
||||
const uint16_t extra = bxi->extra >> 2*kqsx;
|
||||
|
||||
auto values_l = iq4k_table + ((extra & 1) << 8);
|
||||
auto values_h = iq4k_table + ((extra & 2) << 7);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < qstep/2; ++l) {
|
||||
|
||||
const int q4 = get_int_b4(bxi->qs, (qstep/2)*kqsx + l);
|
||||
|
||||
aux32[0] = (q4 >> 0) & 0x0f0f0f0f;
|
||||
aux32[1] = (q4 >> 4) & 0x0f0f0f0f;
|
||||
|
||||
int val0 = int_from_table_x(aux8+0, values_l);
|
||||
int val1 = int_from_table_x(aux8+4, values_h);
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + l + 0] = val0;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + l + 4] = val1;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + l + 0] = val0;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + l + 4] = val1;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
const uint8_t sh = bxi->scales_h[kqsx/2] >> 4*(kqsx%2);
|
||||
const int ls1 = ((bxi->scales_l[kqsx] & 0xf) | ((sh << 4) & 0x30)) - 32;
|
||||
const int ls2 = ((bxi->scales_l[kqsx] >> 4) | ((sh << 2) & 0x30)) - 32;
|
||||
|
||||
const float d = bxi->d;
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = d * ls1;
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = d * ls2;
|
||||
#else
|
||||
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = d * ls1;
|
||||
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = d * ls2;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_k_r4(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
const int kqsx = threadIdx.x/4; // 0...7 -> block of 32
|
||||
|
||||
uint32_t aux32[4];
|
||||
const uint8_t * aux8 = (const uint8_t *)aux32;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += 4*nwarps) {
|
||||
int i = i0 + 4*threadIdx.y + threadIdx.x%4;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
int i4 = i/4;
|
||||
int ir = i%4;
|
||||
|
||||
const block_iq4_k_r4 * bxi = (const block_iq4_k_r4 *)(x + 4*i4*stride) + kbx0;
|
||||
|
||||
const float d = __half2float(bxi->d[ir]);
|
||||
|
||||
#pragma unroll
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
|
||||
auto values_l = iq4k_table + ((bxi->extra[ir+4*l] << (8 - kqsx)) & 0x100);
|
||||
|
||||
const int ql1 = get_int_b4(bxi->qs, 16*kqsx + ir + 4*l + 0);
|
||||
const int ql2 = get_int_b4(bxi->qs, 16*kqsx + ir + 4*l + 8);
|
||||
aux32[0] = (ql1 >> 0) & 0x0f0f0f0f;
|
||||
aux32[1] = (ql1 >> 4) & 0x0f0f0f0f;
|
||||
aux32[2] = (ql2 >> 0) & 0x0f0f0f0f;
|
||||
aux32[3] = (ql2 >> 4) & 0x0f0f0f0f;
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 0] = int_from_table_x(aux8+ 0, values_l);
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 2] = int_from_table_x(aux8+ 4, values_l);
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 1] = int_from_table_x(aux8+ 8, values_l);
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 3] = int_from_table_x(aux8+12, values_l);
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 0] = int_from_table_x(aux8+ 0, values_l);
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 2] = int_from_table_x(aux8+ 4, values_l);
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 1] = int_from_table_x(aux8+ 8, values_l);
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 3] = int_from_table_x(aux8+12, values_l);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
}
|
||||
|
||||
int is = 8*kqsx + ir;
|
||||
float dl1 = d * ((((bxi->scales_l[is%32] >> 4*(is/32)) & 0xf) | (((bxi->scales_h[is%16] >> 2*(is/16)) & 3) << 4)) - 32);
|
||||
is += 4;
|
||||
float dl2 = d * ((((bxi->scales_l[is%32] >> 4*(is/32)) & 0xf) | (((bxi->scales_h[is%16] >> 2*(is/16)) & 3) << 4)) - 32);
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = dl1;
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = dl2;
|
||||
#else
|
||||
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = dl1;
|
||||
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = dl2;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_K> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq4_k<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_K_R4> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq4_k_r4<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ4_K);
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ4_K_R4);
|
||||
187
ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_ks_id.cu
Normal file
@@ -0,0 +1,187 @@
|
||||
#include "../mmq_id_common.cuh"
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_kss(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
const int kqsx = threadIdx.x / 4;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += 4*nwarps) {
|
||||
int i = i0 + 4*threadIdx.y + threadIdx.x%4;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const float * dptr = (const float *)(x + i*stride);
|
||||
const block_iq4_kss * bxi = (const block_iq4_kss *)(dptr + 1) + kbx0;
|
||||
const uint32_t * q4 = bxi->qs + 4*kqsx;
|
||||
uint32_t s32 = (q4[0] & 0x00010001) | ((q4[1] & 0x00010001) << 2) | ((q4[2] & 0x00010001) << 4) | ((q4[3] & 0x00010001) << 6);
|
||||
uint8_t ls = (s32 | (s32 >> 15)) & 0xff;
|
||||
|
||||
auto values = iq4k_values + ((ls & 1) << 4);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
uint32_t val = q4[j] & 0xfffefffe;
|
||||
val = val ^ (val >> 1);
|
||||
auto v = get_int_from_table_16(val, values);
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + j + 0] = v.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + j + 4] = v.y;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + j + 0] = v.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + j + 4] = v.y;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = dptr[0] * ((ls & 254) - 127);
|
||||
#else
|
||||
x_df[i*(WARP_SIZE/4) + i/4 + kqsx] = dptr[0] * ((ls & 254) - 127);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
}
|
||||
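Note on the IQ4_KSS layout handled above: bit 0 of every 16-bit half of the four q4 words carries one bit of the packed scale byte, gathered into s32 and folded with s32 >> 15; the remaining bits per half are de-scrambled with val ^ (val >> 1) before the nibble lookup. A small restatement of the scale-byte recovery (illustrative helper name):

// Illustrative recovery of the iq4_kss per-block scale byte from the four
// quant words, mirroring the s32 / ls expressions above.
#include <cstdint>

static uint8_t iq4_kss_scale_byte(const uint32_t q4[4]) {
    const uint32_t s32 = (q4[0] & 0x00010001)
                       | ((q4[1] & 0x00010001) << 2)
                       | ((q4[2] & 0x00010001) << 4)
                       | ((q4[3] & 0x00010001) << 6);
    return (uint8_t)((s32 | (s32 >> 15)) & 0xff);   // low-half bits -> even, high-half bits -> odd
}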
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_ks(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
const int kqsx = threadIdx.x / 4;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += 4*nwarps) {
|
||||
int i = i0 + 4*threadIdx.y + threadIdx.x%4;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const float * dptr = (const float *)(x + i*stride);
|
||||
const block_iq4_ks * bxi = (const block_iq4_ks *)(dptr + 1) + kbx0;
|
||||
const int ls = (bxi->scales[kqsx] & 254) - 127;
|
||||
|
||||
auto values = iq4k_values + ((bxi->scales[kqsx] & 1) << 4);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
const int q4 = get_int_b4(bxi->qs, 4*kqsx+j);
|
||||
const int2 v = get_int_from_table_16(q4, values);
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + j + 0] = v.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + j + 4] = v.y;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + j + 0] = v.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + j + 4] = v.y;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = dptr[0] * ls;
|
||||
#else
|
||||
x_df[i*(WARP_SIZE/4) + i/4 + kqsx] = dptr[0] * ls;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_ks_r4(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_KS_R4, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
const int kqsx = threadIdx.x/4;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += 4*nwarps) {
|
||||
int i = i0 + 4*threadIdx.y + threadIdx.x%4;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
int i4 = i/4;
|
||||
int ir = i%4;
|
||||
|
||||
const float * dptr = (const float *)(x + 4*i4*stride);
|
||||
const block_iq4_ks_r4 * bxi = (const block_iq4_ks_r4 *)(dptr + 4) + kbx0;
|
||||
|
||||
const int ls = (bxi->scales[4*kqsx + ir] & 254) - 127;
|
||||
auto values = iq4k_values + ((bxi->scales[4*kqsx+ir] & 1) << 4);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
const int q4 = get_int_b4(bxi->qs, 16*kqsx+4*j+ir);
|
||||
const int2 v = get_int_from_table_16(q4, values);
|
||||
const int k0 = 8*kqsx + 4*(j%2) + j/2;
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 2] = v.y;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + k0 + 0] = v.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + k0 + 2] = v.y;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = dptr[ir] * ls;
|
||||
#else
|
||||
x_df[i*(WARP_SIZE/4) + i/4 + kqsx] = dptr[ir] * ls;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_KSS> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq4_kss<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_KS> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq4_ks<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_KS_R4> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq4_ks_r4<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ4_KSS);
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ4_KS);
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ4_KS_R4);
|
||||
|
||||
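Note on the IQ4_KS/IQ4_KS_R4 scale bytes above: bit 0 selects which 16-entry half of iq4k_values the block uses, and the remaining bits give the block scale offset by 127. Restated as a tiny helper (names are illustrative):

// Illustrative decode of one iq4_ks scale byte.
#include <cstdint>

struct iq4_ks_scale { int table_offset; int ls; };

static iq4_ks_scale decode_iq4_ks_scale(uint8_t s) {
    iq4_ks_scale r;
    r.table_offset = (s & 1) << 4;     // 0 or 16: which half of the value table
    r.ls           = (s & 254) - 127;  // signed block scale
    return r;
}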
@@ -0,0 +1,86 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmq_id_common.cuh"
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_kt(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
||||
constexpr uint32_t ka = 0xCBAC1FED;
|
||||
constexpr uint32_t km = 0x3f3f3f3f;
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + WARP_SIZE*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ4_XS, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
|
||||
const int kqsx = threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
||||
int i = i0 + threadIdx.y;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_iq4_kt * bxi = (const block_iq4_kt *)(x + i*stride + sizeof(float)) + kbx0;
|
||||
|
||||
int ib32 = kqsx/4;
|
||||
int j = kqsx%4;
|
||||
const auto shb = bxi->qs;
|
||||
const auto ql = (const uint8_t *)(shb + 8);
|
||||
const auto qh = ql + 64;
|
||||
const uint32_t sh = shb[ib32] >> (8 + 6*j);
|
||||
uint32_t offset = 4096 + ((shb[ib32] & 1) << 15);
|
||||
uint32_t val1 = offset + ql[8*ib32+2*j+0] + ((qh[8*(ib32%4)+2*j+0] << (8 - 4*(ib32/4))) & 0xf00) + ((sh & 7) << 12);
|
||||
uint32_t val2 = offset + ql[8*ib32+2*j+1] + ((qh[8*(ib32%4)+2*j+1] << (8 - 4*(ib32/4))) & 0xf00) + ((sh & 56) << 9);
|
||||
int2 v = {0, 0};
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
val1 *= ka;
|
||||
val2 *= ka;
|
||||
v.x |= (ggml_cuda_dp4a(val1 & km, 0x01010101, -126) & 0xff) << 8*k;
|
||||
v.y |= (ggml_cuda_dp4a(val2 & km, 0x01010101, -126) & 0xff) << 8*k;
|
||||
}
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*ib32 + 2*j + 0] = v.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*ib32 + 2*j + 1] = v.y;
|
||||
#else
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*ib32 + 2*j + 0] = v.x;
|
||||
x_qs[i*(2*WARP_SIZE + 1) + 8*ib32 + 2*j + 1] = v.y;
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
|
||||
int i = i0 + threadIdx.y * 4 + threadIdx.x / (WARP_SIZE/4);
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const float * dptr = (const float *)(x + i*stride);
|
||||
const block_iq4_kt * bxi = (const block_iq4_kt *)(dptr + 1) + kbx0;
|
||||
const int ls = (bxi->qs[threadIdx.x % 8] & 0xff) >> 1;
|
||||
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + threadIdx.x % 8] = dptr[0] * (ls - 64);
|
||||
#else
|
||||
x_df[i*(WARP_SIZE/4) + i/4 + threadIdx.x % 8] = dptr[0] * (ls - 64);
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_KT> {
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq4_kt<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_IQ4_KT);
|
||||
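Note on the IQ4_KT seed assembly above: each 8-value group gets two trellis seeds built from an 8-bit ql part, a 4-bit qh part and 3 shift bits from shb, plus an offset of 4096 (or 4096 + 32768 when bit 0 of shb is set); the seeds are then expanded with the same val *= 0xCBAC1FED recurrence sketched after the IQ1_KT loader. A restatement of the val1/val2 expressions (illustrative helper name):

// Illustrative assembly of the two iq4_kt trellis seeds for one group.
#include <cstdint>

static void iq4_kt_seeds(const uint32_t * shb, const uint8_t * ql, const uint8_t * qh,
                         int ib32, int j, uint32_t & val1, uint32_t & val2) {
    const uint32_t sh     = shb[ib32] >> (8 + 6*j);
    const uint32_t offset = 4096 + ((shb[ib32] & 1) << 15);
    val1 = offset + ql[8*ib32 + 2*j + 0] + ((qh[8*(ib32%4) + 2*j + 0] << (8 - 4*(ib32/4))) & 0xf00) + ((sh & 7)  << 12);
    val2 = offset + ql[8*ib32 + 2*j + 1] + ((qh[8*(ib32%4) + 2*j + 1] << (8 - 4*(ib32/4))) & 0xf00) + ((sh & 56) << 9);
}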
@@ -0,0 +1,4 @@
#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
DECL_MMQ_CASE(GGML_TYPE_MXFP4);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
174
ggml/src/ggml-cuda/template-instances/mmq-instance-iq5_k_id.cu
Normal file
@@ -0,0 +1,174 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq5_k(
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

constexpr int nwarps = mmq_get_nwarps_device();

#ifdef INT8_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + txs.qs);
#endif // INT8_MMA_AVAILABLE

constexpr int qstep = 8;
const int kqsx = threadIdx.x % qstep;

auto values = iq5nl_values;

uint32_t aux32[2];
const uint8_t * aux8 = (const uint8_t *)aux32;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/qstep) {
int i = i0 + threadIdx.y*(WARP_SIZE/qstep) + threadIdx.x/qstep;

if (need_check) {
i = min(i, i_max);
}

const block_iq5_k * bxi = (const block_iq5_k *)(x + i*stride) + kbx0;

int qh = get_int_b4(bxi->qh, kqsx);
uint16_t extra = bxi->extra >> (kqsx/4);

#pragma unroll
for (int l = 0; l < qstep/2; ++l) {

const int ql = get_int_b4(bxi->qs, kqsx + qstep*l);
aux32[0] = ((ql >> 0) & 0x0f0f0f0f) | ((qh & 0x01010101) << 4) | ((extra & 1) * 0x20202020); // this is very slightly faster
aux32[1] = ((ql >> 4) & 0x0f0f0f0f) | ((qh & 0x02020202) << 3) | ((extra & 4) * 0x08080808); // than the version below
//aux32[0] = ((ql >> 0) & 0x0f0f0f0f) | ((qh & 0x01010101) << 4) | ((extra & 1) ? 0x20202020 : 0);
//aux32[1] = ((ql >> 4) & 0x0f0f0f0f) | ((qh & 0x02020202) << 3) | ((extra & 4) ? 0x20202020 : 0);
qh >>= 2;
extra >>= 4;

const char4 val0 = make_char4(values[aux8[0]], values[aux8[1]], values[aux8[2]], values[aux8[3]]);
const char4 val1 = make_char4(values[aux8[4]], values[aux8[5]], values[aux8[6]], values[aux8[7]]);

#ifdef INT8_MMA_AVAILABLE
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 16*l + 0] = *(const int *)&val0;
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 16*l + 8] = *(const int *)&val1;
#else
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 16*l + 0] = *(const int *)&val0;
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 16*l + 8] = *(const int *)&val1;
#endif // INT8_MMA_AVAILABLE
}

const uint8_t sh = bxi->scales_h[kqsx/2] >> 4*(kqsx%2);
const int ls1 = ((bxi->scales_l[kqsx] & 0xf) | ((sh << 4) & 0x30)) - 32;
const int ls2 = ((bxi->scales_l[kqsx] >> 4) | ((sh << 2) & 0x30)) - 32;

const float d = bxi->d;

#ifdef INT8_MMA_AVAILABLE
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = d * ls1;
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = d * ls2;
#else
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = d * ls1;
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = d * ls2;
#endif // INT8_MMA_AVAILABLE
}
}

template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq5_k_r4(
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

constexpr int nwarps = mmq_get_nwarps_device();

#ifdef INT8_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + txs.qs);
#endif // INT8_MMA_AVAILABLE

const int kqsx = threadIdx.x/4; // 0...7 -> block of 32

uint32_t aux32[4];
const uint8_t * aux8 = (const uint8_t *)aux32;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += 4*nwarps) {
int i = i0 + 4*threadIdx.y + threadIdx.x%4;

if (need_check) {
i = min(i, i_max);
}
int i4 = i/4;
int ir = i%4;

const block_iq5_k_r4 * bxi = (const block_iq5_k_r4 *)(x + 4*i4*stride) + kbx0;

const float d = __half2float(bxi->d[ir]);

int qh = get_int_b4(bxi->qh, 4*kqsx + ir);

#pragma unroll
for (int l = 0; l < 2; ++l) {

auto values_l = iq5nl_values + (((bxi->extra[ir+4*l] >> kqsx) & 1) << 5);

const int ql1 = get_int_b4(bxi->qs, 16*kqsx + ir + 4*l + 0);
const int ql2 = get_int_b4(bxi->qs, 16*kqsx + ir + 4*l + 8);
aux32[0] = ((ql1 >> 0) & 0x0f0f0f0f) | ((qh << 4) & 0x10101010);
aux32[1] = ((ql1 >> 4) & 0x0f0f0f0f) | ((qh << 3) & 0x10101010);
aux32[2] = ((ql2 >> 0) & 0x0f0f0f0f) | ((qh >> 0) & 0x10101010);
aux32[3] = ((ql2 >> 4) & 0x0f0f0f0f) | ((qh >> 1) & 0x10101010);

const char4 val0 = make_char4(values_l[aux8[ 0]], values_l[aux8[ 1]], values_l[aux8[ 2]], values_l[aux8[ 3]]);
const char4 val1 = make_char4(values_l[aux8[ 4]], values_l[aux8[ 5]], values_l[aux8[ 6]], values_l[aux8[ 7]]);
const char4 val2 = make_char4(values_l[aux8[ 8]], values_l[aux8[ 9]], values_l[aux8[10]], values_l[aux8[11]]);
const char4 val3 = make_char4(values_l[aux8[12]], values_l[aux8[13]], values_l[aux8[14]], values_l[aux8[15]]);

#ifdef INT8_MMA_AVAILABLE
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 0] = *(const int *)&val0;
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 2] = *(const int *)&val1;
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 1] = *(const int *)&val2;
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + 4*l + 3] = *(const int *)&val3;
#else
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 0] = *(const int *)&val0;
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 2] = *(const int *)&val1;
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 1] = *(const int *)&val2;
x_qs[i*(2*WARP_SIZE + 1) + 8*kqsx + 4*l + 3] = *(const int *)&val3;
#endif // INT8_MMA_AVAILABLE

qh >>= 2;
}

int is = 8*kqsx + ir;
float dl1 = d * ((((bxi->scales_l[is%32] >> 4*(is/32)) & 0xf) | (((bxi->scales_h[is%16] >> 2*(is/16)) & 3) << 4)) - 32);
is += 4;
float dl2 = d * ((((bxi->scales_l[is%32] >> 4*(is/32)) & 0xf) | (((bxi->scales_h[is%16] >> 2*(is/16)) & 3) << 4)) - 32);

#ifdef INT8_MMA_AVAILABLE
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = dl1;
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = dl2;
#else
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = dl1;
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = dl2;
#endif // INT8_MMA_AVAILABLE
}
}

template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ5_K> {
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq5_k<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
};

template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ5_K_R4> {
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq5_k_r4<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
};

DECL_MMQ_CASE(GGML_TYPE_IQ5_K);
DECL_MMQ_CASE(GGML_TYPE_IQ5_K_R4);
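Reading aid (not part of the diff): both iq5_k loaders above assemble a 6-bit lookup index per weight from a low nibble in qs, one high bit from qh, and a per-block-of-32 "extra" bit that selects the upper 32-entry half of the non-linear iq5nl_values table. A minimal host-side C++ sketch of that index assembly follows; the value table is a zero-filled placeholder here, since its real contents are defined elsewhere and are not part of this diff.

#include <cstdint>
#include <cstdio>

// Stand-in for the 64-entry iq5nl_values table (two 32-entry halves,
// selected by the per-block "extra" bit); contents not shown in this diff.
static int8_t iq5nl_values_placeholder[64];

// Rebuild the four 6-bit lookup indices packed into one 32-bit qs word,
// mirroring: (ql & 0x0f) | ((qh & 1) << 4) | (extra ? 0x20 : 0)
static void iq5_k_indices(uint32_t ql, uint32_t qh, int extra_bit, uint8_t idx[4]) {
    uint32_t packed = (ql & 0x0f0f0f0f)
                    | ((qh & 0x01010101) << 4)
                    | ((uint32_t)(extra_bit & 1) * 0x20202020);
    for (int b = 0; b < 4; ++b) idx[b] = (packed >> 8*b) & 0x3f;
}

int main() {
    uint8_t idx[4];
    iq5_k_indices(0x31427586u, 0x01000100u, 1, idx); // made-up example inputs
    for (int b = 0; b < 4; ++b)
        printf("weight %d -> table index %d -> value %d\n",
               b, (int)idx[b], (int)iq5nl_values_placeholder[idx[b]]);
}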
146 ggml/src/ggml-cuda/template-instances/mmq-instance-iq5_ks_id.cu Normal file
@@ -0,0 +1,146 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq5_ks(
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

constexpr int nwarps = mmq_get_nwarps_device();

#ifdef INT8_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ5_KS, mmq_y);
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + txs.qs);
#endif // INT8_MMA_AVAILABLE

constexpr int qstep = 8;
const int kqsx = threadIdx.x % qstep;

auto values = iq5nl_values;

uint32_t aux32[2];
const uint8_t * aux8 = (const uint8_t *)aux32;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/qstep) {
int i = i0 + threadIdx.y*(WARP_SIZE/qstep) + threadIdx.x/qstep;

if (need_check) {
i = min(i, i_max);
}

const float * dptr = (const float *)(x + i*stride);
const float d = dptr[0];
const block_iq5_ks * bxi = (const block_iq5_ks *)(dptr + 1) + kbx0;

int qh = get_int_b4(bxi->qh, kqsx);

#pragma unroll
for (int l = 0; l < qstep/2; ++l) {

const int ql = get_int_b4(bxi->qs, kqsx + qstep*l);
aux32[0] = ((ql >> 0) & 0x0f0f0f0f) | ((qh & 0x01010101) << 4) | ((bxi->scales[2*l+0] & 1) * 0x20202020);
aux32[1] = ((ql >> 4) & 0x0f0f0f0f) | ((qh & 0x02020202) << 3) | ((bxi->scales[2*l+1] & 1) * 0x20202020);
qh >>= 2;

const char4 val0 = make_char4(values[aux8[0]], values[aux8[1]], values[aux8[2]], values[aux8[3]]);
const char4 val1 = make_char4(values[aux8[4]], values[aux8[5]], values[aux8[6]], values[aux8[7]]);

#ifdef INT8_MMA_AVAILABLE
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx + 16*l + 0] = *(const int *)&val0;
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx + 16*l + 8] = *(const int *)&val1;
#else
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 16*l + 0] = *(const int *)&val0;
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 16*l + 8] = *(const int *)&val1;
#endif // INT8_MMA_AVAILABLE
}

#ifdef INT8_MMA_AVAILABLE
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = d * ((bxi->scales[kqsx] & 254) - 127);
#else
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + kqsx] = d * ((bxi->scales[kqsx] & 254) - 127);
#endif // INT8_MMA_AVAILABLE
}
}

template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq5_ks_r4(
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

constexpr int nwarps = mmq_get_nwarps_device();

#ifdef INT8_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ5_KS_R4, mmq_y);
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + txs.qs);
#endif // INT8_MMA_AVAILABLE

const int kqsx = threadIdx.x/4;

uint32_t aux32[2];
const uint8_t * aux8 = (const uint8_t *)aux32;

#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += 4*nwarps) {
int i = i0 + 4*threadIdx.y + threadIdx.x%4;

if (need_check) {
i = min(i, i_max);
}
int i4 = i/4;
int ir = i%4;

const float * dptr = (const float *)(x + 4*i4*stride);
const block_iq5_ks_r4 * bxi = (const block_iq5_ks_r4 *)(dptr + 4) + kbx0;

const int ls = (bxi->scales[4*kqsx + ir] & 254) - 127;
auto values = iq5nl_values + ((bxi->scales[4*kqsx+ir] & 1) << 5);

int qh = *((const int *)bxi->qh + 4*kqsx + ir);
const int * ql = (const int *)bxi->qs + 16*kqsx + ir;
#pragma unroll
for (int j = 0; j < 4; ++j) {
aux32[0] = ((ql[4*j] >> 0) & 0x0f0f0f0f) | ((qh << 4) & 0x10101010);
aux32[1] = ((ql[4*j] >> 4) & 0x0f0f0f0f) | ((qh << 3) & 0x10101010);
qh >>= 2;
const char4 val0 = make_char4(values[aux8[0]], values[aux8[1]], values[aux8[2]], values[aux8[3]]);
const char4 val1 = make_char4(values[aux8[4]], values[aux8[5]], values[aux8[6]], values[aux8[7]]);
const int k0 = 8*kqsx + 4*(j%2) + j/2;
#ifdef INT8_MMA_AVAILABLE
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = *(const int *)&val0;
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 2] = *(const int *)&val1;
#else
x_qs[i*(2*WARP_SIZE + 1) + k0 + 0] = *(const int *)&val0;
x_qs[i*(2*WARP_SIZE + 1) + k0 + 2] = *(const int *)&val1;
#endif // INT8_MMA_AVAILABLE
}
#ifdef INT8_MMA_AVAILABLE
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = dptr[ir] * ls;
#else
x_df[i*(WARP_SIZE/4) + i/4 + kqsx] = dptr[ir] * ls;
#endif // INT8_MMA_AVAILABLE

}

}

template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ5_KS> {
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq5_ks<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
};

template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ5_KS_R4> {
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq5_ks_r4<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
};

DECL_MMQ_CASE(GGML_TYPE_IQ5_KS);
DECL_MMQ_CASE(GGML_TYPE_IQ5_KS_R4);
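Reading aid (not part of the diff): iq5_ks differs from iq5_k mainly in how scales are stored. Each per-block-of-32 scale byte carries the block scale in its upper seven bits and, in bit 0, the selector for which half of iq5nl_values to use; the final scale is that value times the row-level float stored in front of the blocks (dptr[0], or dptr[0..3] for the _r4 layout). A minimal host-side C++ sketch of that byte decoding, under those assumptions:

#include <cstdint>
#include <cstdio>

// How both iq5_ks loaders above split one per-block scale byte.
struct iq5_ks_scale {
    int  block_scale;   // (byte & 254) - 127
    bool upper_table;   // byte & 1 -> use iq5nl_values + 32
};

static iq5_ks_scale decode_scale_byte(uint8_t s) {
    return { (s & 254) - 127, (s & 1) != 0 };
}

int main() {
    uint8_t example = 0xA3; // hypothetical scale byte
    iq5_ks_scale sc = decode_scale_byte(example);
    printf("block scale %d, %s half of the value table\n",
           sc.block_scale, sc.upper_table ? "upper" : "lower");
}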
@@ -0,0 +1,80 @@
#include "../mmq_id_common.cuh"

template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq6_k(
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {

constexpr int nwarps = mmq_get_nwarps_device();

#ifdef INT8_MMA_AVAILABLE
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + WARP_SIZE*2);
#else
constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
int * x_qs = (int *) x_tile;
float * x_df = (float *) (x_qs + txs.qs);
#endif // INT8_MMA_AVAILABLE

constexpr int qstep = 8;
const int kqsx = threadIdx.x % qstep;

auto values = iq6nl_values;
int qh[2];

uint32_t aux32[2];
const uint8_t * aux8 = (const uint8_t *)aux32;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/qstep) {
int i = i0 + threadIdx.y*(WARP_SIZE/qstep) + threadIdx.x/qstep;

if (need_check) {
i = min(i, i_max);
}

const block_iq6_k * bxi = (const block_iq6_k *)(x + i*stride) + kbx0;

const float d = bxi->d;
uint16_t extra = bxi->extra >> (kqsx/4);

qh[0] = get_int_b4(bxi->qh, kqsx+0);
qh[1] = get_int_b4(bxi->qh, kqsx+8);

#pragma unroll
for (int l = 0; l < qstep/2; ++l) {

const int ql = get_int_b4(bxi->qs, kqsx + qstep*l);
aux32[0] = ((ql >> 0) & 0x0f0f0f0f) | ((qh[l/2] & 0x03030303) << 4) | ((extra & 1) * 0x40404040);
aux32[1] = ((ql >> 4) & 0x0f0f0f0f) | ((qh[l/2] & 0x0c0c0c0c) << 2) | ((extra & 4) * 0x10101010);
qh[l/2] >>= 4;
extra >>= 4;

const char4 val0 = make_char4(values[aux8[0]], values[aux8[1]], values[aux8[2]], values[aux8[3]]);
const char4 val1 = make_char4(values[aux8[4]], values[aux8[5]], values[aux8[6]], values[aux8[7]]);

#ifdef INT8_MMA_AVAILABLE
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 16*l + 0] = *(const int *)&val0;
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + kqsx + 16*l + 8] = *(const int *)&val1;
#else
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 16*l + 0] = *(const int *)&val0;
x_qs[i*(2*WARP_SIZE + 1) + kqsx + 16*l + 8] = *(const int *)&val1;
#endif // INT8_MMA_AVAILABLE
}

#ifdef INT8_MMA_AVAILABLE
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+0] = d * bxi->scales[2*kqsx+0];
x_df[i*MMQ_MMA_TILE_X_K_Q3_K + 2*kqsx+1] = d * bxi->scales[2*kqsx+1];
#else
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = d * bxi->scales[2*kqsx+0];
x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = d * bxi->scales[2*kqsx+1];
#endif // INT8_MMA_AVAILABLE
}
}

template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits_id<mmq_x, mmq_y, need_check, GGML_TYPE_IQ6_K> {
static constexpr load_tiles_mmq_t load_tiles = load_tiles_iq6_k<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
};

DECL_MMQ_CASE(GGML_TYPE_IQ6_K);
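Stepping back from the individual quant types: every one of these autogenerated files follows the same scheme of a per-type tile loader plugged into the shared MMQ kernel through mmq_type_traits_id, with DECL_MMQ_CASE forcing one instantiation per file. The following is a much-simplified, hypothetical C++ sketch of that traits plus explicit-instantiation pattern; all names here are made up, and the real kernel wiring lives in mmq_id_common.cuh.

// Simplified illustration only, not the actual ggml code.
#include <cstdio>

enum fake_type { FAKE_TYPE_A, FAKE_TYPE_B };      // stand-ins for GGML_TYPE_*

template <fake_type T> struct fake_type_traits;   // primary template, specialized per type

template <> struct fake_type_traits<FAKE_TYPE_A> {
    static void load_tiles() { std::puts("load tiles for type A"); }
};
template <> struct fake_type_traits<FAKE_TYPE_B> {
    static void load_tiles() { std::puts("load tiles for type B"); }
};

// The shared "kernel": only the tile loader differs per type.
template <fake_type T> void fake_mul_mat() {
    fake_type_traits<T>::load_tiles();
    std::puts("shared vec_dot / MMA path");
}

// Analogue of DECL_MMQ_CASE: force an instantiation per type so each
// generated .cu file contributes exactly one kernel.
#define FAKE_DECL_CASE(T) template void fake_mul_mat<T>();
FAKE_DECL_CASE(FAKE_TYPE_A)
FAKE_DECL_CASE(FAKE_TYPE_B)

int main() { fake_mul_mat<FAKE_TYPE_A>(); }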
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q2_K);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q3_K);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q4_0);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q4_1);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q4_K);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q5_0);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q5_1);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q5_K);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q6_0);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q6_K);
@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq_id_common.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q8_0);
4 ggml/src/ggml-cuda/vendors/cuda.h vendored
@@ -6,6 +6,10 @@
#include <cuda_fp16.h>
#include <cuda_bf16.h>

#if CUDART_VERSION >= 12050
#include <cuda_fp8.h>
#endif // CUDART_VERSION >= 12050
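The only functional change to the vendored cuda.h is this guarded include of <cuda_fp8.h>, presumably backing the new CUDA fp8 header mentioned in the commit message. As orientation only (not code from the diff), here is a small host-side C++ sketch of how an FP8 E4M3 byte decodes to float, assuming the standard OCP/NVIDIA E4M3 layout: 1 sign bit, 4 exponent bits with bias 7, 3 mantissa bits, no infinities, and 0x7f/0xff reserved for NaN.

#include <cstdint>
#include <cstdio>
#include <cmath>

// Decode one FP8 E4M3 byte to float. Reference sketch only.
static float fp8_e4m3_to_float(uint8_t v) {
    const int sign = v >> 7;
    const int exp  = (v >> 3) & 0xf;
    const int mant = v & 0x7;
    float out;
    if (exp == 0) {
        out = std::ldexp((float)mant / 8.0f, -6);              // subnormal: mant/8 * 2^-6
    } else if (exp == 15 && mant == 7) {
        return NAN;                                            // the single NaN encoding
    } else {
        out = std::ldexp(1.0f + (float)mant / 8.0f, exp - 7);  // normal: (1 + mant/8) * 2^(exp-7)
    }
    return sign ? -out : out;
}

int main() {
    // 0x40 -> exp=8, mant=0 -> 2.0; 0x7e -> exp=15, mant=6 -> 448 (the E4M3 maximum)
    printf("%g %g\n", fp8_e4m3_to_float(0x40), fp8_e4m3_to_float(0x7e));
}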
#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH