WIP - not working

This commit is contained in:
Kawrakow
2026-01-11 12:52:37 +02:00
parent 6ba5772b07
commit 77bd2effa4

View File

@@ -2712,22 +2712,24 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
CUDA_CHECK(cudaStreamSynchronize(stream));
ggml_tensor src0_1_row = *src0_1;
ggml_tensor src0_2_row = *src0_2;
ggml_tensor src0_2_row; if (src0_2) src0_2_row = *src0_2;
ggml_tensor src1_row = *src1;
ggml_tensor final_dst;
ggml_tensor final_src;
char * src0_1_original = (char *) src0_1->data;
char * src0_2_original = (char *) src0_2->data;
char * src0_2_original = src0_2 ? (char *) src0_2->data : nullptr;
char * src1_original = (char *) src1->data;
char * dst_original = (char *) dst->data;
src0_1_row.ne[2] = 1;
src0_1_row.ne[3] = 1;
src0_1_row.nb[3] = nb02;
src0_2_row.ne[2] = 1;
src0_2_row.ne[3] = 1;
src0_2_row.nb[3] = nb02;
if (src0_2) {
src0_2_row.ne[2] = 1;
src0_2_row.ne[3] = 1;
src0_2_row.nb[3] = nb02;
}
src1_row.ne[1] = 1;
src1_row.ne[2] = 1;
@@ -2755,7 +2757,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool());
bool use_quantized_src1 = false;
int64_t src1_padded_num_cols = 0, src1_padded_row_size = 0, src1_quantized_size = 0;
if (ggml_is_quantized(src0_1->type) && src0_1->type == src0_2->type && src1->ne[1] == 1 && src1->ne[3] == 1) {
if (ggml_is_quantized(src0_1->type) && (!src0_2 || src0_1->type == src0_2->type) && src1->ne[1] == 1 && src1->ne[3] == 1) {
if (ggml_cuda_should_use_mmq(src0_1->type, ggml_cuda_info().devices[ctx.device].cc, src1->ne[2])) {
src1_padded_num_cols = GGML_PAD(src1->ne[0], MATRIX_ROW_PADDING);
src1_padded_row_size = src1_padded_num_cols/ggml_blck_size(GGML_TYPE_Q8_1)*ggml_type_size(GGML_TYPE_Q8_1);
@@ -2768,8 +2770,14 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
if (!use_quantized_src1) {
src1_contiguous.alloc(sizeof(float)*ggml_nelements(src1));
}
ggml_cuda_pool_alloc<char> dst_up_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
ggml_cuda_pool_alloc<char> dst_gate_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
ggml_cuda_pool_alloc<char> dst_up_contiguous(ctx.pool()), dst_gate_contiguous(ctx.pool());
if (src0_2) {
dst_up_contiguous.alloc(sizeof(float)*ggml_nelements(dst));
dst_gate_contiguous.alloc(sizeof(float)*ggml_nelements(dst));
} else {
dst_up_contiguous.alloc(2*sizeof(float)*ggml_nelements(dst));
dst_gate_contiguous.alloc(sizeof(float)*ggml_nelements(dst));
}
ggml_cuda_pool_alloc<char> final_dst_contiguous(ctx.pool());
if (fuse_down) {
final_dst.data = final_dst_contiguous.alloc(ggml_nelements(next));
@@ -2812,20 +2820,26 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
}
src0_1_row.data = src0_1_original + i02*nb02;
src0_2_row.data = src0_2_original + i02*nb02;
if (src0_2_original) src0_2_row.data = src0_2_original + i02*nb02;
GGML_ASSERT(nb11 == sizeof(float)*ne10);
GGML_ASSERT(nb1 == sizeof(float)*ne0);
auto nb1l = nb1;
if (!src0_2) {
nb1l *= 2;
dst_row.ne[0] *= 2;
}
src1_row.ne[1] = num_src1_rows;
src1_row.nb[1] = use_quantized_src1 ? src1_padded_row_size : nb11;
src1_row.nb[2] = num_src1_rows*src1_row.nb[1];
src1_row.nb[3] = num_src1_rows*src1_row.nb[1];
dst_row.ne[1] = num_src1_rows;
dst_row.nb[1] = nb1;
dst_row.nb[2] = num_src1_rows*nb1;
dst_row.nb[3] = num_src1_rows*nb1;
dst_row.nb[1] = nb1l;
dst_row.nb[2] = num_src1_rows*nb1l;
dst_row.nb[3] = num_src1_rows*nb1l;
dst_row.data = dst_up_contiguous.get();
if (use_quantized_src1) {
@@ -2843,6 +2857,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
CUDA_CHECK(cudaGetLastError());
}
if (src0_2) {
dst_row.data = dst_gate_contiguous.get();
if (use_quantized_src1) {
ggml_cuda_mul_mat_q_id(ctx, &src0_2_row, &src1_row, nullptr, &dst_row, nullptr, src1_quantized.get());
@@ -2858,8 +2873,10 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
(const float *)((const char *)dst->src[5]->data + i02*dst->src[5]->nb[1]), (float *)dst_row.data);
CUDA_CHECK(cudaGetLastError());
}
}
auto unary_op = (ggml_unary_op)dst->op_params[0];
if (src0_2) {
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
ggml_swiglu_oai_cuda_f32((const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
(float *)dst_gate_contiguous.get(), ggml_nelements(&dst_row), dst_row.ne[0], dst_row.ne[0], dst_row.ne[0],
@@ -2869,6 +2886,17 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
(float *)dst_gate_contiguous.get());
}
} else {
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
ggml_swiglu_oai_cuda_f32((const float *)dst_up_contiguous.get() + dst->ne[0], (const float *)dst_up_contiguous.get(),
(float *)dst_gate_contiguous.get(), ggml_nelements(dst), dst->ne[0], dst->ne[0], dst->ne[0],
1.702f, 7.0f, stream);
} else {
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row),
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
(float *)dst_gate_contiguous.get());
}
}
CUDA_CHECK(cudaGetLastError());
if (fuse_down) {