mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 19:31:48 +00:00
Better gemm strategy when nth > nhead
It gives a ~10% PP performance boost for DeepSeek-Lite with 32 threads (with or without MLA). Before this commit, when nth > nhead, heads were processed sequentially, with all nth threads participating in each matrix multiplication. Now we find the gcd of nhead and nth and split the threads into gcd groups of nth/gcd threads each, with each group processing nhead/gcd heads.
This commit is contained in:
@@ -14056,31 +14056,22 @@ static void ggml_compute_forward_mul_mat(
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if GGML_USE_IQK_MULMAT
|
#if GGML_USE_IQK_MULMAT
|
||||||
if (dst->type == GGML_TYPE_F32 && (ne12*ne13)%nth == 0) {
|
if (dst->type == GGML_TYPE_F32) {
|
||||||
|
int gcd = simple_gcd(ne12*ne13, nth);
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||||
if (counter++ % nth == ith) {
|
if ((counter++ % gcd) == (ith%gcd)) {
|
||||||
if (!iqk_mul_mat(ne01, ne11, ne00,
|
if (!iqk_mul_mat(ne01, ne11, ne00,
|
||||||
src0->type, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01, ///ggml_type_size(src0->type),
|
src0->type, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01, ///ggml_type_size(src0->type),
|
||||||
src1->type, (const char *)src1->data + i12*nb12 + i13*nb13, nb11, ///ggml_type_size(src1->type),
|
src1->type, (const char *)src1->data + i12*nb12 + i13*nb13, nb11, ///ggml_type_size(src1->type),
|
||||||
(float *)((char *)dst->data + i12*nb2 + i13*nb3), nb1/ggml_type_size(dst->type),
|
(float *)((char *)dst->data + i12*nb2 + i13*nb3), nb1/ggml_type_size(dst->type),
|
||||||
0, 1)) goto IQK_MulMat_Not_Available1;
|
ith/gcd, nth/gcd)) goto IQK_MulMat_Not_Available1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (dst->type == GGML_TYPE_F32) {
|
|
||||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
|
||||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
|
||||||
if (!iqk_mul_mat(ne01, ne11, ne00,
|
|
||||||
src0->type, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01, ///ggml_type_size(src0->type),
|
|
||||||
src1->type, (const char *)src1->data + i12*nb12 + i13*nb13, nb11, ///ggml_type_size(src1->type),
|
|
||||||
(float *)((char *)dst->data + i12*nb2 + i13*nb3), nb1/ggml_type_size(dst->type),
|
|
||||||
ith, nth)) goto IQK_MulMat_Not_Available1;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
IQK_MulMat_Not_Available1:;
|
IQK_MulMat_Not_Available1:;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user