mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-27 09:53:40 +00:00
Step-3.5-Flash support (#1231)
* WIP * This works but is slow * Turn off the up / gate clamps for now * OK we need the clamping * Fuse the clamp (CUDA) * Fuse the clamp (CPU) * WIP * Be able to use merged q, k, v * Be able to use merged up/gate experts * Fuse the clamp (CUDA mmvq)
This commit is contained in:
@@ -2626,12 +2626,13 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
((ggml_backend_cuda_buffer_context *)next->buffer->context)->device == device_id;
|
||||
|
||||
auto unary_op = (ggml_unary_op)dst->op_params[0];
|
||||
float limit = *(const float *)(dst->op_params + 1);
|
||||
if (src0_2) {
|
||||
ggml_cuda_op_fused_mul_mat_vec_q_id(ctx, src0_1, &local_src1, ids, &local_dst,
|
||||
dst->src[4], dst->src[5],
|
||||
(const char *)src0_1->data, src0_2 ? (const char *)src0_2->data : nullptr,
|
||||
(const float *)src1->data, src1_quantized.get(),
|
||||
(float *)local_dst.data, 0, src0_1->ne[1], 1, src1_padded_col_size, unary_op, stream);
|
||||
(float *)local_dst.data, 0, src0_1->ne[1], 1, src1_padded_col_size, unary_op, limit, stream);
|
||||
} else {
|
||||
auto local_src0_1 = *src0_1;
|
||||
local_src0_1.ne[1] /= 2;
|
||||
@@ -2642,7 +2643,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
nullptr, nullptr,
|
||||
(const char *)local_src0_1.data, (const char *)local_src0_2.data,
|
||||
(const float *)src1->data, src1_quantized.get(),
|
||||
(float *)local_dst.data, 0, local_src0_1.ne[1], 1, src1_padded_col_size, unary_op, stream);
|
||||
(float *)local_dst.data, 0, local_src0_1.ne[1], 1, src1_padded_col_size, unary_op, limit, stream);
|
||||
} else {
|
||||
GGML_ASSERT(!dst->src[5]);
|
||||
auto local_bias_1 = *dst->src[4];
|
||||
@@ -2653,7 +2654,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
&local_bias_1, &local_bias_2,
|
||||
(const char *)local_src0_1.data, (const char *)local_src0_2.data,
|
||||
(const float *)src1->data, src1_quantized.get(),
|
||||
(float *)local_dst.data, 0, local_src0_1.ne[1], 1, src1_padded_col_size, unary_op, stream);
|
||||
(float *)local_dst.data, 0, local_src0_1.ne[1], 1, src1_padded_col_size, unary_op, limit, stream);
|
||||
}
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
@@ -2773,9 +2774,11 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
(float *)dst->data, ggml_nelements(dst), dst_row.ne[0], dst_row.ne[0], dst_row.ne[0],
|
||||
1.702f, 7.0f, stream);
|
||||
} else {
|
||||
float limit = *((const float *)(dst->op_params + 1));
|
||||
//printf("%s: using limit = %g\n", __func__, limit);
|
||||
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row),
|
||||
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
|
||||
(float *)dst->data);
|
||||
(float *)dst->data, limit);
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -2801,8 +2804,10 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
(float *)dst->data, ggml_nelements(dst), dst->ne[0], src0_1->ne[1], src0_1->ne[1],
|
||||
1.702f, 7.0f, stream);
|
||||
} else {
|
||||
float limit = *((const float *)(dst->op_params + 1));
|
||||
//printf("%s: using limit = %g\n", __func__, limit);
|
||||
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst), dst->ne[0],
|
||||
(const float *)dst_up_gate_contiguous.get(), (float *)dst->data);
|
||||
(const float *)dst_up_gate_contiguous.get(), (float *)dst->data, limit);
|
||||
}
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
@@ -2970,6 +2975,8 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
}
|
||||
|
||||
auto unary_op = (ggml_unary_op)dst->op_params[0];
|
||||
float limit = *(const float *)(dst->op_params + 1);
|
||||
//printf("%s: using limit = %g\n", __func__, limit);
|
||||
if (src0_2) {
|
||||
dst_row.data = dst_gate_contiguous.get();
|
||||
if (use_quantized_src1) {
|
||||
@@ -2993,7 +3000,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
} else {
|
||||
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row),
|
||||
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
|
||||
(float *)dst_gate_contiguous.get());
|
||||
(float *)dst_gate_contiguous.get(), limit);
|
||||
}
|
||||
} else {
|
||||
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
|
||||
@@ -3002,7 +3009,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
1.702f, 7.0f, stream);
|
||||
} else {
|
||||
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row)/2, dst->ne[0],
|
||||
(const float *)dst_up_contiguous.get(), (float *)dst_gate_contiguous.get());
|
||||
(const float *)dst_up_contiguous.get(), (float *)dst_gate_contiguous.get(), limit);
|
||||
}
|
||||
dst_row.data = dst_gate_contiguous.get();
|
||||
dst_row.ne[0] /= 2;
|
||||
@@ -3065,6 +3072,8 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
|
||||
auto stream = ctx.stream();
|
||||
|
||||
float limit = *(const float *)(dst->op_params + 1);
|
||||
|
||||
auto ne10_padded = GGML_PAD(src1->ne[0], MATRIX_ROW_PADDING);
|
||||
auto nb10_padded = ne10_padded*sizeof(block_q8_1)/QK8_1;
|
||||
auto quantized_size = nb10_padded*src1->ne[1];
|
||||
@@ -3083,7 +3092,7 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
dst->src[4], dst->src[5],
|
||||
(const char *)src0_1->data, (const char *)src0_2->data, (const float *)src1->data, src1_quantized.get(),
|
||||
(float *)dst->data, 0, src0_1->ne[1], 1, ne10_padded,
|
||||
(ggml_unary_op)dst->op_params[0], stream);
|
||||
(ggml_unary_op)dst->op_params[0], limit, stream);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -3116,8 +3125,9 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
}
|
||||
}
|
||||
|
||||
//printf("%s: using limit = %g\n", __func__, limit);
|
||||
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst),
|
||||
(const float *)dst->data, dst_up.get(), (float *)dst->data);
|
||||
(const float *)dst->data, dst_up.get(), (float *)dst->data, limit);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
}
|
||||
|
||||
@@ -105,7 +105,7 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
|
||||
const void * __restrict__ vup, const void * __restrict__ vgate, const void * __restrict__ vy, float * __restrict__ dst,
|
||||
const float * __restrict__ bias_u, const float * __restrict__ bias_g,
|
||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size,
|
||||
ggml_unary_op unary_op) {
|
||||
ggml_unary_op unary_op, float limit) {
|
||||
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
@@ -191,7 +191,12 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
|
||||
float g = tmp_g[j][threadIdx.x];
|
||||
float r;
|
||||
switch (unary_op) {
|
||||
case GGML_UNARY_OP_SILU: r = u*g/(1 + expf(-g)); break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
{
|
||||
g = g/(1 + expf(-g));
|
||||
g = min(g, limit);
|
||||
r = max(-limit, min(limit, u))*g;
|
||||
} break;
|
||||
case GGML_UNARY_OP_RELU: r = fmaxf(g, 0.0f) * u; break;
|
||||
case GGML_UNARY_OP_GELU: {
|
||||
constexpr float GELU_COEF_A = 0.044715f;
|
||||
@@ -243,7 +248,7 @@ static __global__ void iqk_fused_mul_mat_vec_q(
|
||||
const void * __restrict__ vx_u, const void * __restrict__ vx_g, const void * __restrict__ vy, float * __restrict__ dst,
|
||||
const char * __restrict__ ids_data, const void * __restrict__ bias_u, const void * __restrict__ bias_g, const uint64_t bias_nb1,
|
||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size,
|
||||
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, ggml_unary_op unary_op) {
|
||||
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, ggml_unary_op unary_op, float limit) {
|
||||
|
||||
int i2 = blockIdx.y;
|
||||
int i02 = ids_data ? *(const int *)(ids_data + i2*ids_nb0) : i2;
|
||||
@@ -256,7 +261,7 @@ static __global__ void iqk_fused_mul_mat_vec_q(
|
||||
char * cdst = (char *)dst + i2*nb2;
|
||||
iqk_fused_mul_mat_vec_q_kernel<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(
|
||||
cx_u, cx_g, cy, (float *)cdst, cx_u_b, cx_g_b,
|
||||
ncols_x, nrows_x, nrows_y, nrows_dst, row_size, unary_op);
|
||||
ncols_x, nrows_x, nrows_y, nrows_dst, row_size, unary_op, limit);
|
||||
}
|
||||
|
||||
template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int n_interleaved = 1>
|
||||
@@ -307,56 +312,56 @@ static void iqk_mul_mat_vec_q_cuda(const mmvq_args & args, cudaStream_t stream)
|
||||
args.vx_u, args.vx_g, args.vy, args.dst,
|
||||
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
|
||||
break;
|
||||
case 2:
|
||||
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 2, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
|
||||
args.vx_u, args.vx_g, args.vy, args.dst,
|
||||
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
|
||||
break;
|
||||
case 3:
|
||||
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 3, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
|
||||
args.vx_u, args.vx_g, args.vy, args.dst,
|
||||
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
|
||||
break;
|
||||
case 4:
|
||||
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 4, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
|
||||
args.vx_u, args.vx_g, args.vy, args.dst,
|
||||
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
|
||||
break;
|
||||
case 5:
|
||||
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 5, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
|
||||
args.vx_u, args.vx_g, args.vy, args.dst,
|
||||
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
|
||||
break;
|
||||
case 6:
|
||||
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 6, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
|
||||
args.vx_u, args.vx_g, args.vy, args.dst,
|
||||
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
|
||||
break;
|
||||
case 7:
|
||||
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 7, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
|
||||
args.vx_u, args.vx_g, args.vy, args.dst,
|
||||
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
|
||||
break;
|
||||
case 8:
|
||||
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 8, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
|
||||
args.vx_u, args.vx_g, args.vy, args.dst,
|
||||
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
|
||||
@@ -22,5 +22,6 @@ struct mmvq_args {
|
||||
const uint64_t ids_nb0;
|
||||
const uint64_t bias_nb1;
|
||||
ggml_unary_op unary_op;
|
||||
float limit;
|
||||
};
|
||||
|
||||
|
||||
@@ -154,7 +154,7 @@ static __device__ void fused_mul_mat_vec_q(
|
||||
const void * __restrict__ vup, const void * __restrict__ vgate,
|
||||
const float * __restrict__ bias_u, const float * __restrict__ bias_g,
|
||||
const void * __restrict__ vy, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, ggml_unary_op unary_op) {
|
||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, ggml_unary_op unary_op, float limit) {
|
||||
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
@@ -243,7 +243,12 @@ static __device__ void fused_mul_mat_vec_q(
|
||||
float g = tmp_g[j][threadIdx.x];
|
||||
float r;
|
||||
switch (unary_op) {
|
||||
case GGML_UNARY_OP_SILU: r = u*g/(1 + expf(-g)); break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
{
|
||||
g = g/(1 + expf(-g));
|
||||
g = min(g, limit);
|
||||
r = max(-limit, min(limit, u))*g;
|
||||
}break;
|
||||
case GGML_UNARY_OP_RELU: r = fmaxf(g, 0.0f) * u; break;
|
||||
case GGML_UNARY_OP_GELU: {
|
||||
constexpr float GELU_COEF_A = 0.044715f;
|
||||
@@ -299,7 +304,7 @@ static __global__ void fused_mul_mat_vec_q(
|
||||
const void * __restrict__ vy, float * __restrict__ dst, const char * __restrict__ ids_data,
|
||||
const void * __restrict__ bias_u, const void * __restrict__ bias_g, const uint64_t bias_nb1,
|
||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst,
|
||||
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, ggml_unary_op unary_op) {
|
||||
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, ggml_unary_op unary_op, float limit) {
|
||||
|
||||
int i2 = blockIdx.y;
|
||||
char * cdst = (char *)dst + i2*nb2;
|
||||
@@ -312,7 +317,8 @@ static __global__ void fused_mul_mat_vec_q(
|
||||
const float * cx_u_b = bias_u ? (const float *)((const char *)bias_u + i02*bias_nb1) : nullptr;
|
||||
const float * cx_g_b = bias_g ? (const float *)((const char *)bias_g + i02*bias_nb1) : nullptr;
|
||||
const char * cy = (const char *)vy + i2*nb12;
|
||||
fused_mul_mat_vec_q<type, ncols_y, nwarps>(cx_u, cx_g, cx_u_b, cx_g_b, cy, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, unary_op);
|
||||
fused_mul_mat_vec_q<type, ncols_y, nwarps>(cx_u, cx_g, cx_u_b, cx_g_b, cy, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst,
|
||||
unary_op, limit);
|
||||
}
|
||||
|
||||
template <ggml_type type, int nwarps>
|
||||
@@ -335,42 +341,50 @@ static void mul_mat_vec_q_cuda_T(const mmvq_args & args, cudaStream_t stream) {
|
||||
case 1:
|
||||
fused_mul_mat_vec_q<type, 1, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
|
||||
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
|
||||
args.unary_op, args.limit);
|
||||
break;
|
||||
case 2:
|
||||
fused_mul_mat_vec_q<type, 2, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
|
||||
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
|
||||
args.unary_op, args.limit);
|
||||
break;
|
||||
case 3:
|
||||
fused_mul_mat_vec_q<type, 3, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
|
||||
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
|
||||
args.unary_op, args.limit);
|
||||
break;
|
||||
case 4:
|
||||
fused_mul_mat_vec_q<type, 4, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
|
||||
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
|
||||
args.unary_op, args.limit);
|
||||
break;
|
||||
case 5:
|
||||
fused_mul_mat_vec_q<type, 5, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
|
||||
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
|
||||
args.unary_op, args.limit);
|
||||
break;
|
||||
case 6:
|
||||
fused_mul_mat_vec_q<type, 6, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
|
||||
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
|
||||
args.unary_op, args.limit);
|
||||
break;
|
||||
case 7:
|
||||
fused_mul_mat_vec_q<type, 7, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
|
||||
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
|
||||
args.unary_op, args.limit);
|
||||
break;
|
||||
case 8:
|
||||
fused_mul_mat_vec_q<type, 8, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
|
||||
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
|
||||
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
|
||||
args.unary_op, args.limit);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
|
||||
@@ -13,7 +13,7 @@ static void ggml_cuda_op_mul_mat_vec_q_impl(ggml_backend_cuda_context & ctx, ggm
|
||||
const char * src0_dd_u, const char * src0_dd_g, const char * src1_ddq_i, float * dst_dd_i, const char * ids_data,
|
||||
const void * bias_u, const void * bias_g,
|
||||
const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size, ggml_unary_op unary_op, cudaStream_t stream) {
|
||||
const int64_t src1_padded_row_size, ggml_unary_op unary_op, float limit, cudaStream_t stream) {
|
||||
|
||||
const int64_t row_diff = row_high - row_low;
|
||||
|
||||
@@ -41,7 +41,8 @@ static void ggml_cuda_op_mul_mat_vec_q_impl(ggml_backend_cuda_context & ctx, ggm
|
||||
/* nb2 */ uint64_t(nb2),
|
||||
/* ids_nb0 */ uint64_t(ids_nb0),
|
||||
/* bias_nb1 */ uint64_t(bias_nb1),
|
||||
/* unary_op */ unary_op
|
||||
/* unary_op */ unary_op,
|
||||
/* limit */ limit > 1e-6f ? limit : INFINITY
|
||||
};
|
||||
|
||||
switch (type) {
|
||||
@@ -163,7 +164,7 @@ void ggml_cuda_op_mul_mat_vec_q_3D(
|
||||
src0->nb[2], src1_row_size, dst->nb[2], 0, 0,
|
||||
src0_dd_i, nullptr, src1_ddq_i, dst_dd_i, nullptr, nullptr, nullptr,
|
||||
row_low, row_high, src1_ncols,
|
||||
src1_padded_row_size, GGML_UNARY_OP_COUNT, stream);
|
||||
src1_padded_row_size, GGML_UNARY_OP_COUNT, 0.0f, stream);
|
||||
|
||||
GGML_UNUSED(src1_ddf_i);
|
||||
}
|
||||
@@ -199,7 +200,7 @@ void ggml_cuda_op_mul_mat_vec_q_biased(
|
||||
ne00, ne0, 1, 0, 0, 0, 0, 0,
|
||||
src0_dd_i, nullptr, src1_ddq_i, dst_dd_i, nullptr, bias ? bias->data : nullptr, nullptr,
|
||||
row_low, row_high, src1_ncols,
|
||||
src1_padded_row_size, GGML_UNARY_OP_COUNT, stream);
|
||||
src1_padded_row_size, GGML_UNARY_OP_COUNT, 0.0f, stream);
|
||||
|
||||
GGML_UNUSED(src1_ddf_i);
|
||||
}
|
||||
@@ -246,7 +247,7 @@ void ggml_cuda_op_mul_mat_vec_q_id(
|
||||
src0->nb[2], src1->nb[2], dst->nb[2], ids->nb[0], bias ? bias->nb[1] : 0,
|
||||
src0_dd_i, nullptr, src1_ddq_i, dst_dd_i, (const char *)ids->data, bias ? bias->data : nullptr, nullptr,
|
||||
row_low, row_high, src1_ncols,
|
||||
src1_padded_row_size, GGML_UNARY_OP_COUNT, stream);
|
||||
src1_padded_row_size, GGML_UNARY_OP_COUNT, 0.0f, stream);
|
||||
|
||||
GGML_UNUSED(src1_ddf_i);
|
||||
}
|
||||
@@ -256,7 +257,7 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * bias_u, const ggml_tensor * bias_g,
|
||||
const char * src0_dd_u, const char * src0_dd_g, const float * src1_ddf_i,
|
||||
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size, ggml_unary_op unary_op, cudaStream_t stream) {
|
||||
const int64_t src1_padded_row_size, ggml_unary_op unary_op, float limit, cudaStream_t stream) {
|
||||
|
||||
if (!bias_u && !bias_g) {
|
||||
GGML_ASSERT(unary_op == GGML_UNARY_OP_SILU ||
|
||||
@@ -294,7 +295,7 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
|
||||
src0_dd_u, src0_dd_g, src1_ddq_i, dst_dd_i, ids ? (const char *)ids->data : nullptr,
|
||||
bias_u ? bias_u->data : nullptr, bias_g ? bias_g->data : nullptr,
|
||||
row_low, row_high, src1_ncols,
|
||||
src1_padded_row_size, unary_op, stream);
|
||||
src1_padded_row_size, unary_op, limit, stream);
|
||||
|
||||
GGML_UNUSED(src1_ddf_i);
|
||||
}
|
||||
|
||||
@@ -40,4 +40,4 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * bias_u, const ggml_tensor * bias_g,
|
||||
const char * src0_dd_u, const char * src0_dd_g, const float * src1_ddf_i,
|
||||
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size, ggml_unary_op unary_op, cudaStream_t stream);
|
||||
const int64_t src1_padded_row_size, ggml_unary_op unary_op, float limit, cudaStream_t stream);
|
||||
|
||||
@@ -61,6 +61,17 @@ static __global__ void fused_mul_silu_f32(const float * x, const float * y, floa
|
||||
dst[i] = x[i] * y[i] / (1.0f + expf(-x[i]));
|
||||
}
|
||||
|
||||
static __global__ void fused_mul_silu_f32(const float * x, const float * y, float * dst, const int k, float limit) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
float g = x[i] / (1.0f + expf(-x[i]));
|
||||
g = min(g, limit);
|
||||
dst[i] = g * max(-limit, min(limit, y[i]));
|
||||
}
|
||||
|
||||
static __global__ void fused_mul_silu_f32(const float * x, float * dst, const int k, const int ne0) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
@@ -73,6 +84,20 @@ static __global__ void fused_mul_silu_f32(const float * x, float * dst, const in
|
||||
dst[i] = x_row[j] * x_row[j + ne0] / (1.0f + expf(-x_row[j + ne0]));
|
||||
}
|
||||
|
||||
static __global__ void fused_mul_silu_f32(const float * x, float * dst, const int k, const int ne0, float limit) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
int row = i / ne0;
|
||||
int j = i % ne0;
|
||||
auto x_row = x + 2*row*ne0;
|
||||
float g = x_row[j + ne0] / (1.0f + expf(-x_row[j + ne0]));
|
||||
g = min(g, limit);
|
||||
dst[i] = max(-limit, min(limit, x_row[j])) * g;
|
||||
}
|
||||
|
||||
static __global__ void fused_mul_relu_f32(const float * x, const float * y, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
@@ -223,9 +248,13 @@ static void swiglu_f32_cuda(const float * x, float * dst, const int k, const int
|
||||
}
|
||||
#endif
|
||||
|
||||
static void fused_mul_silu_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
|
||||
static void fused_mul_silu_f32_cuda(const float * x, const float * y, float * dst, const int k, float limit, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
|
||||
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
|
||||
if (limit < 1e-6f) {
|
||||
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
|
||||
} else {
|
||||
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k, limit);
|
||||
}
|
||||
}
|
||||
|
||||
static void fused_mul_relu_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
|
||||
@@ -238,9 +267,13 @@ static void fused_mul_gelu_f32_cuda(const float * x, const float * y, float * ds
|
||||
fused_mul_gelu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
|
||||
}
|
||||
|
||||
static void fused_mul_silu_f32_cuda(const float * x, float * dst, const int k, const int ne0, cudaStream_t stream) {
|
||||
static void fused_mul_silu_f32_cuda(const float * x, float * dst, const int k, const int ne0, float limit, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
|
||||
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k, ne0);
|
||||
if (limit < 1e-6f) {
|
||||
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k, ne0);
|
||||
} else {
|
||||
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k, ne0, limit);
|
||||
}
|
||||
}
|
||||
|
||||
static void fused_mul_relu_f32_cuda(const float * x, float * dst, const int k, const int ne0, cudaStream_t stream) {
|
||||
@@ -344,12 +377,12 @@ void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
#endif
|
||||
|
||||
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
|
||||
int64_t nelements, const float * src0_d, const float * src1_d, float * dst_d) {
|
||||
int64_t nelements, const float * src0_d, const float * src1_d, float * dst_d, float limit) {
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
switch (op) {
|
||||
case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, src1_d, dst_d, nelements, stream); break;
|
||||
case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, src1_d, dst_d, nelements, limit, stream); break;
|
||||
case GGML_UNARY_OP_RELU: fused_mul_relu_f32_cuda(src0_d, src1_d, dst_d, nelements, stream); break;
|
||||
case GGML_UNARY_OP_GELU: fused_mul_gelu_f32_cuda(src0_d, src1_d, dst_d, nelements, stream); break;
|
||||
default: GGML_ASSERT(false);
|
||||
@@ -357,12 +390,12 @@ void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
|
||||
}
|
||||
|
||||
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
|
||||
int64_t nelements, int64_t ne0, const float * src0_d, float * dst_d) {
|
||||
int64_t nelements, int64_t ne0, const float * src0_d, float * dst_d, float limit) {
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
switch (op) {
|
||||
case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, dst_d, nelements, ne0, stream); break;
|
||||
case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, dst_d, nelements, ne0, limit, stream); break;
|
||||
case GGML_UNARY_OP_RELU: fused_mul_relu_f32_cuda(src0_d, dst_d, nelements, ne0, stream); break;
|
||||
case GGML_UNARY_OP_GELU: fused_mul_gelu_f32_cuda(src0_d, dst_d, nelements, ne0, stream); break;
|
||||
default: GGML_ASSERT(false);
|
||||
@@ -373,15 +406,16 @@ void ggml_cuda_op_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
ggml_unary_op op = (ggml_unary_op)dst->op_params[0];
|
||||
float limit = *(const float *)(dst->op_params + 1);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
if (src1) {
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src1));
|
||||
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), (const float *)src0->data, (const float *)src1->data, (float *)dst->data);
|
||||
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), (const float *)src0->data, (const float *)src1->data, (float *)dst->data, limit);
|
||||
} else {
|
||||
GGML_ASSERT(src0->ne[0] == 2*dst->ne[0] && src0->ne[1] == dst->ne[1] && src0->ne[2] == dst->ne[2] && src0->ne[3] == dst->ne[3]);
|
||||
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), dst->ne[0], (const float *)src0->data, (float *)dst->data);
|
||||
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), dst->ne[0], (const float *)src0->data, (float *)dst->data, limit);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -87,9 +87,9 @@ void ggml_swiglu_oai_cuda_f32(const float * x, const float * g, float * dst, con
|
||||
void ggml_cuda_op_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
|
||||
int64_t nelements, const float * x, const float * y, float * z);
|
||||
int64_t nelements, const float * x, const float * y, float * z, float limit = 0);
|
||||
|
||||
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
|
||||
int64_t nelements,int64_t ne0, const float * x, float * z);
|
||||
int64_t nelements,int64_t ne0, const float * x, float * z, float limit = 0);
|
||||
|
||||
void ggml_cuda_op_multi_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -15140,6 +15140,8 @@ static void ggml_compute_forward_fused_mul_unary_f32(
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
enum ggml_unary_op op = (enum ggml_unary_op)dst->op_params[0];
|
||||
const float limit = *(const float *)(dst->op_params + 1);
|
||||
if (params->ith == 0) printf("%s(%s) using limit = %g\n", __func__, dst->name, (double)limit);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||
@@ -15167,7 +15169,19 @@ static void ggml_compute_forward_fused_mul_unary_f32(
|
||||
switch (op) {
|
||||
case GGML_UNARY_OP_GELU: ggml_vec_gelu_f32(nc, z, x); ggml_vec_mul_f32(nc, z, z, y); break;
|
||||
case GGML_UNARY_OP_RELU: ggml_vec_relu_f32(nc, z, x); ggml_vec_mul_f32(nc, z, z, y); break;
|
||||
case GGML_UNARY_OP_SILU: ggml_vec_mul_silu_f32(nc, z, x, y); break;
|
||||
case GGML_UNARY_OP_SILU: {
|
||||
if (limit < 1e-6f) {
|
||||
ggml_vec_mul_silu_f32(nc, z, x, y);
|
||||
} else {
|
||||
// TODO: simdify this
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
float gate = ggml_silu_f32(x[i]);
|
||||
gate = MIN(gate, limit);
|
||||
float up = MAX(-limit, MIN(limit, y[i]));
|
||||
z[i] = up * gate;
|
||||
}
|
||||
}
|
||||
} break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
@@ -16673,6 +16687,7 @@ static void ggml_compute_forward_mul_mat_id_up_gate(
|
||||
|
||||
ggml_barrier(params->shared);
|
||||
|
||||
const float limit = *(const float *)(dst->op_params + 1);
|
||||
|
||||
// so GGML_TENSOR_BINARY_OP_LOCALS works
|
||||
|
||||
@@ -16704,7 +16719,7 @@ static void ggml_compute_forward_mul_mat_id_up_gate(
|
||||
vec_dot_type, (const char *)wdata, row_size,
|
||||
up_b_cur, gate_b_cur,
|
||||
(float *)dst->data, nb1, nb2,
|
||||
matrix_rows + cur_a*ne12, ith, nth)) GGML_ABORT("fatal error");
|
||||
matrix_rows + cur_a*ne12, limit, ith, nth)) GGML_ABORT("fatal error");
|
||||
|
||||
}
|
||||
|
||||
@@ -16769,6 +16784,8 @@ static void ggml_compute_forward_mul_mat_up_gate(
|
||||
|
||||
ggml_barrier(params->shared);
|
||||
|
||||
float limit = *(const float *)(dst->op_params + 1);
|
||||
|
||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
|
||||
if (!iqk_moe_fused_up_gate(ne01, ne11, ne00, ne11, dst->op_params[0],
|
||||
@@ -16776,7 +16793,7 @@ static void ggml_compute_forward_mul_mat_up_gate(
|
||||
vec_dot_type, (const char *)wdata, row_size,
|
||||
NULL, NULL,
|
||||
(float *)dst->data, nb1, nb2,
|
||||
NULL, ith, nth)) GGML_ABORT("fatal error");
|
||||
NULL, limit, ith, nth)) GGML_ABORT("fatal error");
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -135,7 +135,7 @@ struct MulMat {
|
||||
}
|
||||
inline void mul_mat_up_gate_NxM(int n, const void * vx_up, const void * vx_gate, size_t bx,
|
||||
const float * up_b, const float * gate_b,
|
||||
DataInfo& info, int nrc_x, int nrc_y, int unary_op) {
|
||||
DataInfo& info, int nrc_x, int nrc_y, int unary_op, float limit) {
|
||||
#ifdef __aarch64__
|
||||
constexpr int k_x_step = 64; //8192; // Tiling does not seem to help on my M2 Max (but difference to tiling is small)
|
||||
#else
|
||||
@@ -143,7 +143,7 @@ struct MulMat {
|
||||
#endif
|
||||
auto op = ggml_unary_op(unary_op);
|
||||
float tmp[k_x_step*16];
|
||||
auto process = [&tmp, n, op, vx_gate, vx_up, gate_b, up_b, bx, xstep = k_x_step] (mul_mat_t func, const DataInfo& this_info, int ix, int this_nrc_x, int ny) {
|
||||
auto process = [&tmp, n, op, vx_gate, vx_up, gate_b, up_b, bx, xstep = k_x_step, limit] (mul_mat_t func, const DataInfo& this_info, int ix, int this_nrc_x, int ny) {
|
||||
func(n, (const void *)((const char *)vx_gate + ix*bx), bx, this_info, this_nrc_x);
|
||||
for (int ky = 0; ky < ny; ++ky) {
|
||||
if (gate_b) {
|
||||
@@ -152,6 +152,9 @@ struct MulMat {
|
||||
for (int j = 0; j < this_nrc_x; ++j) x[j] += b[j];
|
||||
}
|
||||
activate(op, this_nrc_x, this_info.dst_row(ky), tmp + ky*xstep);
|
||||
if (limit > 1e-6f) {
|
||||
for (int j = 0; j < this_nrc_x; ++j) tmp[ky*xstep + j] = std::min(tmp[ky*xstep + j], limit);
|
||||
}
|
||||
}
|
||||
func(n, (const void *)((const char *)vx_up + ix*bx), bx, this_info, this_nrc_x);
|
||||
for (int ky = 0; ky < ny; ++ky) {
|
||||
@@ -162,6 +165,8 @@ struct MulMat {
|
||||
}
|
||||
if (op == GGML_UNARY_OP_SWIGLU_OAI) {
|
||||
clamp_oai(this_nrc_x, result);
|
||||
} else if (limit > 1e-6f) {
|
||||
for (int j = 0; j < this_nrc_x; ++j) result[j] = std::max(-limit, std::min(limit, result[j]));
|
||||
}
|
||||
for (int j = 0; j < this_nrc_x; ++j) result[j] *= tmp[ky*xstep + j];
|
||||
}
|
||||
@@ -738,7 +743,7 @@ extern "C" IQK_API bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int n
|
||||
int typeA, const void * Aup, const void * Agate, long strideA,
|
||||
int typeB, const void * B, long strideB,
|
||||
const char * up_b_c, const char * gate_b_c,
|
||||
float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
|
||||
float * C, long nb1, long nb2, const void * vrow_mapping, float limit, int ith, int nth) {
|
||||
|
||||
const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
|
||||
//assert(row_mapping != nullptr);
|
||||
@@ -781,7 +786,7 @@ extern "C" IQK_API bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int n
|
||||
}
|
||||
auto up_b = up_b_c ? (const float *)up_b_c + first_x + ix : nullptr;
|
||||
auto gate_b = gate_b_c ? (const float *)gate_b_c + first_x + ix : nullptr;
|
||||
mm.mul_mat_up_gate_NxM(ne00, Xu, Xg, row_size_qx, up_b, gate_b, this_info, this_nrc_x, Ny, unary_op);
|
||||
mm.mul_mat_up_gate_NxM(ne00, Xu, Xg, row_size_qx, up_b, gate_b, this_info, this_nrc_x, Ny, unary_op, limit);
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -806,7 +811,7 @@ extern "C" IQK_API bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int n
|
||||
auto up_b = up_b_c ? (const float *)up_b_c + first_x : nullptr;
|
||||
auto gate_b = gate_b_c ? (const float *)gate_b_c + first_x : nullptr;
|
||||
mm.mul_mat_up_gate_NxM(ne00, (const char *)Aup + row_size_qx*first_x, (const char *)Agate + row_size_qx*first_x, row_size_qx,
|
||||
up_b, gate_b, info, nrc_x, Ny, unary_op);
|
||||
up_b, gate_b, info, nrc_x, Ny, unary_op, limit);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1406,7 +1411,7 @@ extern "C" IQK_API bool iqk_mul_mat_moe(long, long, long, int, int, const void *
|
||||
extern "C" IQK_API bool iqk_moe_fused_up_gate(long /*Nx*/, long /*Ny*/, long /*ne00*/, int /*ne11*/, int /*unary_op*/,
|
||||
int /*typeA*/, const void * /*Aup*/, const void * /*Agate*/, long /*strideA*/,
|
||||
int /*typeB*/, const void * /*B*/, long /*strideB*/,
|
||||
float * /*C*/, long /*nb1*/, long /*nb2*/, const void * /*vrow_mapping*/, int /*ith*/, int /*nth*/) {
|
||||
float * /*C*/, long /*nb1*/, long /*nb2*/, const void * /*vrow_mapping*/, float, int /*ith*/, int /*nth*/) {
|
||||
GGML_ABORT("Unsupported CPU. You may need to manually set compilation flags\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -34,7 +34,7 @@ IQK_API bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int ne11, int un
|
||||
int typeA, const void * Aup, const void * Agate, long strideA,
|
||||
int typeB, const void * B, long strideB,
|
||||
const char * up_b, const char * gate_b,
|
||||
float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth);
|
||||
float * C, long nb1, long nb2, const void * vrow_mapping, float limit, int ith, int nth);
|
||||
|
||||
IQK_API int iqk_dequant_type(int type, int Ny);
|
||||
|
||||
|
||||
@@ -71,6 +71,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_MISTRAL3, "mistral3" },
|
||||
{ LLM_ARCH_MIMO2, "mimo2" },
|
||||
{ LLM_ARCH_SEED_OSS, "seed_oss" },
|
||||
{ LLM_ARCH_STEP35, "step35" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@@ -128,6 +129,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
|
||||
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
|
||||
{ LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
|
||||
{ LLM_KV_SWIGLU_LIMITS, "%s.swiglu_limits" },
|
||||
{ LLM_KV_SWIGLU_LIMITS_SHARED, "%s.swiglu_limits_shared" },
|
||||
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||
@@ -151,8 +154,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER,"%s.rope.dimension_count_per_layer" },
|
||||
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE_PER_LAYER, "%s.rope.freq_base_per_layer" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
|
||||
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
|
||||
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
|
||||
|
||||
@@ -70,6 +70,7 @@ enum llm_arch {
|
||||
LLM_ARCH_MISTRAL3,
|
||||
LLM_ARCH_MIMO2,
|
||||
LLM_ARCH_SEED_OSS,
|
||||
LLM_ARCH_STEP35,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -121,6 +122,8 @@ enum llm_kv {
|
||||
LLM_KV_EMBEDDING_SCALE,
|
||||
LLM_KV_TOKEN_SHIFT_COUNT,
|
||||
LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
|
||||
LLM_KV_SWIGLU_LIMITS,
|
||||
LLM_KV_SWIGLU_LIMITS_SHARED,
|
||||
|
||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||
@@ -144,9 +147,11 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
|
||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||
LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER,
|
||||
LLM_KV_ROPE_DIMENSION_SECTIONS,
|
||||
LLM_KV_ROPE_FREQ_BASE,
|
||||
LLM_KV_ROPE_FREQ_BASE_SWA,
|
||||
LLM_KV_ROPE_FREQ_BASE_PER_LAYER,
|
||||
LLM_KV_ROPE_SCALE_LINEAR,
|
||||
LLM_KV_ROPE_SCALING_TYPE,
|
||||
LLM_KV_ROPE_SCALING_FACTOR,
|
||||
@@ -238,6 +243,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ATTN_POST_NORM,
|
||||
LLM_TENSOR_ATTN_ROT_EMBD,
|
||||
LLM_TENSOR_ATTN_SINKS,
|
||||
LLM_TENSOR_ATTN_GATE,
|
||||
LLM_TENSOR_FFN_GATE_INP,
|
||||
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
|
||||
@@ -755,6 +755,9 @@ ggml_tensor * llm_build_context::llm_build_ffn(
|
||||
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
|
||||
cur = ggml_fused_up_gate(ctx, up, gate, cur, unary_op);
|
||||
cb(cur, "ffn_up_gate", il);
|
||||
if (lctx.model.arch == LLM_ARCH_STEP35) {
|
||||
*(float *)(cur->op_params + 1) = lctx.model.hparams.swiglu_limits_shared[il];
|
||||
}
|
||||
if (down) {
|
||||
cur = llm_build_lora_mm(lctx, ctx, down, cur);
|
||||
if (lctx.model.arch == LLM_ARCH_GLM4 || lctx.model.arch == LLM_ARCH_GLM4_MOE) {
|
||||
@@ -828,12 +831,21 @@ ggml_tensor * llm_build_context::llm_build_ffn(
|
||||
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
|
||||
cur = ggml_fused_mul_unary(ctx, cur, tmp, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
|
||||
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU);
|
||||
if (lctx.model.arch == LLM_ARCH_STEP35) {
|
||||
*((float *)(cur->op_params + 1)) = lctx.model.hparams.swiglu_limits_shared[il];
|
||||
}
|
||||
}
|
||||
else {
|
||||
|
||||
switch (type_op) {
|
||||
case LLM_FFN_SILU:
|
||||
{
|
||||
if (lctx.model.arch == LLM_ARCH_STEP35) {
|
||||
cur = ggml_fused_mul_unary(ctx, cur, up, GGML_UNARY_OP_SILU);
|
||||
*(float *)(cur->op_params + 1) = lctx.model.hparams.swiglu_limits_shared[il];
|
||||
type_gate = LLM_FFN_SEQ;
|
||||
break;
|
||||
}
|
||||
cur = ggml_silu(ctx, cur);
|
||||
cb(cur, "ffn_silu", il);
|
||||
} break;
|
||||
@@ -1003,7 +1015,7 @@ llm_expert_gating_func_type gating_op,
|
||||
ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
|
||||
cb(weights_sum, "ffn_moe_weights_sum", il);
|
||||
|
||||
if (lctx.model.arch == LLM_ARCH_BAILINGMOE2) {
|
||||
if (lctx.model.arch == LLM_ARCH_BAILINGMOE2 || lctx.model.arch == LLM_ARCH_STEP35) {
|
||||
weights_sum = ggml_scale_bias(ctx, weights_sum, 1.0, 1e-20);
|
||||
cb(weights_sum, "ffn_moe_weights_sum_biased", il);
|
||||
}
|
||||
@@ -1036,7 +1048,7 @@ llm_expert_gating_func_type gating_op,
|
||||
// Hence, if we have biases, we cannot use fmoe.
|
||||
//
|
||||
//bool can_use_fmoe = !up_exps_b && !gate_exps_b && (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU);
|
||||
bool can_use_fmoe = type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE;
|
||||
bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE);
|
||||
|
||||
ggml_tensor * par;
|
||||
if (can_use_fmoe && up_gate_exps) {
|
||||
@@ -1049,6 +1061,9 @@ llm_expert_gating_func_type gating_op,
|
||||
par = ggml_moe_up_gate(ctx, up_gate_exps, nullptr, cur, selected_experts,
|
||||
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
|
||||
}
|
||||
if (lctx.model.arch == LLM_ARCH_STEP35) {
|
||||
*((float *)(par->op_params + 1)) = lctx.model.hparams.swiglu_limits[il];
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(!up_gate_exps && !up_gate_exps_b);
|
||||
|
||||
@@ -1062,6 +1077,9 @@ llm_expert_gating_func_type gating_op,
|
||||
par = ggml_moe_up_gate(ctx, up_exps, gate_exps, cur, selected_experts,
|
||||
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
|
||||
}
|
||||
if (lctx.model.arch == LLM_ARCH_STEP35) {
|
||||
*(float *)(par->op_params + 1) = lctx.model.hparams.swiglu_limits[il];
|
||||
}
|
||||
} else {
|
||||
ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(up, "ffn_moe_up", il);
|
||||
@@ -1087,6 +1105,9 @@ llm_expert_gating_func_type gating_op,
|
||||
|
||||
if (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU) {
|
||||
par = ggml_fused_mul_unary(ctx, gate, up, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
|
||||
if (lctx.model.arch == LLM_ARCH_STEP35) {
|
||||
*((float *)(par->op_params + 1)) = lctx.model.hparams.swiglu_limits[il];
|
||||
}
|
||||
} else if (type_op == LLM_FFN_SWIGLU_OAI_MOE) {
|
||||
constexpr float alpha = 1.702f;
|
||||
constexpr float limit = 7.0f;
|
||||
@@ -1655,8 +1676,10 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
|
||||
ggml_tensor * wk, ggml_tensor * bk,
|
||||
ggml_tensor * wv, ggml_tensor * bv,
|
||||
ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il, bool add_graph_split) const {
|
||||
int n_head = hparams.n_head(il);
|
||||
int n_head_kv = hparams.n_head_kv(il);
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(il);
|
||||
if (wqkv) {
|
||||
auto qkv = llm_build_lora_mm(lctx, ctx0, wqkv, cur);
|
||||
if (add_graph_split) {
|
||||
@@ -3555,6 +3578,147 @@ ggml_cgraph * llm_build_context::build_seedoss() {
|
||||
return gf;
|
||||
}
|
||||
|
||||
ggml_cgraph * llm_build_context::build_step35() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
ggml_tensor * cur;
|
||||
auto inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
auto inp_pos = build_inp_pos();
|
||||
auto inp_out_ids = build_inp_out_ids();
|
||||
auto KQ_mask = build_inp_KQ_mask();
|
||||
auto KQ_mask_swa = build_inp_KQ_mask_swa();
|
||||
//const float kq_scale = 1.0f / sqrtf(float(n_rot));
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
bool is_swa = hparams.swa_layers[il];
|
||||
ggml_tensor * inpSA = inpL;
|
||||
const uint32_t n_head_l = hparams.n_head(il);
|
||||
const float freq_base_l = hparams.has_rope_freq_base_per_layer ? hparams.rope_freq_base_per_layer[il] :
|
||||
is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
|
||||
const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
|
||||
cur = inpL;
|
||||
// self-attention
|
||||
{
|
||||
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur,
|
||||
model.layers[il].wqkv, model.layers[il].bqkv,
|
||||
model.layers[il].wqk, model.layers[il].bqk,
|
||||
model.layers[il].wq, model.layers[il].bq,
|
||||
model.layers[il].wk, model.layers[il].bk,
|
||||
model.layers[il].wv, model.layers[il].bv,
|
||||
model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, 0.f, il);
|
||||
|
||||
ggml_tensor * rope_factors = nullptr;
|
||||
const uint32_t apply_mask = hparams.rope_scaling_apply_mask;
|
||||
if ((is_swa && (apply_mask & 0x2)) || (!is_swa && (apply_mask & 0x1))) {
|
||||
rope_factors = build_rope_factors(il);
|
||||
}
|
||||
const int64_t n_rot_l = hparams.rope_n_rot(il);
|
||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
||||
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
|
||||
n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur_pos", il);
|
||||
cb(Kcur, "Kcur_pos", il);
|
||||
|
||||
const float kq_scale = 1.0f / sqrtf(float(n_embd_head_k));
|
||||
auto attn_out = llm_build_kv(ctx0, lctx, kv_self, gf, nullptr, nullptr, // i.e., do not multiply with wo
|
||||
Kcur, Vcur, Qcur, is_swa ? KQ_mask_swa : KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il,
|
||||
nullptr, is_swa ? hparams.n_swa : 0);
|
||||
cb(attn_out, "attn_out", il);
|
||||
|
||||
// head-wise attention gate: sigmoid(g_proj(x)) in torch
|
||||
if (model.layers[il].wqkv_gate) {
|
||||
auto gate = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv_gate, cur); // [n_head_l, n_tokens]
|
||||
cb(gate, "attn_gate", il);
|
||||
gate = ggml_sigmoid(ctx0, gate);
|
||||
cb(gate, "attn_gate_sigmoid", il);
|
||||
// reshape + broadcast to [n_embd_head_v, n_head_l, n_tokens]
|
||||
ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens);
|
||||
ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
|
||||
gate_3d = ggml_repeat(ctx0, gate_3d, attn_3d);
|
||||
cb(gate_3d, "attn_gate_bcast", il);
|
||||
attn_3d = ggml_mul(ctx0, attn_3d, gate_3d);
|
||||
cb(attn_3d, "attn_gated_3d", il);
|
||||
//attn_out = ggml_cont_2d(ctx0, ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens),
|
||||
// n_embd_head_v * n_head_l, n_tokens);
|
||||
attn_out = ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens);
|
||||
cb(attn_out, "attn_gated", il);
|
||||
}
|
||||
// output projection
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, attn_out);
|
||||
cb(cur, "attn_proj", il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids && n_tokens > 1) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
// feed-forward
|
||||
if (model.layers[il].ffn_gate_inp == nullptr) {
|
||||
// dense MLP
|
||||
cur = llm_build_ffn(ctx0, lctx, nullptr, cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr,
|
||||
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr,
|
||||
nullptr,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
// MoE routed experts
|
||||
const bool norm_w = hparams.expert_weights_norm;
|
||||
const float w_scale = hparams.expert_weights_scale;
|
||||
const bool scale_w = w_scale != 0.0f;
|
||||
ggml_tensor * moe_out = llm_build_moe_ffn(ctx0, lctx, cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU,
|
||||
norm_w, scale_w, w_scale,
|
||||
LLM_EXPERT_GATING_FUNC_SIGMOID,
|
||||
cb, il, gf, false, model.layers[il].ffn_up_gate_exps);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
// shared expert MLP (always added on MoE layers in Step35)
|
||||
ggml_tensor * sh_out = llm_build_ffn(ctx0, lctx, nullptr, cur,
|
||||
model.layers[il].ffn_up_shexp, nullptr, nullptr,
|
||||
model.layers[il].ffn_gate_shexp, nullptr, nullptr,
|
||||
model.layers[il].ffn_down_shexp, nullptr, nullptr,
|
||||
nullptr,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf);
|
||||
cb(sh_out, "ffn_shared_out", il);
|
||||
cur = ggml_add(ctx0, moe_out, sh_out);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "ffn_out_with_inp", il);
|
||||
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
ggml_cgraph * llm_build_context::build_qwen() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
@@ -9360,6 +9524,10 @@ ggml_cgraph * llm_build_context::llama_build_graph(
|
||||
{
|
||||
result = llm.build_seedoss();
|
||||
} break;
|
||||
case LLM_ARCH_STEP35:
|
||||
{
|
||||
result = llm.build_step35();
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
@@ -280,6 +280,8 @@ struct llm_build_context {
|
||||
|
||||
ggml_cgraph * build_seedoss();
|
||||
|
||||
ggml_cgraph * build_step35();
|
||||
|
||||
//
|
||||
static ggml_tensor * llm_build_lora_mm(llama_context & lctx, ggml_context * ctx0,
|
||||
ggml_tensor * w, ggml_tensor * cur);
|
||||
|
||||
@@ -1115,6 +1115,38 @@ void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_STEP35:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
//hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
// MoE + SWA parameters
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
// Step35 uses sigmoid gating by default (if not set in GGUF)
|
||||
if (hparams.expert_gating_func == LLM_EXPERT_GATING_FUNC_TYPE_NONE) {
|
||||
hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SIGMOID;
|
||||
}
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
|
||||
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, hparams.rope_dim_per_layer, hparams.n_layer);
|
||||
ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS, hparams.swiglu_limits, hparams.n_layer);
|
||||
ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS_SHARED, hparams.swiglu_limits_shared, hparams.n_layer);
|
||||
// Optional: Step35-only gating for applying rope scaling (HF: yarn_only_types).
|
||||
// Default is 3 (apply on all layers) if the key is absent.
|
||||
//ml.get_key(format("%s.rope.scaling.apply_mask", ml.get_arch_name().c_str()),
|
||||
// hparams.rope_scaling_apply_mask, false);
|
||||
//hparams.has_rope_freq_base_per_layer = ml.get_key_or_arr(
|
||||
// format("%s.rope.freq_base_per_layer", ml.get_arch_name().c_str()),
|
||||
// hparams.rope_freq_base_per_layer, hparams.n_layer, false);
|
||||
ml.get_key(format("%s.rope.scaling.apply_mask", ml.get_arch_name().c_str()),
|
||||
hparams.rope_scaling_apply_mask, false);
|
||||
hparams.has_rope_freq_base_per_layer = ml.get_key_or_arr(LLM_KV_ROPE_FREQ_BASE_PER_LAYER,
|
||||
hparams.rope_freq_base_per_layer, hparams.n_layer, false);
|
||||
//type = LLM_TYPE_UNKNOWN; <--- what is this?
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
|
||||
@@ -70,6 +70,8 @@ struct llama_hparams {
|
||||
float rope_freq_base_train_swa;
|
||||
float rope_freq_scale_train;
|
||||
float rope_freq_scale_train_swa;
|
||||
uint32_t rope_scaling_apply_mask = 0x3;
|
||||
bool has_rope_freq_base_per_layer = false;
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
float rope_yarn_log_mul = 0.0f;
|
||||
|
||||
@@ -79,6 +81,8 @@ struct llama_hparams {
|
||||
float yarn_beta_slow = 1.0f;
|
||||
|
||||
std::array<int, 4> rope_sections;
|
||||
std::array<float, LLAMA_MAX_LAYERS> rope_freq_base_per_layer;
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> rope_dim_per_layer;
|
||||
|
||||
// for State Space Models
|
||||
uint32_t ssm_d_conv = 0;
|
||||
@@ -124,6 +128,9 @@ struct llama_hparams {
|
||||
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
|
||||
|
||||
std::array<float, LLAMA_MAX_LAYERS> swiglu_limits;
|
||||
std::array<float, LLAMA_MAX_LAYERS> swiglu_limits_shared;
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
if (this->vocab_only != other.vocab_only) return true;
|
||||
if (this->n_vocab != other.n_vocab) return true;
|
||||
@@ -265,6 +272,11 @@ struct llama_hparams {
|
||||
return std::fabs(b - a) <= abs_tol;
|
||||
}
|
||||
|
||||
uint32_t rope_n_rot(uint32_t il) const {
|
||||
const uint32_t v = rope_dim_per_layer[il];
|
||||
return v ? v : n_rot;
|
||||
}
|
||||
|
||||
static const char * rope_scaling_type_name(llama_rope_scaling_type);
|
||||
|
||||
};
|
||||
|
||||
@@ -141,6 +141,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
|
||||
|
||||
bool create_seedoss_tensors(const LLM_TN & tn);
|
||||
|
||||
bool create_step35_tensors(const LLM_TN & tn);
|
||||
|
||||
llama_model_loader & ml;
|
||||
llama_model & model;
|
||||
|
||||
@@ -1026,6 +1028,74 @@ bool create_tensors_helper::create_seedoss_tensors(const LLM_TN & tn) {
|
||||
return use_mmap_buffer;
|
||||
}
|
||||
|
||||
bool create_tensors_helper::create_step35_tensors(const LLM_TN & tn) {
|
||||
LOADING_PRELUDE
|
||||
|
||||
model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
// output
|
||||
model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
||||
// STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor
|
||||
// ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
|
||||
uint32_t n_rot_max = 0;
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
n_rot_max = std::max(n_rot_max, hparams.rope_n_rot(i));
|
||||
}
|
||||
if (n_rot_max == 0) {
|
||||
n_rot_max = n_rot;
|
||||
}
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
auto & layer = model.layers[i];
|
||||
const uint32_t n_head_l = hparams.n_head(i);
|
||||
layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_q_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.attn_k_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
// optional rope factors (llama3) / longrope tensors
|
||||
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
||||
layer.rope_long = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
layer.rope_short = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
} else {
|
||||
layer.rope_freqs = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
}
|
||||
use_mmap_buffer &= !merge_qkv(tn, i, 0);
|
||||
//layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_l}, 0);
|
||||
//layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
//layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
|
||||
// head-wise attention gate (Step35 self_attn.g_proj)
|
||||
layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
// dense MLP (leading dense blocks)
|
||||
layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
// MoE routed experts + selection bias (router_bias)
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp;
|
||||
if (!layer.ffn_gate) {
|
||||
layer.ffn_gate_inp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert},
|
||||
llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
use_mmap_buffer &= !create_std_ffn_exps(n_embd, tn, i, n_ff_exp);
|
||||
//layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert},
|
||||
// llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
//layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert},
|
||||
// llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
//layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert},
|
||||
// llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_exp_probs_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert},
|
||||
llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
// shared expert MLP
|
||||
layer.ffn_gate_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp},
|
||||
llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_up_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp},
|
||||
llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_down_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd},
|
||||
llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
return use_mmap_buffer;
|
||||
}
|
||||
|
||||
bool create_tensors_helper::create_qwen_tensors(const LLM_TN & tn) {
|
||||
LOADING_PRELUDE
|
||||
create_embd_output(tn, n_embd, n_vocab);
|
||||
@@ -2784,10 +2854,10 @@ bool create_tensors_helper::create_std_ffn_exps(int64_t n_embd, const LLM_TN & t
|
||||
|
||||
bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool ignore_attn_scale) {
|
||||
auto& hparams = model.hparams;
|
||||
const int64_t n_head = hparams.n_head();
|
||||
const int64_t n_head_kv = hparams.n_head_kv();
|
||||
const int64_t n_head = hparams.n_head(i);
|
||||
const int64_t n_head_kv = hparams.n_head_kv(i);
|
||||
const int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1); // For Qwen3-VL we need to divide by the number of deepstack layers + 1, for other models n_deepstack_layers value is 0 by default
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
|
||||
@@ -3111,6 +3181,8 @@ bool create_tensors_helper::create_tensors() {
|
||||
use_mmap_buffer = create_mimo2_tensors(tn); break;
|
||||
case LLM_ARCH_SEED_OSS:
|
||||
use_mmap_buffer = create_seedoss_tensors(tn); break;
|
||||
case LLM_ARCH_STEP35:
|
||||
use_mmap_buffer = create_step35_tensors(tn); break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
|
||||
@@ -1083,6 +1083,7 @@ template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::str
|
||||
|
||||
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
|
||||
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
|
||||
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
|
||||
|
||||
template std::enable_if<std::is_integral<unsigned int>::value, bool>::type llama_model_loader::get_arr_n<unsigned int>(enum llm_kv, unsigned int&, bool);
|
||||
|
||||
|
||||
@@ -1334,6 +1334,37 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_STEP35,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
|
||||
{ LLM_TENSOR_ROPE_FACTORS_SHORT,"rope_factors_short" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
|
||||
@@ -149,6 +149,7 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_sub_norm = nullptr;
|
||||
struct ggml_tensor * attn_norm_cross = nullptr;
|
||||
struct ggml_tensor * attn_norm_enc = nullptr;
|
||||
struct ggml_tensor * wqkv_gate = nullptr;
|
||||
|
||||
// attention
|
||||
struct ggml_tensor * wq = nullptr;
|
||||
|
||||
@@ -5051,6 +5051,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_MINIMAX_M2:
|
||||
case LLM_ARCH_MIMO2:
|
||||
case LLM_ARCH_SEED_OSS:
|
||||
case LLM_ARCH_STEP35:
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
|
||||
Reference in New Issue
Block a user