Step-3.5-Flash support (#1231)

* WIP

* This works but is slow

* Turn off the up / gate clamps for now

* OK we need the clamping

* Fuse the clamp (CUDA)

* Fuse the clamp (CPU)

* WIP

* Be able to use merged q, k, v

* Be able to use merged up/gate experts

* Fuse the clamp (CUDA mmvq)
Kawrakow
2026-02-05 08:13:22 +02:00
committed by GitHub
parent 8d952ff183
commit 9c1c74acda
22 changed files with 487 additions and 69 deletions
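
The core change threaded through all of these diffs is a clamped SwiGLU, which Step-3.5-Flash applies with a per-layer limit: the up projection is clamped to [-limit, limit] and the SiLU-activated gate is capped at limit before the two are multiplied. A minimal scalar reference of that math, as a sketch (the name clamped_swiglu is illustrative, not an identifier from this commit):

#include <algorithm>
#include <cmath>

// out = clamp(up, -limit, limit) * min(silu(gate), limit)
static float clamped_swiglu(float up, float gate, float limit) {
    float g = gate / (1.0f + std::exp(-gate));        // SiLU(gate)
    g = std::min(g, limit);                           // gate is capped from above only
    float u = std::max(-limit, std::min(limit, up));  // up is clamped symmetrically
    return u * g;
}

With limit = INFINITY both clamps are identities, which is how the fused kernels below keep a single code path when a model has no limit set.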

View File

@@ -2626,12 +2626,13 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
((ggml_backend_cuda_buffer_context *)next->buffer->context)->device == device_id;
auto unary_op = (ggml_unary_op)dst->op_params[0];
float limit = *(const float *)(dst->op_params + 1);
if (src0_2) {
ggml_cuda_op_fused_mul_mat_vec_q_id(ctx, src0_1, &local_src1, ids, &local_dst,
dst->src[4], dst->src[5],
(const char *)src0_1->data, src0_2 ? (const char *)src0_2->data : nullptr,
(const float *)src1->data, src1_quantized.get(),
(float *)local_dst.data, 0, src0_1->ne[1], 1, src1_padded_col_size, unary_op, stream);
(float *)local_dst.data, 0, src0_1->ne[1], 1, src1_padded_col_size, unary_op, limit, stream);
} else {
auto local_src0_1 = *src0_1;
local_src0_1.ne[1] /= 2;
@@ -2642,7 +2643,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
nullptr, nullptr,
(const char *)local_src0_1.data, (const char *)local_src0_2.data,
(const float *)src1->data, src1_quantized.get(),
(float *)local_dst.data, 0, local_src0_1.ne[1], 1, src1_padded_col_size, unary_op, stream);
(float *)local_dst.data, 0, local_src0_1.ne[1], 1, src1_padded_col_size, unary_op, limit, stream);
} else {
GGML_ASSERT(!dst->src[5]);
auto local_bias_1 = *dst->src[4];
@@ -2653,7 +2654,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
&local_bias_1, &local_bias_2,
(const char *)local_src0_1.data, (const char *)local_src0_2.data,
(const float *)src1->data, src1_quantized.get(),
(float *)local_dst.data, 0, local_src0_1.ne[1], 1, src1_padded_col_size, unary_op, stream);
(float *)local_dst.data, 0, local_src0_1.ne[1], 1, src1_padded_col_size, unary_op, limit, stream);
}
}
CUDA_CHECK(cudaGetLastError());
@@ -2773,9 +2774,11 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
(float *)dst->data, ggml_nelements(dst), dst_row.ne[0], dst_row.ne[0], dst_row.ne[0],
1.702f, 7.0f, stream);
} else {
float limit = *((const float *)(dst->op_params + 1));
//printf("%s: using limit = %g\n", __func__, limit);
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row),
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
(float *)dst->data);
(float *)dst->data, limit);
}
} else {
@@ -2801,8 +2804,10 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
(float *)dst->data, ggml_nelements(dst), dst->ne[0], src0_1->ne[1], src0_1->ne[1],
1.702f, 7.0f, stream);
} else {
float limit = *((const float *)(dst->op_params + 1));
//printf("%s: using limit = %g\n", __func__, limit);
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst), dst->ne[0],
(const float *)dst_up_gate_contiguous.get(), (float *)dst->data);
(const float *)dst_up_gate_contiguous.get(), (float *)dst->data, limit);
}
}
CUDA_CHECK(cudaGetLastError());
@@ -2970,6 +2975,8 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
}
auto unary_op = (ggml_unary_op)dst->op_params[0];
float limit = *(const float *)(dst->op_params + 1);
//printf("%s: using limit = %g\n", __func__, limit);
if (src0_2) {
dst_row.data = dst_gate_contiguous.get();
if (use_quantized_src1) {
@@ -2993,7 +3000,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
} else {
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row),
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
(float *)dst_gate_contiguous.get());
(float *)dst_gate_contiguous.get(), limit);
}
} else {
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
@@ -3002,7 +3009,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
1.702f, 7.0f, stream);
} else {
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row)/2, dst->ne[0],
(const float *)dst_up_contiguous.get(), (float *)dst_gate_contiguous.get());
(const float *)dst_up_contiguous.get(), (float *)dst_gate_contiguous.get(), limit);
}
dst_row.data = dst_gate_contiguous.get();
dst_row.ne[0] /= 2;
@@ -3065,6 +3072,8 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
auto stream = ctx.stream();
float limit = *(const float *)(dst->op_params + 1);
auto ne10_padded = GGML_PAD(src1->ne[0], MATRIX_ROW_PADDING);
auto nb10_padded = ne10_padded*sizeof(block_q8_1)/QK8_1;
auto quantized_size = nb10_padded*src1->ne[1];
@@ -3083,7 +3092,7 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
dst->src[4], dst->src[5],
(const char *)src0_1->data, (const char *)src0_2->data, (const float *)src1->data, src1_quantized.get(),
(float *)dst->data, 0, src0_1->ne[1], 1, ne10_padded,
(ggml_unary_op)dst->op_params[0], stream);
(ggml_unary_op)dst->op_params[0], limit, stream);
return;
}
@@ -3116,8 +3125,9 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
}
}
//printf("%s: using limit = %g\n", __func__, limit);
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst),
(const float *)dst->data, dst_up.get(), (float *)dst->data);
(const float *)dst->data, dst_up.get(), (float *)dst->data, limit);
CUDA_CHECK(cudaGetLastError());
}
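
The limit never gets its own argument at the graph level: it rides in the ggml node itself, with the unary op id in op_params[0] and the float limit type-punned into op_params[1]. The graph builder writes it (see the llama-build-context changes further down) and every backend reads it back with casts like the `*(const float *)(dst->op_params + 1)` above. A strict-aliasing-safe sketch of the same round trip (helper names are hypothetical):

#include <cstdint>
#include <cstring>

static void set_limit(int32_t * op_params, float limit) {
    std::memcpy(&op_params[1], &limit, sizeof(float)); // slot 0 holds the ggml_unary_op
}

static float get_limit(const int32_t * op_params) {
    float limit;
    std::memcpy(&limit, &op_params[1], sizeof(float));
    return limit;
}

Since freshly created nodes have zero-initialized op_params, a limit of 0 doubles as "no clamp", which is why the backends test `limit < 1e-6f` (or map it to INFINITY) rather than comparing against exactly zero.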

View File

@@ -105,7 +105,7 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
const void * __restrict__ vup, const void * __restrict__ vgate, const void * __restrict__ vy, float * __restrict__ dst,
const float * __restrict__ bias_u, const float * __restrict__ bias_g,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size,
ggml_unary_op unary_op) {
ggml_unary_op unary_op, float limit) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
@@ -191,7 +191,12 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
float g = tmp_g[j][threadIdx.x];
float r;
switch (unary_op) {
case GGML_UNARY_OP_SILU: r = u*g/(1 + expf(-g)); break;
case GGML_UNARY_OP_SILU:
{
g = g/(1 + expf(-g));
g = min(g, limit);
r = max(-limit, min(limit, u))*g;
} break;
case GGML_UNARY_OP_RELU: r = fmaxf(g, 0.0f) * u; break;
case GGML_UNARY_OP_GELU: {
constexpr float GELU_COEF_A = 0.044715f;
@@ -243,7 +248,7 @@ static __global__ void iqk_fused_mul_mat_vec_q(
const void * __restrict__ vx_u, const void * __restrict__ vx_g, const void * __restrict__ vy, float * __restrict__ dst,
const char * __restrict__ ids_data, const void * __restrict__ bias_u, const void * __restrict__ bias_g, const uint64_t bias_nb1,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size,
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, ggml_unary_op unary_op) {
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, ggml_unary_op unary_op, float limit) {
int i2 = blockIdx.y;
int i02 = ids_data ? *(const int *)(ids_data + i2*ids_nb0) : i2;
@@ -256,7 +261,7 @@ static __global__ void iqk_fused_mul_mat_vec_q(
char * cdst = (char *)dst + i2*nb2;
iqk_fused_mul_mat_vec_q_kernel<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(
cx_u, cx_g, cy, (float *)cdst, cx_u_b, cx_g_b,
ncols_x, nrows_x, nrows_y, nrows_dst, row_size, unary_op);
ncols_x, nrows_x, nrows_y, nrows_dst, row_size, unary_op, limit);
}
template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int n_interleaved = 1>
@@ -307,56 +312,56 @@ static void iqk_mul_mat_vec_q_cuda(const mmvq_args & args, cudaStream_t stream)
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 2:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 2, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 3:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 3, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 4:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 4, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 5:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 5, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 6:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 6, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 7:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 7, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 8:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 8, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
default:
GGML_ASSERT(false);
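
The eight near-identical cases above are the usual mmvq dispatch: the runtime column count of the activation (1..8) is switched onto a compile-time template parameter so the kernel's per-column loops fully unroll, and the new limit is simply forwarded through every case. Condensed, the pattern looks like this (a sketch, not the real kernel signature):

template <int ncols_y> static void kernel_stub(float limit) { (void)limit; /* ncols_y is a compile-time constant */ }

static void dispatch(int ncols_y, float limit) {
    switch (ncols_y) {
        case 1: kernel_stub<1>(limit); break;
        case 2: kernel_stub<2>(limit); break;
        // cases 3..7 follow the same pattern
        case 8: kernel_stub<8>(limit); break;
        default: break; // unsupported batch size
    }
}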

View File

@@ -22,5 +22,6 @@ struct mmvq_args {
const uint64_t ids_nb0;
const uint64_t bias_nb1;
ggml_unary_op unary_op;
float limit;
};

View File

@@ -154,7 +154,7 @@ static __device__ void fused_mul_mat_vec_q(
const void * __restrict__ vup, const void * __restrict__ vgate,
const float * __restrict__ bias_u, const float * __restrict__ bias_g,
const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, ggml_unary_op unary_op) {
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, ggml_unary_op unary_op, float limit) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
@@ -243,7 +243,12 @@ static __device__ void fused_mul_mat_vec_q(
float g = tmp_g[j][threadIdx.x];
float r;
switch (unary_op) {
case GGML_UNARY_OP_SILU: r = u*g/(1 + expf(-g)); break;
case GGML_UNARY_OP_SILU:
{
g = g/(1 + expf(-g));
g = min(g, limit);
r = max(-limit, min(limit, u))*g;
} break;
case GGML_UNARY_OP_RELU: r = fmaxf(g, 0.0f) * u; break;
case GGML_UNARY_OP_GELU: {
constexpr float GELU_COEF_A = 0.044715f;
@@ -299,7 +304,7 @@ static __global__ void fused_mul_mat_vec_q(
const void * __restrict__ vy, float * __restrict__ dst, const char * __restrict__ ids_data,
const void * __restrict__ bias_u, const void * __restrict__ bias_g, const uint64_t bias_nb1,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst,
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, ggml_unary_op unary_op) {
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, ggml_unary_op unary_op, float limit) {
int i2 = blockIdx.y;
char * cdst = (char *)dst + i2*nb2;
@@ -312,7 +317,8 @@ static __global__ void fused_mul_mat_vec_q(
const float * cx_u_b = bias_u ? (const float *)((const char *)bias_u + i02*bias_nb1) : nullptr;
const float * cx_g_b = bias_g ? (const float *)((const char *)bias_g + i02*bias_nb1) : nullptr;
const char * cy = (const char *)vy + i2*nb12;
fused_mul_mat_vec_q<type, ncols_y, nwarps>(cx_u, cx_g, cx_u_b, cx_g_b, cy, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, unary_op);
fused_mul_mat_vec_q<type, ncols_y, nwarps>(cx_u, cx_g, cx_u_b, cx_g_b, cy, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst,
unary_op, limit);
}
template <ggml_type type, int nwarps>
@@ -335,42 +341,50 @@ static void mul_mat_vec_q_cuda_T(const mmvq_args & args, cudaStream_t stream) {
case 1:
fused_mul_mat_vec_q<type, 1, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 2:
fused_mul_mat_vec_q<type, 2, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 3:
fused_mul_mat_vec_q<type, 3, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 4:
fused_mul_mat_vec_q<type, 4, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 5:
fused_mul_mat_vec_q<type, 5, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 6:
fused_mul_mat_vec_q<type, 6, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 7:
fused_mul_mat_vec_q<type, 7, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 8:
fused_mul_mat_vec_q<type, 8, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
default:
GGML_ABORT("fatal error");

View File

@@ -13,7 +13,7 @@ static void ggml_cuda_op_mul_mat_vec_q_impl(ggml_backend_cuda_context & ctx, ggm
const char * src0_dd_u, const char * src0_dd_g, const char * src1_ddq_i, float * dst_dd_i, const char * ids_data,
const void * bias_u, const void * bias_g,
const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, ggml_unary_op unary_op, cudaStream_t stream) {
const int64_t src1_padded_row_size, ggml_unary_op unary_op, float limit, cudaStream_t stream) {
const int64_t row_diff = row_high - row_low;
@@ -41,7 +41,8 @@ static void ggml_cuda_op_mul_mat_vec_q_impl(ggml_backend_cuda_context & ctx, ggm
/* nb2 */ uint64_t(nb2),
/* ids_nb0 */ uint64_t(ids_nb0),
/* bias_nb1 */ uint64_t(bias_nb1),
/* unary_op */ unary_op
/* unary_op */ unary_op,
/* limit */ limit > 1e-6f ? limit : INFINITY
};
switch (type) {
@@ -163,7 +164,7 @@ void ggml_cuda_op_mul_mat_vec_q_3D(
src0->nb[2], src1_row_size, dst->nb[2], 0, 0,
src0_dd_i, nullptr, src1_ddq_i, dst_dd_i, nullptr, nullptr, nullptr,
row_low, row_high, src1_ncols,
src1_padded_row_size, GGML_UNARY_OP_COUNT, stream);
src1_padded_row_size, GGML_UNARY_OP_COUNT, 0.0f, stream);
GGML_UNUSED(src1_ddf_i);
}
@@ -199,7 +200,7 @@ void ggml_cuda_op_mul_mat_vec_q_biased(
ne00, ne0, 1, 0, 0, 0, 0, 0,
src0_dd_i, nullptr, src1_ddq_i, dst_dd_i, nullptr, bias ? bias->data : nullptr, nullptr,
row_low, row_high, src1_ncols,
src1_padded_row_size, GGML_UNARY_OP_COUNT, stream);
src1_padded_row_size, GGML_UNARY_OP_COUNT, 0.0f, stream);
GGML_UNUSED(src1_ddf_i);
}
@@ -246,7 +247,7 @@ void ggml_cuda_op_mul_mat_vec_q_id(
src0->nb[2], src1->nb[2], dst->nb[2], ids->nb[0], bias ? bias->nb[1] : 0,
src0_dd_i, nullptr, src1_ddq_i, dst_dd_i, (const char *)ids->data, bias ? bias->data : nullptr, nullptr,
row_low, row_high, src1_ncols,
src1_padded_row_size, GGML_UNARY_OP_COUNT, stream);
src1_padded_row_size, GGML_UNARY_OP_COUNT, 0.0f, stream);
GGML_UNUSED(src1_ddf_i);
}
@@ -256,7 +257,7 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
const ggml_tensor * bias_u, const ggml_tensor * bias_g,
const char * src0_dd_u, const char * src0_dd_g, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, ggml_unary_op unary_op, cudaStream_t stream) {
const int64_t src1_padded_row_size, ggml_unary_op unary_op, float limit, cudaStream_t stream) {
if (!bias_u && !bias_g) {
GGML_ASSERT(unary_op == GGML_UNARY_OP_SILU ||
@@ -294,7 +295,7 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
src0_dd_u, src0_dd_g, src1_ddq_i, dst_dd_i, ids ? (const char *)ids->data : nullptr,
bias_u ? bias_u->data : nullptr, bias_g ? bias_g->data : nullptr,
row_low, row_high, src1_ncols,
src1_padded_row_size, unary_op, stream);
src1_padded_row_size, unary_op, limit, stream);
GGML_UNUSED(src1_ddf_i);
}
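
Note the two different "no limit" strategies in this commit. Here, on the mmvq host side, a missing limit is mapped to INFINITY (`limit > 1e-6f ? limit : INFINITY` above), so the clamped kernel runs unconditionally and its min/max against infinity are no-ops. The unary.cu path below instead branches on the host and launches the original unclamped kernel when `limit < 1e-6f`, avoiding the extra per-element min/max altogether. A sketch of the sentinel:

#include <limits>

static float effective_limit(float limit) {
    // min(x, INF) == x and clamp(x, -INF, INF) == x, so an absent limit
    // costs only the comparison inside the kernel.
    return limit > 1e-6f ? limit : std::numeric_limits<float>::infinity();
}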

View File

@@ -40,4 +40,4 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
const ggml_tensor * bias_u, const ggml_tensor * bias_g,
const char * src0_dd_u, const char * src0_dd_g, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, ggml_unary_op unary_op, cudaStream_t stream);
const int64_t src1_padded_row_size, ggml_unary_op unary_op, float limit, cudaStream_t stream);

View File

@@ -61,6 +61,17 @@ static __global__ void fused_mul_silu_f32(const float * x, const float * y, floa
dst[i] = x[i] * y[i] / (1.0f + expf(-x[i]));
}
static __global__ void fused_mul_silu_f32(const float * x, const float * y, float * dst, const int k, float limit) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
float g = x[i] / (1.0f + expf(-x[i]));
g = min(g, limit);
dst[i] = g * max(-limit, min(limit, y[i]));
}
static __global__ void fused_mul_silu_f32(const float * x, float * dst, const int k, const int ne0) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -73,6 +84,20 @@ static __global__ void fused_mul_silu_f32(const float * x, float * dst, const in
dst[i] = x_row[j] * x_row[j + ne0] / (1.0f + expf(-x_row[j + ne0]));
}
static __global__ void fused_mul_silu_f32(const float * x, float * dst, const int k, const int ne0, float limit) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
int row = i / ne0;
int j = i % ne0;
auto x_row = x + 2*row*ne0;
float g = x_row[j + ne0] / (1.0f + expf(-x_row[j + ne0]));
g = min(g, limit);
dst[i] = max(-limit, min(limit, x_row[j])) * g;
}
static __global__ void fused_mul_relu_f32(const float * x, const float * y, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -223,9 +248,13 @@ static void swiglu_f32_cuda(const float * x, float * dst, const int k, const int
}
#endif
static void fused_mul_silu_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
static void fused_mul_silu_f32_cuda(const float * x, const float * y, float * dst, const int k, float limit, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
if (limit < 1e-6f) {
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
} else {
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k, limit);
}
}
static void fused_mul_relu_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
@@ -238,9 +267,13 @@ static void fused_mul_gelu_f32_cuda(const float * x, const float * y, float * ds
fused_mul_gelu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
}
static void fused_mul_silu_f32_cuda(const float * x, float * dst, const int k, const int ne0, cudaStream_t stream) {
static void fused_mul_silu_f32_cuda(const float * x, float * dst, const int k, const int ne0, float limit, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k, ne0);
if (limit < 1e-6f) {
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k, ne0);
} else {
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k, ne0, limit);
}
}
static void fused_mul_relu_f32_cuda(const float * x, float * dst, const int k, const int ne0, cudaStream_t stream) {
@@ -344,12 +377,12 @@ void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
#endif
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
int64_t nelements, const float * src0_d, const float * src1_d, float * dst_d) {
int64_t nelements, const float * src0_d, const float * src1_d, float * dst_d, float limit) {
cudaStream_t stream = ctx.stream();
switch (op) {
case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, src1_d, dst_d, nelements, stream); break;
case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, src1_d, dst_d, nelements, limit, stream); break;
case GGML_UNARY_OP_RELU: fused_mul_relu_f32_cuda(src0_d, src1_d, dst_d, nelements, stream); break;
case GGML_UNARY_OP_GELU: fused_mul_gelu_f32_cuda(src0_d, src1_d, dst_d, nelements, stream); break;
default: GGML_ASSERT(false);
@@ -357,12 +390,12 @@ void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
}
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
int64_t nelements, int64_t ne0, const float * src0_d, float * dst_d) {
int64_t nelements, int64_t ne0, const float * src0_d, float * dst_d, float limit) {
cudaStream_t stream = ctx.stream();
switch (op) {
case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, dst_d, nelements, ne0, stream); break;
case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, dst_d, nelements, ne0, limit, stream); break;
case GGML_UNARY_OP_RELU: fused_mul_relu_f32_cuda(src0_d, dst_d, nelements, ne0, stream); break;
case GGML_UNARY_OP_GELU: fused_mul_gelu_f32_cuda(src0_d, dst_d, nelements, ne0, stream); break;
default: GGML_ASSERT(false);
@@ -373,15 +406,16 @@ void ggml_cuda_op_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_tensor *
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
ggml_unary_op op = (ggml_unary_op)dst->op_params[0];
float limit = *(const float *)(dst->op_params + 1);
GGML_ASSERT(ggml_is_contiguous(src0));
if (src1) {
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src0, src1));
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), (const float *)src0->data, (const float *)src1->data, (float *)dst->data);
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), (const float *)src0->data, (const float *)src1->data, (float *)dst->data, limit);
} else {
GGML_ASSERT(src0->ne[0] == 2*dst->ne[0] && src0->ne[1] == dst->ne[1] && src0->ne[2] == dst->ne[2] && src0->ne[3] == dst->ne[3]);
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), dst->ne[0], (const float *)src0->data, (float *)dst->data);
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), dst->ne[0], (const float *)src0->data, (float *)dst->data, limit);
}
}

View File

@@ -87,9 +87,9 @@ void ggml_swiglu_oai_cuda_f32(const float * x, const float * g, float * dst, con
void ggml_cuda_op_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
int64_t nelements, const float * x, const float * y, float * z);
int64_t nelements, const float * x, const float * y, float * z, float limit = 0);
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
int64_t nelements,int64_t ne0, const float * x, float * z);
int64_t nelements, int64_t ne0, const float * x, float * z, float limit = 0);
void ggml_cuda_op_multi_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -15140,6 +15140,8 @@ static void ggml_compute_forward_fused_mul_unary_f32(
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
enum ggml_unary_op op = (enum ggml_unary_op)dst->op_params[0];
const float limit = *(const float *)(dst->op_params + 1);
//if (params->ith == 0) printf("%s(%s) using limit = %g\n", __func__, dst->name, (double)limit);
GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -15167,7 +15169,19 @@ static void ggml_compute_forward_fused_mul_unary_f32(
switch (op) {
case GGML_UNARY_OP_GELU: ggml_vec_gelu_f32(nc, z, x); ggml_vec_mul_f32(nc, z, z, y); break;
case GGML_UNARY_OP_RELU: ggml_vec_relu_f32(nc, z, x); ggml_vec_mul_f32(nc, z, z, y); break;
case GGML_UNARY_OP_SILU: ggml_vec_mul_silu_f32(nc, z, x, y); break;
case GGML_UNARY_OP_SILU: {
if (limit < 1e-6f) {
ggml_vec_mul_silu_f32(nc, z, x, y);
} else {
// TODO: simdify this
for (int i = 0; i < nc; ++i) {
float gate = ggml_silu_f32(x[i]);
gate = MIN(gate, limit);
float up = MAX(-limit, MIN(limit, y[i]));
z[i] = up * gate;
}
}
} break;
default: GGML_ABORT("fatal error");
}
}
@@ -16673,6 +16687,7 @@ static void ggml_compute_forward_mul_mat_id_up_gate(
ggml_barrier(params->shared);
const float limit = *(const float *)(dst->op_params + 1);
// so GGML_TENSOR_BINARY_OP_LOCALS works
@@ -16704,7 +16719,7 @@ static void ggml_compute_forward_mul_mat_id_up_gate(
vec_dot_type, (const char *)wdata, row_size,
up_b_cur, gate_b_cur,
(float *)dst->data, nb1, nb2,
matrix_rows + cur_a*ne12, ith, nth)) GGML_ABORT("fatal error");
matrix_rows + cur_a*ne12, limit, ith, nth)) GGML_ABORT("fatal error");
}
@@ -16769,6 +16784,8 @@ static void ggml_compute_forward_mul_mat_up_gate(
ggml_barrier(params->shared);
float limit = *(const float *)(dst->op_params + 1);
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
if (!iqk_moe_fused_up_gate(ne01, ne11, ne00, ne11, dst->op_params[0],
@@ -16776,7 +16793,7 @@ static void ggml_compute_forward_mul_mat_up_gate(
vec_dot_type, (const char *)wdata, row_size,
NULL, NULL,
(float *)dst->data, nb1, nb2,
NULL, ith, nth)) GGML_ABORT("fatal error");
NULL, limit, ith, nth)) GGML_ABORT("fatal error");
}
#endif

View File

@@ -135,7 +135,7 @@ struct MulMat {
}
inline void mul_mat_up_gate_NxM(int n, const void * vx_up, const void * vx_gate, size_t bx,
const float * up_b, const float * gate_b,
DataInfo& info, int nrc_x, int nrc_y, int unary_op) {
DataInfo& info, int nrc_x, int nrc_y, int unary_op, float limit) {
#ifdef __aarch64__
constexpr int k_x_step = 64; //8192; // Tiling does not seem to help on my M2 Max (but difference to tiling is small)
#else
@@ -143,7 +143,7 @@ struct MulMat {
#endif
auto op = ggml_unary_op(unary_op);
float tmp[k_x_step*16];
auto process = [&tmp, n, op, vx_gate, vx_up, gate_b, up_b, bx, xstep = k_x_step] (mul_mat_t func, const DataInfo& this_info, int ix, int this_nrc_x, int ny) {
auto process = [&tmp, n, op, vx_gate, vx_up, gate_b, up_b, bx, xstep = k_x_step, limit] (mul_mat_t func, const DataInfo& this_info, int ix, int this_nrc_x, int ny) {
func(n, (const void *)((const char *)vx_gate + ix*bx), bx, this_info, this_nrc_x);
for (int ky = 0; ky < ny; ++ky) {
if (gate_b) {
@@ -152,6 +152,9 @@ struct MulMat {
for (int j = 0; j < this_nrc_x; ++j) x[j] += b[j];
}
activate(op, this_nrc_x, this_info.dst_row(ky), tmp + ky*xstep);
if (limit > 1e-6f) {
for (int j = 0; j < this_nrc_x; ++j) tmp[ky*xstep + j] = std::min(tmp[ky*xstep + j], limit);
}
}
func(n, (const void *)((const char *)vx_up + ix*bx), bx, this_info, this_nrc_x);
for (int ky = 0; ky < ny; ++ky) {
@@ -162,6 +165,8 @@ struct MulMat {
}
if (op == GGML_UNARY_OP_SWIGLU_OAI) {
clamp_oai(this_nrc_x, result);
} else if (limit > 1e-6f) {
for (int j = 0; j < this_nrc_x; ++j) result[j] = std::max(-limit, std::min(limit, result[j]));
}
for (int j = 0; j < this_nrc_x; ++j) result[j] *= tmp[ky*xstep + j];
}
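
On the CPU (iqk_mul_mat) side the clamp is split across the two passes over a tile: the gate matmul result in `tmp` is activated and then capped from above, and the up matmul result is clamped symmetrically in place just before the final multiply — the same math as the CUDA kernels, only interleaved with the tiled matmuls. A contiguous sketch of that ordering for one row of width n:

#include <algorithm>
#include <cmath>

static void clamped_mul_tail(float * result, float * tmp, int n, float limit) {
    for (int j = 0; j < n; ++j) tmp[j] /= 1.0f + std::exp(-tmp[j]);  // SiLU on the gate
    if (limit > 1e-6f) {
        for (int j = 0; j < n; ++j) tmp[j] = std::min(tmp[j], limit);
        for (int j = 0; j < n; ++j) result[j] = std::max(-limit, std::min(limit, result[j]));
    }
    for (int j = 0; j < n; ++j) result[j] *= tmp[j];                 // up * clamped-silu(gate)
}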
@@ -738,7 +743,7 @@ extern "C" IQK_API bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int n
int typeA, const void * Aup, const void * Agate, long strideA,
int typeB, const void * B, long strideB,
const char * up_b_c, const char * gate_b_c,
float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
float * C, long nb1, long nb2, const void * vrow_mapping, float limit, int ith, int nth) {
const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
//assert(row_mapping != nullptr);
@@ -781,7 +786,7 @@ extern "C" IQK_API bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int n
}
auto up_b = up_b_c ? (const float *)up_b_c + first_x + ix : nullptr;
auto gate_b = gate_b_c ? (const float *)gate_b_c + first_x + ix : nullptr;
mm.mul_mat_up_gate_NxM(ne00, Xu, Xg, row_size_qx, up_b, gate_b, this_info, this_nrc_x, Ny, unary_op);
mm.mul_mat_up_gate_NxM(ne00, Xu, Xg, row_size_qx, up_b, gate_b, this_info, this_nrc_x, Ny, unary_op, limit);
}
return true;
@@ -806,7 +811,7 @@ extern "C" IQK_API bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int n
auto up_b = up_b_c ? (const float *)up_b_c + first_x : nullptr;
auto gate_b = gate_b_c ? (const float *)gate_b_c + first_x : nullptr;
mm.mul_mat_up_gate_NxM(ne00, (const char *)Aup + row_size_qx*first_x, (const char *)Agate + row_size_qx*first_x, row_size_qx,
up_b, gate_b, info, nrc_x, Ny, unary_op);
up_b, gate_b, info, nrc_x, Ny, unary_op, limit);
return true;
}
@@ -1406,7 +1411,7 @@ extern "C" IQK_API bool iqk_mul_mat_moe(long, long, long, int, int, const void *
extern "C" IQK_API bool iqk_moe_fused_up_gate(long /*Nx*/, long /*Ny*/, long /*ne00*/, int /*ne11*/, int /*unary_op*/,
int /*typeA*/, const void * /*Aup*/, const void * /*Agate*/, long /*strideA*/,
int /*typeB*/, const void * /*B*/, long /*strideB*/,
float * /*C*/, long /*nb1*/, long /*nb2*/, const void * /*vrow_mapping*/, int /*ith*/, int /*nth*/) {
float * /*C*/, long /*nb1*/, long /*nb2*/, const void * /*vrow_mapping*/, float, int /*ith*/, int /*nth*/) {
GGML_ABORT("Unsupported CPU. You may need to manually set compilation flags\n");
return false;
}

View File

@@ -34,7 +34,7 @@ IQK_API bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int ne11, int un
int typeA, const void * Aup, const void * Agate, long strideA,
int typeB, const void * B, long strideB,
const char * up_b, const char * gate_b,
float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth);
float * C, long nb1, long nb2, const void * vrow_mapping, float limit, int ith, int nth);
IQK_API int iqk_dequant_type(int type, int Ny);

View File

@@ -71,6 +71,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_MIMO2, "mimo2" },
{ LLM_ARCH_SEED_OSS, "seed_oss" },
{ LLM_ARCH_STEP35, "step35" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -128,6 +129,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
{ LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
{ LLM_KV_SWIGLU_LIMITS, "%s.swiglu_limits" },
{ LLM_KV_SWIGLU_LIMITS_SHARED, "%s.swiglu_limits_shared" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -151,8 +154,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER,"%s.rope.dimension_count_per_layer" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_FREQ_BASE_PER_LAYER, "%s.rope.freq_base_per_layer" },
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },

View File

@@ -70,6 +70,7 @@ enum llm_arch {
LLM_ARCH_MISTRAL3,
LLM_ARCH_MIMO2,
LLM_ARCH_SEED_OSS,
LLM_ARCH_STEP35,
LLM_ARCH_UNKNOWN,
};
@@ -121,6 +122,8 @@ enum llm_kv {
LLM_KV_EMBEDDING_SCALE,
LLM_KV_TOKEN_SHIFT_COUNT,
LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
LLM_KV_SWIGLU_LIMITS,
LLM_KV_SWIGLU_LIMITS_SHARED,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -144,9 +147,11 @@ enum llm_kv {
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_FREQ_BASE_SWA,
LLM_KV_ROPE_FREQ_BASE_PER_LAYER,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR,
@@ -238,6 +243,7 @@ enum llm_tensor {
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_ATTN_SINKS,
LLM_TENSOR_ATTN_GATE,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,

View File

@@ -755,6 +755,9 @@ ggml_tensor * llm_build_context::llm_build_ffn(
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
cur = ggml_fused_up_gate(ctx, up, gate, cur, unary_op);
cb(cur, "ffn_up_gate", il);
if (lctx.model.arch == LLM_ARCH_STEP35) {
*(float *)(cur->op_params + 1) = lctx.model.hparams.swiglu_limits_shared[il];
}
if (down) {
cur = llm_build_lora_mm(lctx, ctx, down, cur);
if (lctx.model.arch == LLM_ARCH_GLM4 || lctx.model.arch == LLM_ARCH_GLM4_MOE) {
@@ -828,12 +831,21 @@ ggml_tensor * llm_build_context::llm_build_ffn(
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
cur = ggml_fused_mul_unary(ctx, cur, tmp, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU);
if (lctx.model.arch == LLM_ARCH_STEP35) {
*((float *)(cur->op_params + 1)) = lctx.model.hparams.swiglu_limits_shared[il];
}
}
else {
switch (type_op) {
case LLM_FFN_SILU:
{
if (lctx.model.arch == LLM_ARCH_STEP35) {
cur = ggml_fused_mul_unary(ctx, cur, up, GGML_UNARY_OP_SILU);
*(float *)(cur->op_params + 1) = lctx.model.hparams.swiglu_limits_shared[il];
type_gate = LLM_FFN_SEQ;
break;
}
cur = ggml_silu(ctx, cur);
cb(cur, "ffn_silu", il);
} break;
@@ -1003,7 +1015,7 @@ llm_expert_gating_func_type gating_op,
ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
cb(weights_sum, "ffn_moe_weights_sum", il);
if (lctx.model.arch == LLM_ARCH_BAILINGMOE2) {
if (lctx.model.arch == LLM_ARCH_BAILINGMOE2 || lctx.model.arch == LLM_ARCH_STEP35) {
weights_sum = ggml_scale_bias(ctx, weights_sum, 1.0, 1e-20);
cb(weights_sum, "ffn_moe_weights_sum_biased", il);
}
@@ -1036,7 +1048,7 @@ llm_expert_gating_func_type gating_op,
// Hence, if we have biases, we cannot use fmoe.
//
//bool can_use_fmoe = !up_exps_b && !gate_exps_b && (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU);
bool can_use_fmoe = type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE;
bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE);
ggml_tensor * par;
if (can_use_fmoe && up_gate_exps) {
@@ -1049,6 +1061,9 @@ llm_expert_gating_func_type gating_op,
par = ggml_moe_up_gate(ctx, up_gate_exps, nullptr, cur, selected_experts,
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
}
if (lctx.model.arch == LLM_ARCH_STEP35) {
*((float *)(par->op_params + 1)) = lctx.model.hparams.swiglu_limits[il];
}
} else {
GGML_ASSERT(!up_gate_exps && !up_gate_exps_b);
@@ -1062,6 +1077,9 @@ llm_expert_gating_func_type gating_op,
par = ggml_moe_up_gate(ctx, up_exps, gate_exps, cur, selected_experts,
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
}
if (lctx.model.arch == LLM_ARCH_STEP35) {
*(float *)(par->op_params + 1) = lctx.model.hparams.swiglu_limits[il];
}
} else {
ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(up, "ffn_moe_up", il);
@@ -1087,6 +1105,9 @@ llm_expert_gating_func_type gating_op,
if (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU) {
par = ggml_fused_mul_unary(ctx, gate, up, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
if (lctx.model.arch == LLM_ARCH_STEP35) {
*((float *)(par->op_params + 1)) = lctx.model.hparams.swiglu_limits[il];
}
} else if (type_op == LLM_FFN_SWIGLU_OAI_MOE) {
constexpr float alpha = 1.702f;
constexpr float limit = 7.0f;
@@ -1655,8 +1676,10 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
ggml_tensor * wk, ggml_tensor * bk,
ggml_tensor * wv, ggml_tensor * bv,
ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il, bool add_graph_split) const {
int n_head = hparams.n_head(il);
int n_head_kv = hparams.n_head_kv(il);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(il);
if (wqkv) {
auto qkv = llm_build_lora_mm(lctx, ctx0, wqkv, cur);
if (add_graph_split) {
@@ -3555,6 +3578,147 @@ ggml_cgraph * llm_build_context::build_seedoss() {
return gf;
}
ggml_cgraph * llm_build_context::build_step35() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
ggml_tensor * cur;
auto inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
auto inp_pos = build_inp_pos();
auto inp_out_ids = build_inp_out_ids();
auto KQ_mask = build_inp_KQ_mask();
auto KQ_mask_swa = build_inp_KQ_mask_swa();
//const float kq_scale = 1.0f / sqrtf(float(n_rot));
for (int il = 0; il < n_layer; ++il) {
bool is_swa = hparams.swa_layers[il];
ggml_tensor * inpSA = inpL;
const uint32_t n_head_l = hparams.n_head(il);
const float freq_base_l = hparams.has_rope_freq_base_per_layer ? hparams.rope_freq_base_per_layer[il] :
is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
cur = inpL;
// self-attention
{
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur,
model.layers[il].wqkv, model.layers[il].bqkv,
model.layers[il].wqk, model.layers[il].bqk,
model.layers[il].wq, model.layers[il].bq,
model.layers[il].wk, model.layers[il].bk,
model.layers[il].wv, model.layers[il].bv,
model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, 0.f, il);
ggml_tensor * rope_factors = nullptr;
const uint32_t apply_mask = hparams.rope_scaling_apply_mask;
if ((is_swa && (apply_mask & 0x2)) || (!is_swa && (apply_mask & 0x1))) {
rope_factors = build_rope_factors(il);
}
const int64_t n_rot_l = hparams.rope_n_rot(il);
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur_pos", il);
cb(Kcur, "Kcur_pos", il);
const float kq_scale = 1.0f / sqrtf(float(n_embd_head_k));
auto attn_out = llm_build_kv(ctx0, lctx, kv_self, gf, nullptr, nullptr, // i.e., do not multiply with wo
Kcur, Vcur, Qcur, is_swa ? KQ_mask_swa : KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il,
nullptr, is_swa ? hparams.n_swa : 0);
cb(attn_out, "attn_out", il);
// head-wise attention gate: sigmoid(g_proj(x)) in torch
if (model.layers[il].wqkv_gate) {
auto gate = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv_gate, cur); // [n_head_l, n_tokens]
cb(gate, "attn_gate", il);
gate = ggml_sigmoid(ctx0, gate);
cb(gate, "attn_gate_sigmoid", il);
// reshape + broadcast to [n_embd_head_v, n_head_l, n_tokens]
ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens);
ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
gate_3d = ggml_repeat(ctx0, gate_3d, attn_3d);
cb(gate_3d, "attn_gate_bcast", il);
attn_3d = ggml_mul(ctx0, attn_3d, gate_3d);
cb(attn_3d, "attn_gated_3d", il);
//attn_out = ggml_cont_2d(ctx0, ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens),
// n_embd_head_v * n_head_l, n_tokens);
attn_out = ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens);
cb(attn_out, "attn_gated", il);
}
// output projection
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, attn_out);
cb(cur, "attn_proj", il);
}
if (il == n_layer - 1 && inp_out_ids && n_tokens > 1) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
// feed-forward
if (model.layers[il].ffn_gate_inp == nullptr) {
// dense MLP
cur = llm_build_ffn(ctx0, lctx, nullptr, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr,
nullptr,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
} else {
// MoE routed experts
const bool norm_w = hparams.expert_weights_norm;
const float w_scale = hparams.expert_weights_scale;
const bool scale_w = w_scale != 0.0f;
ggml_tensor * moe_out = llm_build_moe_ffn(ctx0, lctx, cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU,
norm_w, scale_w, w_scale,
LLM_EXPERT_GATING_FUNC_SIGMOID,
cb, il, gf, false, model.layers[il].ffn_up_gate_exps);
cb(moe_out, "ffn_moe_out", il);
// shared expert MLP (always added on MoE layers in Step35)
ggml_tensor * sh_out = llm_build_ffn(ctx0, lctx, nullptr, cur,
model.layers[il].ffn_up_shexp, nullptr, nullptr,
model.layers[il].ffn_gate_shexp, nullptr, nullptr,
model.layers[il].ffn_down_shexp, nullptr, nullptr,
nullptr,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf);
cb(sh_out, "ffn_shared_out", il);
cur = ggml_add(ctx0, moe_out, sh_out);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out_with_inp", il);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
ggml_cgraph * llm_build_context::build_qwen() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -9360,6 +9524,10 @@ ggml_cgraph * llm_build_context::llama_build_graph(
{
result = llm.build_seedoss();
} break;
case LLM_ARCH_STEP35:
{
result = llm.build_step35();
} break;
default:
GGML_ABORT("fatal error");
}
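
build_step35 implements the head-wise attention gate noted in the comment above — sigmoid(g_proj(x)) in torch — as reshape + repeat + mul: one scalar gate per head per token scales that head's whole value vector before the output projection. A plain reference of the per-token math (a sketch; the buffer is assumed [n_embd_head_v, n_head] contiguous, as in the reshape above):

#include <cmath>

static void apply_head_gate(float * attn, const float * gate_logit,
                            int n_embd_head_v, int n_head) {
    for (int h = 0; h < n_head; ++h) {
        const float g = 1.0f / (1.0f + std::exp(-gate_logit[h])); // sigmoid(g_proj(x))[h]
        for (int j = 0; j < n_embd_head_v; ++j) {
            attn[h * n_embd_head_v + j] *= g;                     // broadcast over the head
        }
    }
}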

View File

@@ -280,6 +280,8 @@ struct llm_build_context {
ggml_cgraph * build_seedoss();
ggml_cgraph * build_step35();
//
static ggml_tensor * llm_build_lora_mm(llama_context & lctx, ggml_context * ctx0,
ggml_tensor * w, ggml_tensor * cur);

View File

@@ -1115,6 +1115,38 @@ void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_STEP35:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
//hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
// MoE + SWA parameters
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
// Step35 uses sigmoid gating by default (if not set in GGUF)
if (hparams.expert_gating_func == LLM_EXPERT_GATING_FUNC_TYPE_NONE) {
hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SIGMOID;
}
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, hparams.rope_dim_per_layer, hparams.n_layer);
ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS, hparams.swiglu_limits, hparams.n_layer);
ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS_SHARED, hparams.swiglu_limits_shared, hparams.n_layer);
// Optional: Step35-only gating for applying rope scaling (HF: yarn_only_types).
// Default is 3 (apply on all layers) if the key is absent.
//ml.get_key(format("%s.rope.scaling.apply_mask", ml.get_arch_name().c_str()),
// hparams.rope_scaling_apply_mask, false);
//hparams.has_rope_freq_base_per_layer = ml.get_key_or_arr(
// format("%s.rope.freq_base_per_layer", ml.get_arch_name().c_str()),
// hparams.rope_freq_base_per_layer, hparams.n_layer, false);
ml.get_key(format("%s.rope.scaling.apply_mask", ml.get_arch_name().c_str()),
hparams.rope_scaling_apply_mask, false);
hparams.has_rope_freq_base_per_layer = ml.get_key_or_arr(LLM_KV_ROPE_FREQ_BASE_PER_LAYER,
hparams.rope_freq_base_per_layer, hparams.n_layer, false);
//type = LLM_TYPE_UNKNOWN; <--- what is this?
} break;
default: (void)0;
}

View File

@@ -70,6 +70,8 @@ struct llama_hparams {
float rope_freq_base_train_swa;
float rope_freq_scale_train;
float rope_freq_scale_train_swa;
uint32_t rope_scaling_apply_mask = 0x3;
bool has_rope_freq_base_per_layer = false;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f;
@@ -79,6 +81,8 @@ struct llama_hparams {
float yarn_beta_slow = 1.0f;
std::array<int, 4> rope_sections;
std::array<float, LLAMA_MAX_LAYERS> rope_freq_base_per_layer;
std::array<uint32_t, LLAMA_MAX_LAYERS> rope_dim_per_layer;
// for State Space Models
uint32_t ssm_d_conv = 0;
@@ -124,6 +128,9 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
std::array<float, LLAMA_MAX_LAYERS> swiglu_limits;
std::array<float, LLAMA_MAX_LAYERS> swiglu_limits_shared;
bool operator!=(const llama_hparams & other) const {
if (this->vocab_only != other.vocab_only) return true;
if (this->n_vocab != other.n_vocab) return true;
@@ -265,6 +272,11 @@ struct llama_hparams {
return std::fabs(b - a) <= abs_tol;
}
uint32_t rope_n_rot(uint32_t il) const {
const uint32_t v = rope_dim_per_layer[il];
return v ? v : n_rot;
}
static const char * rope_scaling_type_name(llama_rope_scaling_type);
};

View File

@@ -141,6 +141,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
bool create_seedoss_tensors(const LLM_TN & tn);
bool create_step35_tensors(const LLM_TN & tn);
llama_model_loader & ml;
llama_model & model;
@@ -1026,6 +1028,74 @@ bool create_tensors_helper::create_seedoss_tensors(const LLM_TN & tn) {
return use_mmap_buffer;
}
bool create_tensors_helper::create_step35_tensors(const LLM_TN & tn) {
LOADING_PRELUDE
model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
// STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor
// ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
uint32_t n_rot_max = 0;
for (int i = 0; i < n_layer; ++i) {
n_rot_max = std::max(n_rot_max, hparams.rope_n_rot(i));
}
if (n_rot_max == 0) {
n_rot_max = n_rot;
}
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
const uint32_t n_head_l = hparams.n_head(i);
layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_q_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.attn_k_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, llama_model_loader::TENSOR_NOT_REQUIRED);
// optional rope factors (llama3) / longrope tensors
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
layer.rope_long = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
layer.rope_short = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
} else {
layer.rope_freqs = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
use_mmap_buffer &= !merge_qkv(tn, i, 0);
//layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_l}, 0);
//layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
//layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
// head-wise attention gate (Step35 self_attn.g_proj)
layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
// dense MLP (leading dense blocks)
layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
// MoE routed experts + selection bias (router_bias)
const int64_t n_ff_exp = hparams.n_ff_exp;
if (!layer.ffn_gate) {
layer.ffn_gate_inp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert},
llama_model_loader::TENSOR_NOT_REQUIRED);
use_mmap_buffer &= !create_std_ffn_exps(n_embd, tn, i, n_ff_exp);
//layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert},
// llama_model_loader::TENSOR_NOT_REQUIRED);
//layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert},
// llama_model_loader::TENSOR_NOT_REQUIRED);
//layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert},
// llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_exp_probs_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert},
llama_model_loader::TENSOR_NOT_REQUIRED);
// shared expert MLP
layer.ffn_gate_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp},
llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_up_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp},
llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_down_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd},
llama_model_loader::TENSOR_NOT_REQUIRED);
}
}
return use_mmap_buffer;
}
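
Because the rope factors are stored once per model ("rope_freqs.weight") while rotation dims can differ per layer, the tensor above is sized by the largest per-layer n_rot, and each layer's ggml_rope_ext consumes only its first rope_n_rot(il)/2 entries. A sketch of the sizing, assuming unset per-layer dims fall back to the global n_rot as in hparams.rope_n_rot():

#include <algorithm>
#include <cstdint>

static uint32_t rope_factors_rows(const uint32_t * rope_dim_per_layer,
                                  int n_layer, uint32_t n_rot) {
    uint32_t n_rot_max = 0;
    for (int i = 0; i < n_layer; ++i) {
        const uint32_t v = rope_dim_per_layer[i];
        n_rot_max = std::max(n_rot_max, v ? v : n_rot);
    }
    if (n_rot_max == 0) n_rot_max = n_rot; // all-zero table: use the global dim
    return n_rot_max / 2;
}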
bool create_tensors_helper::create_qwen_tensors(const LLM_TN & tn) {
LOADING_PRELUDE
create_embd_output(tn, n_embd, n_vocab);
@@ -2784,10 +2854,10 @@ bool create_tensors_helper::create_std_ffn_exps(int64_t n_embd, const LLM_TN & t
bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool ignore_attn_scale) {
auto& hparams = model.hparams;
const int64_t n_head = hparams.n_head();
const int64_t n_head_kv = hparams.n_head_kv();
const int64_t n_head = hparams.n_head(i);
const int64_t n_head_kv = hparams.n_head_kv(i);
const int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1); // For Qwen3-VL we need to divide by the number of deepstack layers + 1, for other models n_deepstack_layers value is 0 by default
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_gqa = n_embd_v_gqa;
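
merge_qkv switching from the per-model n_head()/n_head_kv()/n_embd_v_gqa() to the per-layer overloads matters here because the merged QKV tensor's output dimension depends on both head counts, and with Step-3.5's per-layer attention parameters the per-model values could size it wrong. A sketch of the per-layer sizing, assuming K and V share the same head size:

#include <cstdint>

static int64_t merged_qkv_cols(int64_t n_embd_head_k, int64_t n_head, int64_t n_head_kv) {
    // Q rows + K rows + V rows for one layer, assuming equal K/V head sizes.
    return n_embd_head_k * (n_head + 2 * n_head_kv);
}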
@@ -3111,6 +3181,8 @@ bool create_tensors_helper::create_tensors() {
use_mmap_buffer = create_mimo2_tensors(tn); break;
case LLM_ARCH_SEED_OSS:
use_mmap_buffer = create_seedoss_tensors(tn); break;
case LLM_ARCH_STEP35:
use_mmap_buffer = create_step35_tensors(tn); break;
default:
throw std::runtime_error("unknown architecture");
}

View File

@@ -1083,6 +1083,7 @@ template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::str
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
template std::enable_if<std::is_integral<unsigned int>::value, bool>::type llama_model_loader::get_arr_n<unsigned int>(enum llm_kv, unsigned int&, bool);

View File

@@ -1334,6 +1334,37 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_STEP35,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
{ LLM_TENSOR_ROPE_FACTORS_SHORT,"rope_factors_short" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
},
},
{
LLM_ARCH_UNKNOWN,
{

View File

@@ -149,6 +149,7 @@ struct llama_layer {
struct ggml_tensor * ffn_sub_norm = nullptr;
struct ggml_tensor * attn_norm_cross = nullptr;
struct ggml_tensor * attn_norm_enc = nullptr;
struct ggml_tensor * wqkv_gate = nullptr;
// attention
struct ggml_tensor * wq = nullptr;

View File

@@ -5051,6 +5051,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_MINIMAX_M2:
case LLM_ARCH_MIMO2:
case LLM_ARCH_SEED_OSS:
case LLM_ARCH_STEP35:
return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL: