Step-3.5-Flash support (#1231)

* WIP

* This works but is slow

* Turn off the up / gate clamps for now

* OK we need the clamping

* Fuse the clamp (CUDA)

* Fuse the clamp (CPU)

* WIP

* Be able to use merged q, k, v

* Be able to use merged up/gate experts

* Fuse the clamp (CUDA mmvq)
Kawrakow
2026-02-05 08:13:22 +02:00
committed by GitHub
parent 8d952ff183
commit 9c1c74acda
22 changed files with 487 additions and 69 deletions
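
The core change threaded through all of these diffs is a clamped SwiGLU, which Step-3.5-Flash applies with a per-layer limit: the up projection is clamped to [-limit, limit] and the SiLU-activated gate is capped at limit before the two are multiplied. A minimal scalar reference of that math, as a sketch (the name clamped_swiglu is illustrative, not an identifier from this commit):

#include <algorithm>
#include <cmath>

// out = clamp(up, -limit, limit) * min(silu(gate), limit)
static float clamped_swiglu(float up, float gate, float limit) {
    float g = gate / (1.0f + std::exp(-gate));        // SiLU(gate)
    g = std::min(g, limit);                           // gate is capped from above only
    float u = std::max(-limit, std::min(limit, up));  // up is clamped symmetrically
    return u * g;
}

With limit = INFINITY both clamps are identities, which is how the fused kernels below keep a single code path when a model has no limit set.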

View File

@@ -2626,12 +2626,13 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
((ggml_backend_cuda_buffer_context *)next->buffer->context)->device == device_id;
auto unary_op = (ggml_unary_op)dst->op_params[0];
float limit = *(const float *)(dst->op_params + 1);
if (src0_2) {
ggml_cuda_op_fused_mul_mat_vec_q_id(ctx, src0_1, &local_src1, ids, &local_dst,
dst->src[4], dst->src[5],
(const char *)src0_1->data, src0_2 ? (const char *)src0_2->data : nullptr,
(const float *)src1->data, src1_quantized.get(),
(float *)local_dst.data, 0, src0_1->ne[1], 1, src1_padded_col_size, unary_op, stream);
(float *)local_dst.data, 0, src0_1->ne[1], 1, src1_padded_col_size, unary_op, limit, stream);
} else {
auto local_src0_1 = *src0_1;
local_src0_1.ne[1] /= 2;
@@ -2642,7 +2643,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
nullptr, nullptr,
(const char *)local_src0_1.data, (const char *)local_src0_2.data,
(const float *)src1->data, src1_quantized.get(),
(float *)local_dst.data, 0, local_src0_1.ne[1], 1, src1_padded_col_size, unary_op, stream);
(float *)local_dst.data, 0, local_src0_1.ne[1], 1, src1_padded_col_size, unary_op, limit, stream);
} else {
GGML_ASSERT(!dst->src[5]);
auto local_bias_1 = *dst->src[4];
@@ -2653,7 +2654,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
&local_bias_1, &local_bias_2,
(const char *)local_src0_1.data, (const char *)local_src0_2.data,
(const float *)src1->data, src1_quantized.get(),
(float *)local_dst.data, 0, local_src0_1.ne[1], 1, src1_padded_col_size, unary_op, stream);
(float *)local_dst.data, 0, local_src0_1.ne[1], 1, src1_padded_col_size, unary_op, limit, stream);
}
}
CUDA_CHECK(cudaGetLastError());
@@ -2773,9 +2774,11 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
(float *)dst->data, ggml_nelements(dst), dst_row.ne[0], dst_row.ne[0], dst_row.ne[0],
1.702f, 7.0f, stream);
} else {
float limit = *((const float *)(dst->op_params + 1));
//printf("%s: using limit = %g\n", __func__, limit);
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row),
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
(float *)dst->data);
(float *)dst->data, limit);
}
} else {
@@ -2801,8 +2804,10 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
(float *)dst->data, ggml_nelements(dst), dst->ne[0], src0_1->ne[1], src0_1->ne[1],
1.702f, 7.0f, stream);
} else {
float limit = *((const float *)(dst->op_params + 1));
//printf("%s: using limit = %g\n", __func__, limit);
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst), dst->ne[0],
(const float *)dst_up_gate_contiguous.get(), (float *)dst->data);
(const float *)dst_up_gate_contiguous.get(), (float *)dst->data, limit);
}
}
CUDA_CHECK(cudaGetLastError());
@@ -2970,6 +2975,8 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
}
auto unary_op = (ggml_unary_op)dst->op_params[0];
float limit = *(const float *)(dst->op_params + 1);
//printf("%s: using limit = %g\n", __func__, limit);
if (src0_2) {
dst_row.data = dst_gate_contiguous.get();
if (use_quantized_src1) {
@@ -2993,7 +3000,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
} else {
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row),
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
(float *)dst_gate_contiguous.get());
(float *)dst_gate_contiguous.get(), limit);
}
} else {
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
@@ -3002,7 +3009,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
1.702f, 7.0f, stream);
} else {
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(&dst_row)/2, dst->ne[0],
(const float *)dst_up_contiguous.get(), (float *)dst_gate_contiguous.get());
(const float *)dst_up_contiguous.get(), (float *)dst_gate_contiguous.get(), limit);
}
dst_row.data = dst_gate_contiguous.get();
dst_row.ne[0] /= 2;
@@ -3065,6 +3072,8 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
auto stream = ctx.stream();
float limit = *(const float *)(dst->op_params + 1);
auto ne10_padded = GGML_PAD(src1->ne[0], MATRIX_ROW_PADDING);
auto nb10_padded = ne10_padded*sizeof(block_q8_1)/QK8_1;
auto quantized_size = nb10_padded*src1->ne[1];
@@ -3083,7 +3092,7 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
dst->src[4], dst->src[5],
(const char *)src0_1->data, (const char *)src0_2->data, (const float *)src1->data, src1_quantized.get(),
(float *)dst->data, 0, src0_1->ne[1], 1, ne10_padded,
(ggml_unary_op)dst->op_params[0], stream);
(ggml_unary_op)dst->op_params[0], limit, stream);
return;
}
@@ -3116,8 +3125,9 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
}
}
//printf("%s: using limit = %g\n", __func__, limit);
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst),
(const float *)dst->data, dst_up.get(), (float *)dst->data);
(const float *)dst->data, dst_up.get(), (float *)dst->data, limit);
CUDA_CHECK(cudaGetLastError());
}
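
The limit never gets its own argument at the graph level: it rides in the ggml node itself, with the unary op id in op_params[0] and the float limit type-punned into op_params[1]. The graph builder writes it (see the llama-build-context changes further down) and every backend reads it back with casts like the `*(const float *)(dst->op_params + 1)` above. A strict-aliasing-safe sketch of the same round trip (helper names are hypothetical):

#include <cstdint>
#include <cstring>

static void set_limit(int32_t * op_params, float limit) {
    std::memcpy(&op_params[1], &limit, sizeof(float)); // slot 0 holds the ggml_unary_op
}

static float get_limit(const int32_t * op_params) {
    float limit;
    std::memcpy(&limit, &op_params[1], sizeof(float));
    return limit;
}

Since freshly created nodes have zero-initialized op_params, a limit of 0 doubles as "no clamp", which is why the backends test `limit < 1e-6f` (or map it to INFINITY) rather than comparing against exactly zero.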

View File

@@ -105,7 +105,7 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
const void * __restrict__ vup, const void * __restrict__ vgate, const void * __restrict__ vy, float * __restrict__ dst,
const float * __restrict__ bias_u, const float * __restrict__ bias_g,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size,
ggml_unary_op unary_op) {
ggml_unary_op unary_op, float limit) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
@@ -191,7 +191,12 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
float g = tmp_g[j][threadIdx.x];
float r;
switch (unary_op) {
case GGML_UNARY_OP_SILU: r = u*g/(1 + expf(-g)); break;
case GGML_UNARY_OP_SILU:
{
g = g/(1 + expf(-g));
g = min(g, limit);
r = max(-limit, min(limit, u))*g;
} break;
case GGML_UNARY_OP_RELU: r = fmaxf(g, 0.0f) * u; break;
case GGML_UNARY_OP_GELU: {
constexpr float GELU_COEF_A = 0.044715f;
@@ -243,7 +248,7 @@ static __global__ void iqk_fused_mul_mat_vec_q(
const void * __restrict__ vx_u, const void * __restrict__ vx_g, const void * __restrict__ vy, float * __restrict__ dst,
const char * __restrict__ ids_data, const void * __restrict__ bias_u, const void * __restrict__ bias_g, const uint64_t bias_nb1,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size,
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, ggml_unary_op unary_op) {
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, ggml_unary_op unary_op, float limit) {
int i2 = blockIdx.y;
int i02 = ids_data ? *(const int *)(ids_data + i2*ids_nb0) : i2;
@@ -256,7 +261,7 @@ static __global__ void iqk_fused_mul_mat_vec_q(
char * cdst = (char *)dst + i2*nb2;
iqk_fused_mul_mat_vec_q_kernel<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(
cx_u, cx_g, cy, (float *)cdst, cx_u_b, cx_g_b,
ncols_x, nrows_x, nrows_y, nrows_dst, row_size, unary_op);
ncols_x, nrows_x, nrows_y, nrows_dst, row_size, unary_op, limit);
}
template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int n_interleaved = 1>
@@ -307,56 +312,56 @@ static void iqk_mul_mat_vec_q_cuda(const mmvq_args & args, cudaStream_t stream)
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 2:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 2, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 3:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 3, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 4:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 4, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 5:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 5, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 6:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 6, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 7:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 7, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
case 8:
iqk_fused_mul_mat_vec_q<type, vdr, vec_dot_q_cuda, 8, n_interleaved><<<block_nums, block_dims, 0, stream>>>(
args.vx_u, args.vx_g, args.vy, args.dst,
args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, row_size,
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op, args.limit);
break;
default:
GGML_ASSERT(false);
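
The eight near-identical cases above are the usual mmvq dispatch: the runtime column count of the activation (1..8) is switched onto a compile-time template parameter so the kernel's per-column loops fully unroll, and the new limit is simply forwarded through every case. Condensed, the pattern looks like this (a sketch, not the real kernel signature):

template <int ncols_y> static void kernel_stub(float limit) { (void)limit; /* ncols_y is a compile-time constant */ }

static void dispatch(int ncols_y, float limit) {
    switch (ncols_y) {
        case 1: kernel_stub<1>(limit); break;
        case 2: kernel_stub<2>(limit); break;
        // cases 3..7 follow the same pattern
        case 8: kernel_stub<8>(limit); break;
        default: break; // unsupported batch size
    }
}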

View File

@@ -22,5 +22,6 @@ struct mmvq_args {
const uint64_t ids_nb0;
const uint64_t bias_nb1;
ggml_unary_op unary_op;
float limit;
};

View File

@@ -154,7 +154,7 @@ static __device__ void fused_mul_mat_vec_q(
const void * __restrict__ vup, const void * __restrict__ vgate,
const float * __restrict__ bias_u, const float * __restrict__ bias_g,
const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, ggml_unary_op unary_op) {
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, ggml_unary_op unary_op, float limit) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
@@ -243,7 +243,12 @@ static __device__ void fused_mul_mat_vec_q(
float g = tmp_g[j][threadIdx.x];
float r;
switch (unary_op) {
case GGML_UNARY_OP_SILU: r = u*g/(1 + expf(-g)); break;
case GGML_UNARY_OP_SILU:
{
g = g/(1 + expf(-g));
g = min(g, limit);
r = max(-limit, min(limit, u))*g;
} break;
case GGML_UNARY_OP_RELU: r = fmaxf(g, 0.0f) * u; break;
case GGML_UNARY_OP_GELU: {
constexpr float GELU_COEF_A = 0.044715f;
@@ -299,7 +304,7 @@ static __global__ void fused_mul_mat_vec_q(
const void * __restrict__ vy, float * __restrict__ dst, const char * __restrict__ ids_data,
const void * __restrict__ bias_u, const void * __restrict__ bias_g, const uint64_t bias_nb1,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst,
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, ggml_unary_op unary_op) {
const uint64_t nb02, const uint64_t nb12, const uint64_t nb2, const int64_t ids_nb0, ggml_unary_op unary_op, float limit) {
int i2 = blockIdx.y;
char * cdst = (char *)dst + i2*nb2;
@@ -312,7 +317,8 @@ static __global__ void fused_mul_mat_vec_q(
const float * cx_u_b = bias_u ? (const float *)((const char *)bias_u + i02*bias_nb1) : nullptr;
const float * cx_g_b = bias_g ? (const float *)((const char *)bias_g + i02*bias_nb1) : nullptr;
const char * cy = (const char *)vy + i2*nb12;
fused_mul_mat_vec_q<type, ncols_y, nwarps>(cx_u, cx_g, cx_u_b, cx_g_b, cy, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, unary_op);
fused_mul_mat_vec_q<type, ncols_y, nwarps>(cx_u, cx_g, cx_u_b, cx_g_b, cy, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst,
unary_op, limit);
}
template <ggml_type type, int nwarps>
@@ -335,42 +341,50 @@ static void mul_mat_vec_q_cuda_T(const mmvq_args & args, cudaStream_t stream) {
case 1:
fused_mul_mat_vec_q<type, 1, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 2:
fused_mul_mat_vec_q<type, 2, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 3:
fused_mul_mat_vec_q<type, 3, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 4:
fused_mul_mat_vec_q<type, 4, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 5:
fused_mul_mat_vec_q<type, 5, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 6:
fused_mul_mat_vec_q<type, 6, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 7:
fused_mul_mat_vec_q<type, 7, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
case 8:
fused_mul_mat_vec_q<type, 8, nwarps><<<block_nums, block_dims, 0, stream>>>(args.vx_u, args.vx_g, args.vy,
args.dst, args.ids_data, args.bias_u, args.bias_g, args.bias_nb1,
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0, args.unary_op);
args.ncols_x, args.nrows_x, args.nrows_y, args.nrows_dst, args.nb02, args.nb12, args.nb2, args.ids_nb0,
args.unary_op, args.limit);
break;
default:
GGML_ABORT("fatal error");

View File

@@ -13,7 +13,7 @@ static void ggml_cuda_op_mul_mat_vec_q_impl(ggml_backend_cuda_context & ctx, ggm
const char * src0_dd_u, const char * src0_dd_g, const char * src1_ddq_i, float * dst_dd_i, const char * ids_data,
const void * bias_u, const void * bias_g,
const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, ggml_unary_op unary_op, cudaStream_t stream) {
const int64_t src1_padded_row_size, ggml_unary_op unary_op, float limit, cudaStream_t stream) {
const int64_t row_diff = row_high - row_low;
@@ -41,7 +41,8 @@ static void ggml_cuda_op_mul_mat_vec_q_impl(ggml_backend_cuda_context & ctx, ggm
/* nb2 */ uint64_t(nb2),
/* ids_nb0 */ uint64_t(ids_nb0),
/* bias_nb1 */ uint64_t(bias_nb1),
/* unary_op */ unary_op
/* unary_op */ unary_op,
/* limit */ limit > 1e-6f ? limit : INFINITY
};
switch (type) {
@@ -163,7 +164,7 @@ void ggml_cuda_op_mul_mat_vec_q_3D(
src0->nb[2], src1_row_size, dst->nb[2], 0, 0,
src0_dd_i, nullptr, src1_ddq_i, dst_dd_i, nullptr, nullptr, nullptr,
row_low, row_high, src1_ncols,
src1_padded_row_size, GGML_UNARY_OP_COUNT, stream);
src1_padded_row_size, GGML_UNARY_OP_COUNT, 0.0f, stream);
GGML_UNUSED(src1_ddf_i);
}
@@ -199,7 +200,7 @@ void ggml_cuda_op_mul_mat_vec_q_biased(
ne00, ne0, 1, 0, 0, 0, 0, 0,
src0_dd_i, nullptr, src1_ddq_i, dst_dd_i, nullptr, bias ? bias->data : nullptr, nullptr,
row_low, row_high, src1_ncols,
src1_padded_row_size, GGML_UNARY_OP_COUNT, stream);
src1_padded_row_size, GGML_UNARY_OP_COUNT, 0.0f, stream);
GGML_UNUSED(src1_ddf_i);
}
@@ -246,7 +247,7 @@ void ggml_cuda_op_mul_mat_vec_q_id(
src0->nb[2], src1->nb[2], dst->nb[2], ids->nb[0], bias ? bias->nb[1] : 0,
src0_dd_i, nullptr, src1_ddq_i, dst_dd_i, (const char *)ids->data, bias ? bias->data : nullptr, nullptr,
row_low, row_high, src1_ncols,
src1_padded_row_size, GGML_UNARY_OP_COUNT, stream);
src1_padded_row_size, GGML_UNARY_OP_COUNT, 0.0f, stream);
GGML_UNUSED(src1_ddf_i);
}
@@ -256,7 +257,7 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
const ggml_tensor * bias_u, const ggml_tensor * bias_g,
const char * src0_dd_u, const char * src0_dd_g, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, ggml_unary_op unary_op, cudaStream_t stream) {
const int64_t src1_padded_row_size, ggml_unary_op unary_op, float limit, cudaStream_t stream) {
if (!bias_u && !bias_g) {
GGML_ASSERT(unary_op == GGML_UNARY_OP_SILU ||
@@ -294,7 +295,7 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
src0_dd_u, src0_dd_g, src1_ddq_i, dst_dd_i, ids ? (const char *)ids->data : nullptr,
bias_u ? bias_u->data : nullptr, bias_g ? bias_g->data : nullptr,
row_low, row_high, src1_ncols,
src1_padded_row_size, unary_op, stream);
src1_padded_row_size, unary_op, limit, stream);
GGML_UNUSED(src1_ddf_i);
}
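
Note the two different "no limit" strategies in this commit. Here, on the mmvq host side, a missing limit is mapped to INFINITY (`limit > 1e-6f ? limit : INFINITY` above), so the clamped kernel runs unconditionally and its min/max against infinity are no-ops. The unary.cu path below instead branches on the host and launches the original unclamped kernel when `limit < 1e-6f`, avoiding the extra per-element min/max altogether. A sketch of the sentinel:

#include <limits>

static float effective_limit(float limit) {
    // min(x, INF) == x and clamp(x, -INF, INF) == x, so an absent limit
    // costs only the comparison inside the kernel.
    return limit > 1e-6f ? limit : std::numeric_limits<float>::infinity();
}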

View File

@@ -40,4 +40,4 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
const ggml_tensor * bias_u, const ggml_tensor * bias_g,
const char * src0_dd_u, const char * src0_dd_g, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, ggml_unary_op unary_op, cudaStream_t stream);
const int64_t src1_padded_row_size, ggml_unary_op unary_op, float limit, cudaStream_t stream);

View File

@@ -61,6 +61,17 @@ static __global__ void fused_mul_silu_f32(const float * x, const float * y, floa
dst[i] = x[i] * y[i] / (1.0f + expf(-x[i]));
}
static __global__ void fused_mul_silu_f32(const float * x, const float * y, float * dst, const int k, float limit) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
float g = x[i] / (1.0f + expf(-x[i]));
g = min(g, limit);
dst[i] = g * max(-limit, min(limit, y[i]));
}
static __global__ void fused_mul_silu_f32(const float * x, float * dst, const int k, const int ne0) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -73,6 +84,20 @@ static __global__ void fused_mul_silu_f32(const float * x, float * dst, const in
dst[i] = x_row[j] * x_row[j + ne0] / (1.0f + expf(-x_row[j + ne0]));
}
static __global__ void fused_mul_silu_f32(const float * x, float * dst, const int k, const int ne0, float limit) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
int row = i / ne0;
int j = i % ne0;
auto x_row = x + 2*row*ne0;
float g = x_row[j + ne0] / (1.0f + expf(-x_row[j + ne0]));
g = min(g, limit);
dst[i] = max(-limit, min(limit, x_row[j])) * g;
}
static __global__ void fused_mul_relu_f32(const float * x, const float * y, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -223,9 +248,13 @@ static void swiglu_f32_cuda(const float * x, float * dst, const int k, const int
}
#endif
static void fused_mul_silu_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
static void fused_mul_silu_f32_cuda(const float * x, const float * y, float * dst, const int k, float limit, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
if (limit < 1e-6f) {
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
} else {
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k, limit);
}
}
static void fused_mul_relu_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
@@ -238,9 +267,13 @@ static void fused_mul_gelu_f32_cuda(const float * x, const float * y, float * ds
fused_mul_gelu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
}
static void fused_mul_silu_f32_cuda(const float * x, float * dst, const int k, const int ne0, cudaStream_t stream) {
static void fused_mul_silu_f32_cuda(const float * x, float * dst, const int k, const int ne0, float limit, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k, ne0);
if (limit < 1e-6f) {
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k, ne0);
} else {
fused_mul_silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k, ne0, limit);
}
}
static void fused_mul_relu_f32_cuda(const float * x, float * dst, const int k, const int ne0, cudaStream_t stream) {
@@ -344,12 +377,12 @@ void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
#endif
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
int64_t nelements, const float * src0_d, const float * src1_d, float * dst_d) {
int64_t nelements, const float * src0_d, const float * src1_d, float * dst_d, float limit) {
cudaStream_t stream = ctx.stream();
switch (op) {
case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, src1_d, dst_d, nelements, stream); break;
case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, src1_d, dst_d, nelements, limit, stream); break;
case GGML_UNARY_OP_RELU: fused_mul_relu_f32_cuda(src0_d, src1_d, dst_d, nelements, stream); break;
case GGML_UNARY_OP_GELU: fused_mul_gelu_f32_cuda(src0_d, src1_d, dst_d, nelements, stream); break;
default: GGML_ASSERT(false);
@@ -357,12 +390,12 @@ void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
}
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
int64_t nelements, int64_t ne0, const float * src0_d, float * dst_d) {
int64_t nelements, int64_t ne0, const float * src0_d, float * dst_d, float limit) {
cudaStream_t stream = ctx.stream();
switch (op) {
case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, dst_d, nelements, ne0, stream); break;
case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, dst_d, nelements, ne0, limit, stream); break;
case GGML_UNARY_OP_RELU: fused_mul_relu_f32_cuda(src0_d, dst_d, nelements, ne0, stream); break;
case GGML_UNARY_OP_GELU: fused_mul_gelu_f32_cuda(src0_d, dst_d, nelements, ne0, stream); break;
default: GGML_ASSERT(false);
@@ -373,15 +406,16 @@ void ggml_cuda_op_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_tensor *
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
ggml_unary_op op = (ggml_unary_op)dst->op_params[0];
float limit = *(const float *)(dst->op_params + 1);
GGML_ASSERT(ggml_is_contiguous(src0));
if (src1) {
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src0, src1));
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), (const float *)src0->data, (const float *)src1->data, (float *)dst->data);
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), (const float *)src0->data, (const float *)src1->data, (float *)dst->data, limit);
} else {
GGML_ASSERT(src0->ne[0] == 2*dst->ne[0] && src0->ne[1] == dst->ne[1] && src0->ne[2] == dst->ne[2] && src0->ne[3] == dst->ne[3]);
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), dst->ne[0], (const float *)src0->data, (float *)dst->data);
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), dst->ne[0], (const float *)src0->data, (float *)dst->data, limit);
}
}

View File

@@ -87,9 +87,9 @@ void ggml_swiglu_oai_cuda_f32(const float * x, const float * g, float * dst, con
void ggml_cuda_op_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
int64_t nelements, const float * x, const float * y, float * z);
int64_t nelements, const float * x, const float * y, float * z, float limit = 0);
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
int64_t nelements,int64_t ne0, const float * x, float * z);
int64_t nelements, int64_t ne0, const float * x, float * z, float limit = 0);
void ggml_cuda_op_multi_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -15140,6 +15140,8 @@ static void ggml_compute_forward_fused_mul_unary_f32(
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
enum ggml_unary_op op = (enum ggml_unary_op)dst->op_params[0];
const float limit = *(const float *)(dst->op_params + 1);
//if (params->ith == 0) printf("%s(%s) using limit = %g\n", __func__, dst->name, (double)limit);
GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
@@ -15167,7 +15169,19 @@ static void ggml_compute_forward_fused_mul_unary_f32(
switch (op) {
case GGML_UNARY_OP_GELU: ggml_vec_gelu_f32(nc, z, x); ggml_vec_mul_f32(nc, z, z, y); break;
case GGML_UNARY_OP_RELU: ggml_vec_relu_f32(nc, z, x); ggml_vec_mul_f32(nc, z, z, y); break;
case GGML_UNARY_OP_SILU: ggml_vec_mul_silu_f32(nc, z, x, y); break;
case GGML_UNARY_OP_SILU: {
if (limit < 1e-6f) {
ggml_vec_mul_silu_f32(nc, z, x, y);
} else {
// TODO: simdify this
for (int i = 0; i < nc; ++i) {
float gate = ggml_silu_f32(x[i]);
gate = MIN(gate, limit);
float up = MAX(-limit, MIN(limit, y[i]));
z[i] = up * gate;
}
}
} break;
default: GGML_ABORT("fatal error");
}
}
@@ -16673,6 +16687,7 @@ static void ggml_compute_forward_mul_mat_id_up_gate(
ggml_barrier(params->shared);
const float limit = *(const float *)(dst->op_params + 1);
// so GGML_TENSOR_BINARY_OP_LOCALS works
@@ -16704,7 +16719,7 @@ static void ggml_compute_forward_mul_mat_id_up_gate(
vec_dot_type, (const char *)wdata, row_size,
up_b_cur, gate_b_cur,
(float *)dst->data, nb1, nb2,
matrix_rows + cur_a*ne12, ith, nth)) GGML_ABORT("fatal error");
matrix_rows + cur_a*ne12, limit, ith, nth)) GGML_ABORT("fatal error");
}
@@ -16769,6 +16784,8 @@ static void ggml_compute_forward_mul_mat_up_gate(
ggml_barrier(params->shared);
float limit = *(const float *)(dst->op_params + 1);
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
if (!iqk_moe_fused_up_gate(ne01, ne11, ne00, ne11, dst->op_params[0],
@@ -16776,7 +16793,7 @@ static void ggml_compute_forward_mul_mat_up_gate(
vec_dot_type, (const char *)wdata, row_size,
NULL, NULL,
(float *)dst->data, nb1, nb2,
NULL, ith, nth)) GGML_ABORT("fatal error");
NULL, limit, ith, nth)) GGML_ABORT("fatal error");
}
#endif

View File

@@ -135,7 +135,7 @@ struct MulMat {
}
inline void mul_mat_up_gate_NxM(int n, const void * vx_up, const void * vx_gate, size_t bx,
const float * up_b, const float * gate_b,
DataInfo& info, int nrc_x, int nrc_y, int unary_op) {
DataInfo& info, int nrc_x, int nrc_y, int unary_op, float limit) {
#ifdef __aarch64__
constexpr int k_x_step = 64; //8192; // Tiling does not seem to help on my M2 Max (but difference to tiling is small)
#else
@@ -143,7 +143,7 @@ struct MulMat {
#endif
auto op = ggml_unary_op(unary_op);
float tmp[k_x_step*16];
auto process = [&tmp, n, op, vx_gate, vx_up, gate_b, up_b, bx, xstep = k_x_step] (mul_mat_t func, const DataInfo& this_info, int ix, int this_nrc_x, int ny) {
auto process = [&tmp, n, op, vx_gate, vx_up, gate_b, up_b, bx, xstep = k_x_step, limit] (mul_mat_t func, const DataInfo& this_info, int ix, int this_nrc_x, int ny) {
func(n, (const void *)((const char *)vx_gate + ix*bx), bx, this_info, this_nrc_x);
for (int ky = 0; ky < ny; ++ky) {
if (gate_b) {
@@ -152,6 +152,9 @@ struct MulMat {
for (int j = 0; j < this_nrc_x; ++j) x[j] += b[j];
}
activate(op, this_nrc_x, this_info.dst_row(ky), tmp + ky*xstep);
if (limit > 1e-6f) {
for (int j = 0; j < this_nrc_x; ++j) tmp[ky*xstep + j] = std::min(tmp[ky*xstep + j], limit);
}
}
func(n, (const void *)((const char *)vx_up + ix*bx), bx, this_info, this_nrc_x);
for (int ky = 0; ky < ny; ++ky) {
@@ -162,6 +165,8 @@ struct MulMat {
}
if (op == GGML_UNARY_OP_SWIGLU_OAI) {
clamp_oai(this_nrc_x, result);
} else if (limit > 1e-6f) {
for (int j = 0; j < this_nrc_x; ++j) result[j] = std::max(-limit, std::min(limit, result[j]));
}
for (int j = 0; j < this_nrc_x; ++j) result[j] *= tmp[ky*xstep + j];
}
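
On the CPU (iqk_mul_mat) side the clamp is split across the two passes over a tile: the gate matmul result in `tmp` is activated and then capped from above, and the up matmul result is clamped symmetrically in place just before the final multiply — the same math as the CUDA kernels, only interleaved with the tiled matmuls. A contiguous sketch of that ordering for one row of width n:

#include <algorithm>
#include <cmath>

static void clamped_mul_tail(float * result, float * tmp, int n, float limit) {
    for (int j = 0; j < n; ++j) tmp[j] /= 1.0f + std::exp(-tmp[j]);  // SiLU on the gate
    if (limit > 1e-6f) {
        for (int j = 0; j < n; ++j) tmp[j] = std::min(tmp[j], limit);
        for (int j = 0; j < n; ++j) result[j] = std::max(-limit, std::min(limit, result[j]));
    }
    for (int j = 0; j < n; ++j) result[j] *= tmp[j];                 // up * clamped-silu(gate)
}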
@@ -738,7 +743,7 @@ extern "C" IQK_API bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int n
int typeA, const void * Aup, const void * Agate, long strideA,
int typeB, const void * B, long strideB,
const char * up_b_c, const char * gate_b_c,
float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
float * C, long nb1, long nb2, const void * vrow_mapping, float limit, int ith, int nth) {
const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
//assert(row_mapping != nullptr);
@@ -781,7 +786,7 @@ extern "C" IQK_API bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int n
}
auto up_b = up_b_c ? (const float *)up_b_c + first_x + ix : nullptr;
auto gate_b = gate_b_c ? (const float *)gate_b_c + first_x + ix : nullptr;
mm.mul_mat_up_gate_NxM(ne00, Xu, Xg, row_size_qx, up_b, gate_b, this_info, this_nrc_x, Ny, unary_op);
mm.mul_mat_up_gate_NxM(ne00, Xu, Xg, row_size_qx, up_b, gate_b, this_info, this_nrc_x, Ny, unary_op, limit);
}
return true;
@@ -806,7 +811,7 @@ extern "C" IQK_API bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int n
auto up_b = up_b_c ? (const float *)up_b_c + first_x : nullptr;
auto gate_b = gate_b_c ? (const float *)gate_b_c + first_x : nullptr;
mm.mul_mat_up_gate_NxM(ne00, (const char *)Aup + row_size_qx*first_x, (const char *)Agate + row_size_qx*first_x, row_size_qx,
up_b, gate_b, info, nrc_x, Ny, unary_op);
up_b, gate_b, info, nrc_x, Ny, unary_op, limit);
return true;
}
@@ -1406,7 +1411,7 @@ extern "C" IQK_API bool iqk_mul_mat_moe(long, long, long, int, int, const void *
extern "C" IQK_API bool iqk_moe_fused_up_gate(long /*Nx*/, long /*Ny*/, long /*ne00*/, int /*ne11*/, int /*unary_op*/,
int /*typeA*/, const void * /*Aup*/, const void * /*Agate*/, long /*strideA*/,
int /*typeB*/, const void * /*B*/, long /*strideB*/,
float * /*C*/, long /*nb1*/, long /*nb2*/, const void * /*vrow_mapping*/, int /*ith*/, int /*nth*/) {
float * /*C*/, long /*nb1*/, long /*nb2*/, const void * /*vrow_mapping*/, float, int /*ith*/, int /*nth*/) {
GGML_ABORT("Unsupported CPU. You may need to manually set compilation flags\n");
return false;
}

View File

@@ -34,7 +34,7 @@ IQK_API bool iqk_moe_fused_up_gate(long Nx, long Ny, long ne00, int ne11, int un
int typeA, const void * Aup, const void * Agate, long strideA,
int typeB, const void * B, long strideB,
const char * up_b, const char * gate_b,
float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth);
float * C, long nb1, long nb2, const void * vrow_mapping, float limit, int ith, int nth);
IQK_API int iqk_dequant_type(int type, int Ny);

View File

@@ -71,6 +71,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_MIMO2, "mimo2" },
{ LLM_ARCH_SEED_OSS, "seed_oss" },
{ LLM_ARCH_STEP35, "step35" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -128,6 +129,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
{ LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
{ LLM_KV_SWIGLU_LIMITS, "%s.swiglu_limits" },
{ LLM_KV_SWIGLU_LIMITS_SHARED, "%s.swiglu_limits_shared" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -151,8 +154,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER,"%s.rope.dimension_count_per_layer" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_FREQ_BASE_PER_LAYER, "%s.rope.freq_base_per_layer" },
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },

View File

@@ -70,6 +70,7 @@ enum llm_arch {
LLM_ARCH_MISTRAL3,
LLM_ARCH_MIMO2,
LLM_ARCH_SEED_OSS,
LLM_ARCH_STEP35,
LLM_ARCH_UNKNOWN,
};
@@ -121,6 +122,8 @@ enum llm_kv {
LLM_KV_EMBEDDING_SCALE,
LLM_KV_TOKEN_SHIFT_COUNT,
LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
LLM_KV_SWIGLU_LIMITS,
LLM_KV_SWIGLU_LIMITS_SHARED,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -144,9 +147,11 @@ enum llm_kv {
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_FREQ_BASE_SWA,
LLM_KV_ROPE_FREQ_BASE_PER_LAYER,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR,
@@ -238,6 +243,7 @@ enum llm_tensor {
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_ATTN_SINKS,
LLM_TENSOR_ATTN_GATE,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,

View File

@@ -755,6 +755,9 @@ ggml_tensor * llm_build_context::llm_build_ffn(
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
cur = ggml_fused_up_gate(ctx, up, gate, cur, unary_op);
cb(cur, "ffn_up_gate", il);
if (lctx.model.arch == LLM_ARCH_STEP35) {
*(float *)(cur->op_params + 1) = lctx.model.hparams.swiglu_limits_shared[il];
}
if (down) {
cur = llm_build_lora_mm(lctx, ctx, down, cur);
if (lctx.model.arch == LLM_ARCH_GLM4 || lctx.model.arch == LLM_ARCH_GLM4_MOE) {
@@ -828,12 +831,21 @@ ggml_tensor * llm_build_context::llm_build_ffn(
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
cur = ggml_fused_mul_unary(ctx, cur, tmp, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU);
if (lctx.model.arch == LLM_ARCH_STEP35) {
*((float *)(cur->op_params + 1)) = lctx.model.hparams.swiglu_limits_shared[il];
}
}
else {
switch (type_op) {
case LLM_FFN_SILU:
{
if (lctx.model.arch == LLM_ARCH_STEP35) {
cur = ggml_fused_mul_unary(ctx, cur, up, GGML_UNARY_OP_SILU);
*(float *)(cur->op_params + 1) = lctx.model.hparams.swiglu_limits_shared[il];
type_gate = LLM_FFN_SEQ;
break;
}
cur = ggml_silu(ctx, cur);
cb(cur, "ffn_silu", il);
} break;
@@ -1003,7 +1015,7 @@ llm_expert_gating_func_type gating_op,
ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
cb(weights_sum, "ffn_moe_weights_sum", il);
if (lctx.model.arch == LLM_ARCH_BAILINGMOE2) {
if (lctx.model.arch == LLM_ARCH_BAILINGMOE2 || lctx.model.arch == LLM_ARCH_STEP35) {
weights_sum = ggml_scale_bias(ctx, weights_sum, 1.0, 1e-20);
cb(weights_sum, "ffn_moe_weights_sum_biased", il);
}
@@ -1036,7 +1048,7 @@ llm_expert_gating_func_type gating_op,
// Hence, if we have biases, we cannot use fmoe.
//
//bool can_use_fmoe = !up_exps_b && !gate_exps_b && (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU);
bool can_use_fmoe = type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE;
bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE);
ggml_tensor * par;
if (can_use_fmoe && up_gate_exps) {
@@ -1049,6 +1061,9 @@ llm_expert_gating_func_type gating_op,
par = ggml_moe_up_gate(ctx, up_gate_exps, nullptr, cur, selected_experts,
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
}
if (lctx.model.arch == LLM_ARCH_STEP35) {
*((float *)(par->op_params + 1)) = lctx.model.hparams.swiglu_limits[il];
}
} else {
GGML_ASSERT(!up_gate_exps && !up_gate_exps_b);
@@ -1062,6 +1077,9 @@ llm_expert_gating_func_type gating_op,
par = ggml_moe_up_gate(ctx, up_exps, gate_exps, cur, selected_experts,
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
}
if (lctx.model.arch == LLM_ARCH_STEP35) {
*(float *)(par->op_params + 1) = lctx.model.hparams.swiglu_limits[il];
}
} else {
ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(up, "ffn_moe_up", il);
@@ -1087,6 +1105,9 @@ llm_expert_gating_func_type gating_op,
if (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU) {
par = ggml_fused_mul_unary(ctx, gate, up, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
if (lctx.model.arch == LLM_ARCH_STEP35) {
*((float *)(par->op_params + 1)) = lctx.model.hparams.swiglu_limits[il];
}
} else if (type_op == LLM_FFN_SWIGLU_OAI_MOE) {
constexpr float alpha = 1.702f;
constexpr float limit = 7.0f;
@@ -1655,8 +1676,10 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
ggml_tensor * wk, ggml_tensor * bk,
ggml_tensor * wv, ggml_tensor * bv,
ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il, bool add_graph_split) const {
int n_head = hparams.n_head(il);
int n_head_kv = hparams.n_head_kv(il);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(il);
if (wqkv) {
auto qkv = llm_build_lora_mm(lctx, ctx0, wqkv, cur);
if (add_graph_split) {
@@ -3555,6 +3578,147 @@ ggml_cgraph * llm_build_context::build_seedoss() {
return gf;
}
ggml_cgraph * llm_build_context::build_step35() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
ggml_tensor * cur;
auto inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
auto inp_pos = build_inp_pos();
auto inp_out_ids = build_inp_out_ids();
auto KQ_mask = build_inp_KQ_mask();
auto KQ_mask_swa = build_inp_KQ_mask_swa();
//const float kq_scale = 1.0f / sqrtf(float(n_rot));
for (int il = 0; il < n_layer; ++il) {
bool is_swa = hparams.swa_layers[il];
ggml_tensor * inpSA = inpL;
const uint32_t n_head_l = hparams.n_head(il);
const float freq_base_l = hparams.has_rope_freq_base_per_layer ? hparams.rope_freq_base_per_layer[il] :
is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
cur = inpL;
// self-attention
{
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur,
model.layers[il].wqkv, model.layers[il].bqkv,
model.layers[il].wqk, model.layers[il].bqk,
model.layers[il].wq, model.layers[il].bq,
model.layers[il].wk, model.layers[il].bk,
model.layers[il].wv, model.layers[il].bv,
model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, 0.f, il);
ggml_tensor * rope_factors = nullptr;
const uint32_t apply_mask = hparams.rope_scaling_apply_mask;
if ((is_swa && (apply_mask & 0x2)) || (!is_swa && (apply_mask & 0x1))) {
rope_factors = build_rope_factors(il);
}
const int64_t n_rot_l = hparams.rope_n_rot(il);
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur_pos", il);
cb(Kcur, "Kcur_pos", il);
const float kq_scale = 1.0f / sqrtf(float(n_embd_head_k));
auto attn_out = llm_build_kv(ctx0, lctx, kv_self, gf, nullptr, nullptr, // i.e., do not multiply with wo
Kcur, Vcur, Qcur, is_swa ? KQ_mask_swa : KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il,
nullptr, is_swa ? hparams.n_swa : 0);
cb(attn_out, "attn_out", il);
// head-wise attention gate: sigmoid(g_proj(x)) in torch
if (model.layers[il].wqkv_gate) {
auto gate = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv_gate, cur); // [n_head_l, n_tokens]
cb(gate, "attn_gate", il);
gate = ggml_sigmoid(ctx0, gate);
cb(gate, "attn_gate_sigmoid", il);
// reshape + broadcast to [n_embd_head_v, n_head_l, n_tokens]
ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens);
ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
gate_3d = ggml_repeat(ctx0, gate_3d, attn_3d);
cb(gate_3d, "attn_gate_bcast", il);
attn_3d = ggml_mul(ctx0, attn_3d, gate_3d);
cb(attn_3d, "attn_gated_3d", il);
//attn_out = ggml_cont_2d(ctx0, ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens),
// n_embd_head_v * n_head_l, n_tokens);
attn_out = ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens);
cb(attn_out, "attn_gated", il);
}
// output projection
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, attn_out);
cb(cur, "attn_proj", il);
}
if (il == n_layer - 1 && inp_out_ids && n_tokens > 1) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
// feed-forward
if (model.layers[il].ffn_gate_inp == nullptr) {
// dense MLP
cur = llm_build_ffn(ctx0, lctx, nullptr, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr,
nullptr,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
} else {
// MoE routed experts
const bool norm_w = hparams.expert_weights_norm;
const float w_scale = hparams.expert_weights_scale;
const bool scale_w = w_scale != 0.0f;
ggml_tensor * moe_out = llm_build_moe_ffn(ctx0, lctx, cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU,
norm_w, scale_w, w_scale,
LLM_EXPERT_GATING_FUNC_SIGMOID,
cb, il, gf, false, model.layers[il].ffn_up_gate_exps);
cb(moe_out, "ffn_moe_out", il);
// shared expert MLP (always added on MoE layers in Step35)
ggml_tensor * sh_out = llm_build_ffn(ctx0, lctx, nullptr, cur,
model.layers[il].ffn_up_shexp, nullptr, nullptr,
model.layers[il].ffn_gate_shexp, nullptr, nullptr,
model.layers[il].ffn_down_shexp, nullptr, nullptr,
nullptr,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf);
cb(sh_out, "ffn_shared_out", il);
cur = ggml_add(ctx0, moe_out, sh_out);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out_with_inp", il);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
ggml_cgraph * llm_build_context::build_qwen() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -9360,6 +9524,10 @@ ggml_cgraph * llm_build_context::llama_build_graph(
{
result = llm.build_seedoss();
} break;
case LLM_ARCH_STEP35:
{
result = llm.build_step35();
} break;
default:
GGML_ABORT("fatal error");
}
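
build_step35 implements the head-wise attention gate noted in the comment above — sigmoid(g_proj(x)) in torch — as reshape + repeat + mul: one scalar gate per head per token scales that head's whole value vector before the output projection. A plain reference of the per-token math (a sketch; the buffer is assumed [n_embd_head_v, n_head] contiguous, as in the reshape above):

#include <cmath>

static void apply_head_gate(float * attn, const float * gate_logit,
                            int n_embd_head_v, int n_head) {
    for (int h = 0; h < n_head; ++h) {
        const float g = 1.0f / (1.0f + std::exp(-gate_logit[h])); // sigmoid(g_proj(x))[h]
        for (int j = 0; j < n_embd_head_v; ++j) {
            attn[h * n_embd_head_v + j] *= g;                     // broadcast over the head
        }
    }
}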

View File

@@ -280,6 +280,8 @@ struct llm_build_context {
ggml_cgraph * build_seedoss();
ggml_cgraph * build_step35();
//
static ggml_tensor * llm_build_lora_mm(llama_context & lctx, ggml_context * ctx0,
ggml_tensor * w, ggml_tensor * cur);

View File

@@ -1115,6 +1115,38 @@ void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_STEP35:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
//hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
// MoE + SWA parameters
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
// Step35 uses sigmoid gating by default (if not set in GGUF)
if (hparams.expert_gating_func == LLM_EXPERT_GATING_FUNC_TYPE_NONE) {
hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SIGMOID;
}
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, hparams.rope_dim_per_layer, hparams.n_layer);
ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS, hparams.swiglu_limits, hparams.n_layer);
ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS_SHARED, hparams.swiglu_limits_shared, hparams.n_layer);
// Optional: Step35-only gating for applying rope scaling (HF: yarn_only_types).
// Default is 3 (apply on all layers) if the key is absent.
//ml.get_key(format("%s.rope.scaling.apply_mask", ml.get_arch_name().c_str()),
// hparams.rope_scaling_apply_mask, false);
//hparams.has_rope_freq_base_per_layer = ml.get_key_or_arr(
// format("%s.rope.freq_base_per_layer", ml.get_arch_name().c_str()),
// hparams.rope_freq_base_per_layer, hparams.n_layer, false);
ml.get_key(format("%s.rope.scaling.apply_mask", ml.get_arch_name().c_str()),
hparams.rope_scaling_apply_mask, false);
hparams.has_rope_freq_base_per_layer = ml.get_key_or_arr(LLM_KV_ROPE_FREQ_BASE_PER_LAYER,
hparams.rope_freq_base_per_layer, hparams.n_layer, false);
//type = LLM_TYPE_UNKNOWN; <--- what is this?
} break;
default: (void)0;
}

View File

@@ -70,6 +70,8 @@ struct llama_hparams {
float rope_freq_base_train_swa;
float rope_freq_scale_train;
float rope_freq_scale_train_swa;
uint32_t rope_scaling_apply_mask = 0x3;
bool has_rope_freq_base_per_layer = false;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f;
@@ -79,6 +81,8 @@ struct llama_hparams {
float yarn_beta_slow = 1.0f;
std::array<int, 4> rope_sections;
std::array<float, LLAMA_MAX_LAYERS> rope_freq_base_per_layer;
std::array<uint32_t, LLAMA_MAX_LAYERS> rope_dim_per_layer;
// for State Space Models
uint32_t ssm_d_conv = 0;
@@ -124,6 +128,9 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
std::array<float, LLAMA_MAX_LAYERS> swiglu_limits;
std::array<float, LLAMA_MAX_LAYERS> swiglu_limits_shared;
bool operator!=(const llama_hparams & other) const {
if (this->vocab_only != other.vocab_only) return true;
if (this->n_vocab != other.n_vocab) return true;
@@ -265,6 +272,11 @@ struct llama_hparams {
return std::fabs(b - a) <= abs_tol;
}
uint32_t rope_n_rot(uint32_t il) const {
const uint32_t v = rope_dim_per_layer[il];
return v ? v : n_rot;
}
static const char * rope_scaling_type_name(llama_rope_scaling_type);
};

View File

@@ -141,6 +141,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
bool create_seedoss_tensors(const LLM_TN & tn);
bool create_step35_tensors(const LLM_TN & tn);
llama_model_loader & ml;
llama_model & model;
@@ -1026,6 +1028,74 @@ bool create_tensors_helper::create_seedoss_tensors(const LLM_TN & tn) {
return use_mmap_buffer;
}
bool create_tensors_helper::create_step35_tensors(const LLM_TN & tn) {
LOADING_PRELUDE
model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
// STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor
// ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
uint32_t n_rot_max = 0;
for (int i = 0; i < n_layer; ++i) {
n_rot_max = std::max(n_rot_max, hparams.rope_n_rot(i));
}
if (n_rot_max == 0) {
n_rot_max = n_rot;
}
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
const uint32_t n_head_l = hparams.n_head(i);
layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_q_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.attn_k_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, llama_model_loader::TENSOR_NOT_REQUIRED);
// optional rope factors (llama3) / longrope tensors
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
layer.rope_long = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
layer.rope_short = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
} else {
layer.rope_freqs = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
use_mmap_buffer &= !merge_qkv(tn, i, 0);
//layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_l}, 0);
//layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
//layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
// head-wise attention gate (Step35 self_attn.g_proj)
layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
// dense MLP (leading dense blocks)
layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
// MoE routed experts + selection bias (router_bias)
const int64_t n_ff_exp = hparams.n_ff_exp;
if (!layer.ffn_gate) {
layer.ffn_gate_inp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert},
llama_model_loader::TENSOR_NOT_REQUIRED);
use_mmap_buffer &= !create_std_ffn_exps(n_embd, tn, i, n_ff_exp);
//layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert},
// llama_model_loader::TENSOR_NOT_REQUIRED);
//layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert},
// llama_model_loader::TENSOR_NOT_REQUIRED);
//layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert},
// llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_exp_probs_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert},
llama_model_loader::TENSOR_NOT_REQUIRED);
// shared expert MLP
layer.ffn_gate_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp},
llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_up_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp},
llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_down_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd},
llama_model_loader::TENSOR_NOT_REQUIRED);
}
}
return use_mmap_buffer;
}
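
Because the rope factors are stored once per model ("rope_freqs.weight") while rotation dims can differ per layer, the tensor above is sized by the largest per-layer n_rot, and each layer's ggml_rope_ext consumes only its first rope_n_rot(il)/2 entries. A sketch of the sizing, assuming unset per-layer dims fall back to the global n_rot as in hparams.rope_n_rot():

#include <algorithm>
#include <cstdint>

static uint32_t rope_factors_rows(const uint32_t * rope_dim_per_layer,
                                  int n_layer, uint32_t n_rot) {
    uint32_t n_rot_max = 0;
    for (int i = 0; i < n_layer; ++i) {
        const uint32_t v = rope_dim_per_layer[i];
        n_rot_max = std::max(n_rot_max, v ? v : n_rot);
    }
    if (n_rot_max == 0) n_rot_max = n_rot; // all-zero table: use the global dim
    return n_rot_max / 2;
}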
bool create_tensors_helper::create_qwen_tensors(const LLM_TN & tn) {
LOADING_PRELUDE
create_embd_output(tn, n_embd, n_vocab);
@@ -2784,10 +2854,10 @@ bool create_tensors_helper::create_std_ffn_exps(int64_t n_embd, const LLM_TN & t
bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool ignore_attn_scale) {
auto& hparams = model.hparams;
const int64_t n_head = hparams.n_head();
const int64_t n_head_kv = hparams.n_head_kv();
const int64_t n_head = hparams.n_head(i);
const int64_t n_head_kv = hparams.n_head_kv(i);
const int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1); // For Qwen3-VL we need to divide by the number of deepstack layers + 1, for other models n_deepstack_layers value is 0 by default
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_gqa = n_embd_v_gqa;
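
merge_qkv switching from the per-model n_head()/n_head_kv()/n_embd_v_gqa() to the per-layer overloads matters here because the merged QKV tensor's output dimension depends on both head counts, and with Step-3.5's per-layer attention parameters the per-model values could size it wrong. A sketch of the per-layer sizing, assuming K and V share the same head size:

#include <cstdint>

static int64_t merged_qkv_cols(int64_t n_embd_head_k, int64_t n_head, int64_t n_head_kv) {
    // Q rows + K rows + V rows for one layer, assuming equal K/V head sizes.
    return n_embd_head_k * (n_head + 2 * n_head_kv);
}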
@@ -3111,6 +3181,8 @@ bool create_tensors_helper::create_tensors() {
use_mmap_buffer = create_mimo2_tensors(tn); break;
case LLM_ARCH_SEED_OSS:
use_mmap_buffer = create_seedoss_tensors(tn); break;
case LLM_ARCH_STEP35:
use_mmap_buffer = create_step35_tensors(tn); break;
default:
throw std::runtime_error("unknown architecture");
}

View File

@@ -1083,6 +1083,7 @@ template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::str
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
template std::enable_if<std::is_integral<unsigned int>::value, bool>::type llama_model_loader::get_arr_n<unsigned int>(enum llm_kv, unsigned int&, bool);

View File

@@ -1334,6 +1334,37 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_STEP35,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
{ LLM_TENSOR_ROPE_FACTORS_SHORT,"rope_factors_short" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
},
},
{
LLM_ARCH_UNKNOWN,
{

View File

@@ -149,6 +149,7 @@ struct llama_layer {
struct ggml_tensor * ffn_sub_norm = nullptr;
struct ggml_tensor * attn_norm_cross = nullptr;
struct ggml_tensor * attn_norm_enc = nullptr;
struct ggml_tensor * wqkv_gate = nullptr;
// attention
struct ggml_tensor * wq = nullptr;

View File

@@ -5051,6 +5051,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_MINIMAX_M2:
case LLM_ARCH_MIMO2:
case LLM_ARCH_SEED_OSS:
case LLM_ARCH_STEP35:
return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL: