WIP - not working

This commit is contained in:
Kawrakow
2026-01-11 12:29:38 +02:00
parent 738dc60b78
commit 6ba5772b07
8 changed files with 249 additions and 55 deletions

View File

@@ -1,4 +1,5 @@
//
// Copyright (C) 2023-2024 The ggml authors
// Copyright (C) 2024 Iwan Kawrakow
// MIT license
@@ -2487,23 +2488,24 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
if (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1 &&
ggml_is_quantized(src0_1->type) &&
ggml_is_quantized(src0_2->type) &&
(!src0_2 || ggml_is_quantized(src0_2->type)) &&
ggml_backend_buffer_is_cuda(src0_1->buffer) &&
ggml_backend_buffer_is_cuda(src0_2->buffer) &&
(!src0_2 || ggml_backend_buffer_is_cuda(src0_2->buffer)) &&
ggml_backend_buffer_is_cuda(src1->buffer) &&
ggml_backend_buffer_is_cuda(dst->buffer) &&
src1->type == GGML_TYPE_F32) {
int device_id = ctx.device;
ggml_backend_cuda_buffer_context * src0_1_ctx = (ggml_backend_cuda_buffer_context *) src0_1->buffer->context;
ggml_backend_cuda_buffer_context * src0_2_ctx = (ggml_backend_cuda_buffer_context *) src0_2->buffer->context;
ggml_backend_cuda_buffer_context * src0_2_ctx = src0_2 ? (ggml_backend_cuda_buffer_context *) src0_2->buffer->context : nullptr;
ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
return i;
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
if (src0_1_ctx->device == device_id &&
src0_2_ctx->device == device_id &&
(!src0_2_ctx || src0_2_ctx->device == device_id) &&
src1_ctx->device == device_id &&
dst_ctx->device == device_id) {
//printf("%s(%s, %s): %ld x %ld x %ld, %ld x %ld x %ld, %ld x %ld x %ld\n", __func__, src0_1->name, src0_2->name,
// src0->ne[0], src0->ne[1], src0->ne[2], src1->ne[0], src1->ne[1], src1->ne[2], ids->ne[0], ids->ne[1], ids->ne[2]);
// Fast TG path
const int64_t n_ids = ids->ne[0];
auto stream = ctx.stream(device_id, 0);
@@ -2518,7 +2520,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
const int64_t src1_padded_col_size = GGML_PAD(src1->ne[0], MATRIX_ROW_PADDING);
ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool());
if (ggml_is_quantized(src0_1->type) || ggml_is_quantized(src0_2->type)) {
if (ggml_is_quantized(src0_1->type) || (src0_2 && ggml_is_quantized(src0_2->type))) {
GGML_ASSERT(src1->ne[0] % QK8_1 == 0);
auto src_1_ddq_size = src1_padded_col_size*sizeof(block_q8_1)/QK8_1;
local_src1.data = src1_quantized.alloc(src_1_ddq_size);
@@ -2540,7 +2542,8 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
auto unary_op = (ggml_unary_op)dst->op_params[0];
ggml_cuda_op_fused_mul_mat_vec_q_id(ctx, src0_1, &local_src1, ids, &local_dst,
dst->src[4], dst->src[5],
(const char *)src0_1->data, (const char *)src0_2->data, (const float *)src1->data, src1_quantized.get(),
(const char *)src0_1->data, src0_2 ? (const char *)src0_2->data : nullptr,
(const float *)src1->data, src1_quantized.get(),
(float *)local_dst.data, 0, src0_1->ne[1], 1, src1_padded_col_size, unary_op, stream);
CUDA_CHECK(cudaGetLastError());
@@ -2608,7 +2611,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
// looks like it really depends just on the total number of experts.
// TODO: verify with more models, or perhaps define the magic constant '32' via a compile-time define.
if (src1->ne[2] <= ctx.mmq_id_thresh*src0->ne[2] &&
ggml_is_quantized(src0_1->type) && src0_1->type == src0_2->type && src1->ne[1] == 1 && src1->ne[3] == 1 &&
ggml_is_quantized(src0_1->type) && (!src0_2 || src0_1->type == src0_2->type) && src1->ne[1] == 1 && src1->ne[3] == 1 &&
ggml_cuda_can_use_mmq_id(src0_1->type, ggml_cuda_info().devices[ctx.device].cc, src1->ne[2])) {
const int64_t ne_get_rows = ne12 * n_ids;
@@ -2631,6 +2634,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
src0_1->type, ne10, src1->nb[1] / ts_src1, src1->nb[2] / ts_src1, src1->nb[2] / ts_src1,
ne10_padded, ne11_flat, 1, 1, stream);
if (src0_2) {
ggml_cuda_pool_alloc<char> dst_up_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
ggml_cuda_pool_alloc<char> dst_gate_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
@@ -2662,6 +2666,34 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
(const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(),
(float *)dst->data);
}
} else {
ggml_cuda_pool_alloc<char> dst_up_gate_contiguous(ctx.pool(), 2*sizeof(float)*ggml_nelements(dst));
dst_row.ne[0] *= 2;
dst_row.nb[1] *= 2;
dst_row.nb[2] *= 2;
dst_row.nb[3] *= 2;
dst_row.data = dst_up_gate_contiguous.get();
ggml_cuda_mul_mat_q_id(ctx, src0_1, src1, ids, &dst_row, (char *)ids_device.get(), src1_quantized.get());
if (dst->src[4]) {
GGML_ASSERT(!dst->src[5]);
ggml_cuda_add_id((const float *)dst_row.data, (const float *)dst->src[4]->data, (const int32_t *)ids->data,
(float *)dst_row.data, dst_row.ne[0], dst_row.ne[1], dst_row.ne[2], dst_row.ne[0], dst_row.ne[1],
dst_row.nb[1], dst_row.nb[2], dst->src[4]->nb[1], ids->nb[1], stream);
CUDA_CHECK(cudaGetLastError());
}
auto unary_op = (ggml_unary_op)dst->op_params[0];
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
ggml_swiglu_oai_cuda_f32((const float *)dst_up_gate_contiguous.get(), (const float *)dst_up_gate_contiguous.get() + dst->ne[0],
(float *)dst->data, ggml_nelements(dst), dst->ne[0], dst->ne[0], dst->ne[0],
1.702f, 7.0f, stream);
} else {
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst), dst->ne[0],
(const float *)dst_up_gate_contiguous.get(),
(float *)dst->data);
}
}
CUDA_CHECK(cudaGetLastError());
if (next && next->op == GGML_OP_MUL_MAT_ID && ggml_is_quantized(next->src[0]->type) &&
@@ -3603,7 +3635,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
auto src0_2 = node->src[1];
auto src1 = node->src[2];
if (src1->ne[1] != 1 || src1->ne[2] != 1 || src1->ne[3] != 1 || src1->type != GGML_TYPE_F32 ||
!ggml_is_quantized(src0_1->type) || !ggml_is_quantized(src0_2->type)) {
!ggml_is_quantized(src0_1->type) || (src0_2 && !ggml_is_quantized(src0_2->type))) {
use_cuda_graph = false;
} else {
if (i < cgraph->n_nodes-1) {
@@ -3967,8 +3999,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
bool is_fused_up_gate = op->op == GGML_OP_MOE_FUSED_UP_GATE || op->op == GGML_OP_FUSED_UP_GATE;
struct ggml_tensor * a = op->src[0];
struct ggml_tensor * b = is_fused_up_gate ? op->src[2] : op->src[1];
if (is_fused_up_gate && a->type != op->src[1]->type) {
printf("%s: returning false for GGML_OP_MOE_FUSED_UP_GATE because src0->type != src1->type\n", __func__);
if (is_fused_up_gate && op->src[1] && a->type != op->src[1]->type) {
fprintf(stderr, "%s: returning false for GGML_OP_MOE_FUSED_UP_GATE because src0->type != src1->type\n", __func__);
return false;
}
//==================================================================

View File

@@ -61,6 +61,18 @@ static __global__ void fused_mul_silu_f32(const float * x, const float * y, floa
dst[i] = x[i] * y[i] / (1.0f + expf(-x[i]));
}
// Fused mul + SiLU over a single "packed" input: each logical row of x holds
// 2*ne0 floats — the first half is passed through SiLU and multiplied by the
// second half. One thread per output element; dst has k = nrows*ne0 elements.
// NOTE(review): which half is "up" and which is "gate" depends on the packed
// layout produced by the caller — confirm against the merged-tensor layout.
static __global__ void fused_mul_silu_f32(const float * x, float * dst, const int k, const int ne0) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    int row = i / ne0;
    int j = i % ne0;
    auto x_row = x + 2*row*ne0;   // start of this row's 2*ne0 packed values
    // Bug fix: the SiLU argument must be x_row[j] (current row); the previous
    // code read x[j], which always indexed the FIRST row, so every row after
    // row 0 was scaled by the wrong sigmoid. The relu/gelu siblings already
    // use x_row consistently.
    dst[i] = x_row[j] * x_row[j + ne0] / (1.0f + expf(-x_row[j]));
}
static __global__ void fused_mul_relu_f32(const float * x, const float * y, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -70,6 +82,18 @@ static __global__ void fused_mul_relu_f32(const float * x, const float * y, floa
dst[i] = fmaxf(x[i], 0) * y[i];
}
// Fused mul + ReLU over a packed input: each logical row of x carries 2*ne0
// floats; the output is relu(first half) * (second half), one thread per
// output element (dst has k elements, bounds-checked below).
static __global__ void fused_mul_relu_f32(const float * x, float * dst, const int k, const int ne0) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i < k) {
        const int row = i / ne0;
        const int col = i % ne0;
        const float * xr = x + 2*row*ne0;   // this row's packed pair of halves
        dst[i] = fmaxf(xr[col], 0) * xr[col + ne0];
    }
}
static __global__ void fused_mul_gelu_f32(const float * x, const float * y, float * dst, const int k) {
constexpr float GELU_COEF_A = 0.044715f;
constexpr float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
@@ -82,6 +106,21 @@ static __global__ void fused_mul_gelu_f32(const float * x, const float * y, floa
dst[i] = 0.5f*xi*y[i]*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
}
// Fused mul + GELU (tanh approximation) over a packed input: each logical row
// of x carries 2*ne0 floats; the output is gelu(first half) * (second half),
// one thread per output element.
static __global__ void fused_mul_gelu_f32(const float * x, float * dst, const int k, const int ne0) {
    constexpr float GELU_COEF_A    = 0.044715f;
    constexpr float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    const int row = i / ne0;
    const int col = i % ne0;
    const float * xr = x + 2*row*ne0;   // this row's packed pair of halves
    const float v = xr[col];
    // Same evaluation order as the two-tensor variant so results match bit-for-bit.
    dst[i] = 0.5f*v*xr[col + ne0]*(1.0f + tanhf(SQRT_2_OVER_PI*v*(1.0f + GELU_COEF_A*v*v)));
}
static __global__ void tanh_f32(const float * x, float * dst, int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
@@ -199,6 +238,21 @@ static void fused_mul_gelu_f32_cuda(const float * x, const float * y, float * ds
fused_mul_gelu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
}
// Host launcher for the packed fused mul+SiLU kernel. Grid is ceil(k / block)
// so the tail is covered; the kernel bounds-checks i < k.
static void fused_mul_silu_f32_cuda(const float * x, float * dst, const int k, const int ne0, cudaStream_t stream) {
    const int nblocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
    fused_mul_silu_f32<<<nblocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k, ne0);
}
// Host launcher for the packed fused mul+ReLU kernel.
// Bug fix: the grid was sized with CUDA_RELU_BLOCK_SIZE but the kernel was
// launched with CUDA_SILU_BLOCK_SIZE threads per block; if the two constants
// differ, either tail elements are never processed or the grid is oversized.
// The block size used in <<<...>>> must match the one used for the ceil-div.
static void fused_mul_relu_f32_cuda(const float * x, float * dst, const int k, const int ne0, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
    fused_mul_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, ne0);
}
// Host launcher for the packed fused mul+GELU kernel.
// Bug fix: the grid was sized with CUDA_GELU_BLOCK_SIZE but the kernel was
// launched with CUDA_SILU_BLOCK_SIZE threads per block — the launch block
// size must match the constant used to compute the grid size.
static void fused_mul_gelu_f32_cuda(const float * x, float * dst, const int k, const int ne0, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
    fused_mul_gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, ne0);
}
static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -302,29 +356,33 @@ void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
}
}
// Packed-input overload: dispatches the single-tensor fused mul+unary kernels,
// where src0_d holds 2*ne0 floats per logical row and dst_d receives
// nelements outputs. Only SILU/RELU/GELU are supported; anything else asserts.
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
        int64_t nelements, int64_t ne0, const float * src0_d, float * dst_d) {
    cudaStream_t stream = ctx.stream();
    if (op == GGML_UNARY_OP_SILU) {
        fused_mul_silu_f32_cuda(src0_d, dst_d, nelements, ne0, stream);
    } else if (op == GGML_UNARY_OP_RELU) {
        fused_mul_relu_f32_cuda(src0_d, dst_d, nelements, ne0, stream);
    } else if (op == GGML_UNARY_OP_GELU) {
        fused_mul_gelu_f32_cuda(src0_d, dst_d, nelements, ne0, stream);
    } else {
        GGML_ASSERT(false);
    }
}
void ggml_cuda_op_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src0, src1));
ggml_unary_op op = (ggml_unary_op)dst->op_params[0];
GGML_ASSERT(ggml_is_contiguous(src0));
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), (const float *)src0->data, (const float *)src1->data, (float *)dst->data);
//cudaStream_t stream = ctx.stream();
//const float * src0_d = (const float *)src0->data;
//const float * src1_d = (const float *)src1->data;
//float * dst_d = (float *)dst->data;
//switch (op) {
// case GGML_UNARY_OP_SILU: fused_mul_silu_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), stream); break;
// case GGML_UNARY_OP_RELU: fused_mul_relu_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), stream); break;
// case GGML_UNARY_OP_GELU: fused_mul_gelu_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), stream); break;
// default: GGML_ASSERT(false);
//}
if (src1) {
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src0, src1));
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), (const float *)src0->data, (const float *)src1->data, (float *)dst->data);
} else {
GGML_ASSERT(src0->ne[0] == 2*dst->ne[0] && src0->ne[1] == dst->ne[1] && src0->ne[2] == dst->ne[2] && src0->ne[3] == dst->ne[3]);
ggml_fused_mul_unary(ctx, op, ggml_nelements(dst), dst->ne[0], (const float *)src0->data, (float *)dst->data);
}
}
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

View File

@@ -89,4 +89,7 @@ void ggml_cuda_op_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_tensor *
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
int64_t nelements, const float * x, const float * y, float * z);
void ggml_fused_mul_unary(ggml_backend_cuda_context & ctx, ggml_unary_op op,
int64_t nelements,int64_t ne0, const float * x, float * z);
void ggml_cuda_op_multi_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -7628,13 +7628,13 @@ struct ggml_tensor * ggml_moe_up_gate(
struct ggml_tensor * b,
struct ggml_tensor * ids,
enum ggml_unary_op op) {
if (as_up->type != as_gate->type || !ggml_are_same_shape(as_up, as_gate)) {
if (as_gate && (as_up->type != as_gate->type || !ggml_are_same_shape(as_up, as_gate))) {
struct ggml_tensor * result_up = ggml_mul_mat_id(ctx, as_up, b, ids);
struct ggml_tensor * result_gate = ggml_mul_mat_id(ctx, as_gate, b, ids);
return ggml_fused_mul_unary(ctx, result_gate, result_up, op);
}
GGML_ASSERT(!ggml_is_transposed(as_up));
GGML_ASSERT(!ggml_is_transposed(as_gate));
GGML_ASSERT(!as_gate || !ggml_is_transposed(as_gate));
GGML_ASSERT(ids->type == GGML_TYPE_I32);
GGML_ASSERT(as_up->ne[3] == 1); // as is 3d (one matrix per expert)
@@ -7650,7 +7650,7 @@ struct ggml_tensor * ggml_moe_up_gate(
is_node = true;
}
const int64_t ne[4] = { as_up->ne[1], ids->ne[0], b->ne[2], 1 };
const int64_t ne[4] = { as_gate ? as_up->ne[1] : as_up->ne[1]/2, ids->ne[0], b->ne[2], 1 };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
result->op = GGML_OP_MOE_FUSED_UP_GATE;
@@ -7681,7 +7681,7 @@ struct ggml_tensor * ggml_moe_up_gate_ext(
return ggml_moe_up_gate(ctx, as_up, as_gate, b, ids, op);
}
if (as_up->type != as_gate->type || !ggml_are_same_shape(as_up, as_gate)) {
if (as_gate && (as_up->type != as_gate->type || !ggml_are_same_shape(as_up, as_gate))) {
struct ggml_tensor * result_up = ggml_mul_mat_id(ctx, as_up, b, ids);
if (as_up_b) {
result_up = ggml_add_id(ctx, result_up, as_up_b, ids);
@@ -7694,7 +7694,7 @@ struct ggml_tensor * ggml_moe_up_gate_ext(
}
GGML_ASSERT(!ggml_is_transposed(as_up));
GGML_ASSERT(!ggml_is_transposed(as_gate));
GGML_ASSERT(!as_gate || !ggml_is_transposed(as_gate));
GGML_ASSERT(ids->type == GGML_TYPE_I32);
GGML_ASSERT(as_up->ne[3] == 1); // as is 3d (one matrix per expert)
@@ -7705,10 +7705,10 @@ struct ggml_tensor * ggml_moe_up_gate_ext(
GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
GGML_ASSERT(as_up->ne[1] == as_up_b->ne[0]);
GGML_ASSERT(as_gate->ne[1] == as_gate_b->ne[0]);
GGML_ASSERT(!as_gate || as_gate->ne[1] == as_gate_b->ne[0]);
bool is_node = false;
const int64_t ne[4] = { as_up->ne[1], ids->ne[0], b->ne[2], 1 };
const int64_t ne[4] = { as_gate ? as_up->ne[1] : as_up->ne[1]/2, ids->ne[0], b->ne[2], 1 };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
result->op = GGML_OP_MOE_FUSED_UP_GATE;

View File

@@ -910,7 +910,8 @@ ggml_tensor * llm_build_context::llm_build_moe_ffn(
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) {
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input,
ggml_tensor * up_gate_exps, ggml_tensor * up_gate_exps_b) {
auto input = cur;
@@ -1025,6 +1026,19 @@ llm_expert_gating_func_type gating_op,
bool can_use_fmoe = type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE;
ggml_tensor * par;
if (can_use_fmoe && up_gate_exps) {
if (up_gate_exps_b) {
par = ggml_moe_up_gate_ext(ctx, up_gate_exps, nullptr, cur, selected_experts, up_gate_exps_b, nullptr,
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
} else {
GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
par = ggml_moe_up_gate(ctx, up_gate_exps, nullptr, cur, selected_experts,
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
}
} else {
GGML_ASSERT(!up_gate_exps && !up_gate_exps_b);
if (can_use_fmoe && lctx.cparams.fused_moe_up_gate && up_exps->type == gate_exps->type) {
if (up_exps_b || gate_exps_b) {
par = ggml_moe_up_gate_ext(ctx, up_exps, gate_exps, cur, selected_experts, up_exps_b, gate_exps_b,
@@ -1069,6 +1083,7 @@ llm_expert_gating_func_type gating_op,
GGML_ABORT("fatal error");
}
}
}
cb(par, "ffn_moe_gate_par", il);
@@ -1130,7 +1145,8 @@ ggml_tensor * llm_build_context::llm_build_std_moe_ffn(ggml_context * ctx, llama
float w_scale,
llm_expert_gating_func_type gating_op,
llm_ffn_op_type type_op_shexp,
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) {
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input,
ggml_tensor * up_gate_exps, ggml_tensor * up_gate_exps_b) {
auto split_up_exps = (ggml_split_tensor_t *)up_exps->extra;
auto split_gate_exps = (ggml_split_tensor_t *)gate_exps->extra;
@@ -1164,7 +1180,7 @@ llm_expert_gating_func_type gating_op,
the_exp_probs_b,
n_expert, n_expert_used,
type_op, norm_w, scale_w, w_scale,
gating_op, cb, il, graph, false);
gating_op, cb, il, graph, false, up_gate_exps, up_gate_exps_b);
cb(routed_out, "routed_out", il);
if (add_input) {
routed_out = ggml_add(ctx, routed_out, input);
@@ -8410,7 +8426,8 @@ ggml_cgraph * llm_build_context::build_openai_moe() {
n_expert, n_expert_used,
LLM_FFN_SWIGLU_OAI_MOE, false, false, 0.0f,
LLM_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
LLM_FFN_SWIGLU_OAI_MOE, cb, il, gf, true);
LLM_FFN_SWIGLU_OAI_MOE, cb, il, gf, true,
model.layers[il].ffn_up_gate_exps, model.layers[il].ffn_up_gate_exps_b);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);

View File

@@ -354,7 +354,8 @@ struct llm_build_context {
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false);
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false,
ggml_tensor * up_gate_exps = nullptr, ggml_tensor * up_gate_exps_b = nullptr);
static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx,
ggml_tensor * cur,
@@ -370,7 +371,8 @@ llm_expert_gating_func_type gating_op,
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false) {
const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false,
ggml_tensor * up_gate_exps = nullptr, ggml_tensor * up_gate_exps_b = nullptr) {
return llm_build_moe_ffn(ctx, lctx, cur,
gate_inp, nullptr,
up_exps, nullptr,
@@ -379,7 +381,7 @@ llm_expert_gating_func_type gating_op,
exp_probs_b,
n_expert, n_expert_used,
type_op, norm_w, scale_w, w_scale,
gating_op, cb, il, graph, add_input);
gating_op, cb, il, graph, add_input, up_gate_exps, up_gate_exps_b);
}
static ggml_tensor * llm_build_std_moe_ffn(ggml_context * ctx, llama_context & lctx,
@@ -401,7 +403,8 @@ llm_expert_gating_func_type gating_op,
float w_scale,
llm_expert_gating_func_type gating_op,
llm_ffn_op_type type_op_shexp,
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input = false);
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input = false,
ggml_tensor * up_gate_exps = nullptr, ggml_tensor * up_gate_exps_b = nullptr);
static ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids);

View File

@@ -31,6 +31,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
bool merge_qkv(const LLM_TN & tn, int i, int bias, bool ignore_attn_scale = false);
bool merge_up_gate_exps(const LLM_TN & tn, int i, int bias);
bool create_tensors() override;
bool create_llama_tensors(const LLM_TN & tn);
@@ -141,6 +143,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
ggml_tensor * create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0,
ggml_context ** actual_ctx = nullptr);
ggml_context * get_context_for_tensor(ggml_context * ctx, const std::string & name);
void create_default_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool norm_bias);
void create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm = true, bool use_ctx_split = false);
@@ -288,9 +292,7 @@ static std::vector<int> create_split(int nr, int granularity, const std::vector<
return result;
}
ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
int flags, ggml_context ** actual_context) {
//auto requested_ctx = ctx;
ggml_context * create_tensors_helper::get_context_for_tensor(ggml_context * ctx, const std::string & name) {
if (ml.tensor_buft_overrides) {
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
std::regex pattern(overrides->pattern);
@@ -301,6 +303,12 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std
}
}
}
return ctx;
}
ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
int flags, ggml_context ** actual_context) {
ctx = get_context_for_tensor(ctx, name);
if (actual_context) *actual_context = ctx;
auto tensor = ml.create_tensor(ctx, name, ne, flags);
if (tensor && ctx == split_ctx) {
@@ -2572,9 +2580,18 @@ bool create_tensors_helper::create_openai_moe_tensors(const LLM_TN & tn) {
ggml_context *ctx_ffn_gate, *ctx_ffn_up, *ctx_ffn_down;
layer.ffn_gate_inp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0, &ctx_ffn_gate);
layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0, &ctx_ffn_down);
layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0, &ctx_ffn_up);
bool merged = merge_up_gate_exps(tn, i, 2);
use_mmap_buffer &= !merged;
if (merged) {
ctx_ffn_gate = ctx_ffn_up = ctx_split;
} else {
layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i),
{ n_embd, n_ff_exp, n_expert}, 0, &ctx_ffn_up);
layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i),
{ n_embd, n_ff_exp, n_expert}, 0, &ctx_ffn_gate);
}
layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i),
{n_ff_exp, n_embd, n_expert}, 0, &ctx_ffn_down);
// bias
layer.ffn_gate_inp_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
@@ -2582,15 +2599,17 @@ bool create_tensors_helper::create_openai_moe_tensors(const LLM_TN & tn) {
auto ctx_gate_b = ctx_ffn_gate == ctx_split ? ctx_split : ctx_layer;
auto ctx_down_b = ctx_ffn_down == ctx_split ? ctx_split : ctx_layer;
auto ctx_up_b = ctx_ffn_up == ctx_split ? ctx_split : ctx_layer;
layer.ffn_gate_exps_b = create_tensor(ctx_gate_b, tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0, &ctx_ffn_gate_b);
if (!merged) {
layer.ffn_up_exps_b = create_tensor(ctx_up_b, tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0, &ctx_ffn_up_b);
layer.ffn_gate_exps_b = create_tensor(ctx_gate_b, tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0, &ctx_ffn_gate_b);
}
layer.ffn_down_exps_b = create_tensor(ctx_down_b, tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0, &ctx_ffn_down_b);
layer.ffn_up_exps_b = create_tensor(ctx_up_b, tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0, &ctx_ffn_up_b);
if (ctx_ffn_gate_b != ctx_ffn_gate) {
if (!merged && ctx_ffn_gate_b != ctx_ffn_gate) {
layer.ffn_gate_exps_b_dup = create_tensor(ctx_ffn_gate, tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert},
llama_model_loader::TENSOR_DUPLICATED);
}
if (ctx_ffn_up_b != ctx_ffn_up) {
if (!merged && ctx_ffn_up_b != ctx_ffn_up) {
layer.ffn_up_exps_b_dup = create_tensor(ctx_ffn_up, tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert},
llama_model_loader::TENSOR_DUPLICATED);
}
@@ -2654,6 +2673,66 @@ bool create_tensors_helper::create_smollm3_tensors(const LLM_TN & tn) {
return use_mmap_buffer;
}
// Try to merge the per-layer FFN up and gate expert tensors (and optionally
// their biases) into single contiguous tensors, creating the original names
// as views into the merged storage so the loader still finds them.
// bias: 0 = no biases expected, 2 = biases required.
// Returns true when the merge was performed, false when it is not applicable.
bool create_tensors_helper::merge_up_gate_exps(const LLM_TN & tn, int i, int bias) {
    ggml_context * ctx_split = ctx_for_layer_split(i);
    auto & layer = model.layers[i];
    auto u_name = tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i);
    auto g_name = tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i);
    auto u_meta = ml.require_tensor_meta(u_name.c_str());
    auto g_meta = ml.require_tensor_meta(g_name.c_str());
    // Can only merge when type and the non-merged dimensions agree.
    if (u_meta->type != g_meta->type || u_meta->ne[0] != g_meta->ne[0] || u_meta->ne[2] != g_meta->ne[2]) {
        printf("%s: not merging because up/gate meta info is different\n", __func__);
        return false;
    }
    // Buffer-type overrides may send up and gate to different contexts; a
    // merged tensor needs both in the split context.
    auto u_ctx = get_context_for_tensor(ctx_split, u_name);
    auto g_ctx = get_context_for_tensor(ctx_split, g_name);
    if (u_ctx != ctx_split || g_ctx != ctx_split) {
        printf("%s: not merging because of context\n", __func__);
        return false;
    }
    printf("%s: merging up/gate in layer %d\n", __func__, i);
    // Merged weight: up rows first, then gate rows, along dimension 1.
    layer.ffn_up_gate_exps = ggml_new_tensor_3d(ctx_split, u_meta->type, u_meta->ne[0], u_meta->ne[1] + g_meta->ne[1], u_meta->ne[2]);
    snprintf(layer.ffn_up_gate_exps->name, GGML_MAX_NAME, "blk.%d.ffn_up_gate_exps.weight", i);
    layer.ffn_up_exps = ml.create_tensor_as_view(ctx_split, layer.ffn_up_gate_exps, u_name.c_str(),
            { u_meta->ne[0], u_meta->ne[1], u_meta->ne[2] }, 0);
    // Gate view starts after the up block: ne[1] rows of nb[1] bytes each.
    layer.ffn_gate_exps = ml.create_tensor_as_view(ctx_split, layer.ffn_up_gate_exps, g_name.c_str(),
            { g_meta->ne[0], g_meta->ne[1], g_meta->ne[2] }, u_meta->ne[1]*u_meta->nb[1] );
    if (!bias) return true;
    auto u_name_b = tn(LLM_TENSOR_FFN_UP_EXPS,   "bias", i);
    auto g_name_b = tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i);
    auto u_meta_b = ml.get_tensor_meta(u_name_b.c_str());
    auto g_meta_b = ml.get_tensor_meta(g_name_b.c_str());
    if (bias == 2) {
        GGML_ASSERT(u_meta_b && g_meta_b);
        GGML_ASSERT(u_meta_b->type == g_meta_b->type);
        GGML_ASSERT(u_meta_b->ne[1] == g_meta_b->ne[1]);
    } else {
        GGML_ASSERT(!u_meta_b && !g_meta_b);
        return true;
    }
    GGML_ASSERT(u_meta->ne[1] == u_meta_b->ne[0]);
    GGML_ASSERT(g_meta->ne[1] == g_meta_b->ne[0]);
    // Merged bias: per-expert rows of (up rows + gate rows) entries.
    // Bug fix: the second dimension must be the expert count u_meta_b->ne[1]
    // (asserted equal to g_meta_b->ne[1] above), not u_meta->ne[1], which is
    // the per-expert row count (== u_meta_b->ne[0], see assert above).
    layer.ffn_up_gate_exps_b = ggml_new_tensor_2d(ctx_split, u_meta_b->type, u_meta_b->ne[0] + g_meta_b->ne[0], u_meta_b->ne[1]);
    snprintf(layer.ffn_up_gate_exps_b->name, GGML_MAX_NAME, "blk.%d.ffn_up_gate_exps.bias", i);
    layer.ffn_up_exps_b = ml.create_tensor_as_view(ctx_split, layer.ffn_up_gate_exps_b, u_name_b.c_str(),
            { u_meta_b->ne[0], u_meta_b->ne[1] }, 0);
    // Bug fix: the gate-bias view must be offset by the up-bias row size
    // u_meta_b->nb[1] (= ne[0] bias elements in bytes); the previous code used
    // u_meta->nb[1], i.e. the row stride of the (possibly quantized) WEIGHT
    // tensor, which is a different byte count entirely.
    layer.ffn_gate_exps_b = ml.create_tensor_as_view(ctx_split, layer.ffn_up_gate_exps_b, g_name_b.c_str(),
            { g_meta_b->ne[0], g_meta_b->ne[1] }, u_meta_b->nb[1]);
    return true;
}
bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool ignore_attn_scale) {
auto& hparams = model.hparams;
const int64_t n_head = hparams.n_head();

View File

@@ -236,6 +236,7 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_exps = nullptr;
struct ggml_tensor * ffn_down_exps = nullptr;
struct ggml_tensor * ffn_up_exps = nullptr;
struct ggml_tensor * ffn_up_gate_exps = nullptr;
llama_split_tensor split_ffn_gate_inp;
llama_split_tensor split_ffn_up_exps;
@@ -247,6 +248,7 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_exps_b = nullptr;
struct ggml_tensor * ffn_down_exps_b = nullptr;
struct ggml_tensor * ffn_up_exps_b = nullptr;
struct ggml_tensor * ffn_up_gate_exps_b = nullptr;
struct ggml_tensor * ffn_gate_exps_b_dup = nullptr;
struct ggml_tensor * ffn_down_exps_b_dup = nullptr;
struct ggml_tensor * ffn_up_exps_b_dup = nullptr;