diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 4006d6f5..95ecdd96 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1639,7 +1639,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

         // check if we should start a new split based on the sources of the current node
         bool need_new_split = false;
-        if (node->op == GGML_OP_ADD && node->op_params[0] == 0xff) {
+        if ((node->op == GGML_OP_ADD && node->op_params[0] == 0xff) ||
+            node->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] == 0xff) {
             need_new_split = true;
         }
         else if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
@@ -1882,6 +1883,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
 static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_backend_sched_split * split,
         std::array & needs_sync, std::vector & ids, std::vector & unique_ids, ggml_tensor * last_ids_tensor) {
     if (split->n_inputs < 1) return;
+    constexpr bool k_set_sync = false;
     int split_backend_id = split->backend_id;
     ggml_backend_t split_backend = sched->backends[split_backend_id];
     ggml_backend_t last_input_backend = nullptr;
@@ -1906,7 +1908,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
             } else {
                 ggml_backend_synchronize(split_backend);
             }
-            needs_sync[split_backend_id] = false;
+            needs_sync[split_backend_id] = k_set_sync;
         }

         ggml_tensor * node = split->graph.nodes[0];
@@ -1941,7 +1943,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
                 ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
                 ggml_backend_synchronize(ids_backend);
-                needs_sync[tensor_backend_id(ids_tensor)] = false;
+                needs_sync[tensor_backend_id(ids_tensor)] = k_set_sync;

                 unique_ids.resize((n_expert + 31)/32);
                 std::memset(unique_ids.data(), 0, unique_ids.size()*sizeof(uint32_t));
@@ -2001,7 +2003,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
             int input_backend_id = tensor_backend_id(input);
             if (needs_sync[input_backend_id]) {
                 ggml_backend_synchronize(input_backend);
-                needs_sync[input_backend_id] = false;
+                needs_sync[input_backend_id] = k_set_sync;
             }
             if (needs_sync[split_backend_id]) {
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
@@ -2009,7 +2011,7 @@ static void ggml_backend_sched_copy_inputs(ggml_backend_sched_t sched, ggml_back
                 } else {
                     ggml_backend_synchronize(split_backend);
                 }
-                needs_sync[split_backend_id] = false;
+                needs_sync[split_backend_id] = k_set_sync;
             }
             ggml_backend_tensor_copy(input, input_cpy);
         }
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 3c7f21bf..e2019b5f 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3411,6 +3411,9 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
     }

     if (backend_src != backend_dst) {
+        ggml_cuda_pool_alloc tmp_src(cuda_ctx_src->pool());
+        ggml_cuda_pool_alloc tmp_dst(cuda_ctx_dst->pool());
+        bool needs_f16_f32_copy = false;
         // copy on src stream
         if (cuda_ctx_src->device == cuda_ctx_dst->device) {
             CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -3418,39 +3421,34 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
 #ifdef GGML_CUDA_NO_PEER_COPY
             return false;
 #else
-            if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            if (false && src->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && dst->ne[1] >= 32) {
                 //
                 // The goal here is to reduce traffic between GPU's, which is entirely non-negligible
                 // for prompt processing.
                 // We cast the tensor to be copied to f16, copy the f16 data peer-to-peer
                 // and then cast back to f32 on the destination side.
-                // The cost for converting to/from f16 is much ower than the cost of copying
+                // The cost for converting to/from f16 is much lower than the cost of copying
                 // two times more data over PCI-E (well, at least the 30 GB/s PCI-E I have).
-                // iBut for some reason the following is not working.
+                // But for some reason the following is slower.
                 // Can somebody tell me why?
                 //
-                ggml_cuda_pool_alloc tmp_src(cuda_ctx_src->pool(), ggml_nelements(src));
-                ggml_cuda_pool_alloc tmp_dst(cuda_ctx_dst->pool(), ggml_nelements(dst));
+
+                ggml_cuda_set_device(cuda_ctx_dst->device);
+                tmp_dst.alloc(ggml_nelements(dst));
+
+                ggml_cuda_set_device(cuda_ctx_src->device);
+                tmp_src.alloc(ggml_nelements(src));
                 auto src_f16 = *src;
                 src_f16.type = GGML_TYPE_F16;
                 for (int i = 0; i < 4; ++i) src_f16.nb[i] /= 2;
                 src_f16.data = tmp_src.get();
-                auto dst_f16 = *dst;
-                dst_f16.type = GGML_TYPE_F16;
-                for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
-                dst_f16.data = tmp_dst.get();
-
-                ggml_cuda_set_device(cuda_ctx_src->device);
                 ggml_cuda_cpy(*cuda_ctx_src, src, &src_f16, true);
-                CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream()));
-                CUDA_CHECK(cudaMemcpyPeerAsync(dst_f16.data, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&dst_f16), cuda_ctx_src->stream()));
+                CUDA_CHECK(cudaMemcpyPeerAsync(tmp_dst.ptr, cuda_ctx_dst->device, src_f16.data, cuda_ctx_src->device, ggml_nbytes(&src_f16), cuda_ctx_src->stream()));
-                ggml_cuda_set_device(cuda_ctx_dst->device);
-                CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_dst->stream()));
-                ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
+                needs_f16_f32_copy = true;
             } else {
                 CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
@@ -3467,7 +3465,15 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
         CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));

         // wait on dst stream for the copy to complete
+        ggml_cuda_set_device(cuda_ctx_dst->device);
         CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
+        if (needs_f16_f32_copy) {
+            auto dst_f16 = *dst;
+            dst_f16.type = GGML_TYPE_F16;
+            for (int i = 0; i < 4; ++i) dst_f16.nb[i] /= 2;
+            dst_f16.data = tmp_dst.get();
+            ggml_cuda_cpy(*cuda_ctx_dst, &dst_f16, dst, true);
+        }
     } else {
         // src and dst are on the same backend
         CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
index b3c52006..10df89a5 100644
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -321,6 +321,15 @@ static __global__ void k_fast_add(int64_t ne0, int64_t nelem, const float * x, c
     z[i] = x[i] + y[i % ne0];
 }

+template
+static __global__ void k_fast_add_2(int64_t ne0, int64_t nelem, const src1_t * x, const src2_t * y, dst_t * z) {
+    int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+    if (i >= nelem) {
+        return;
+    }
+    z[i] = (dst_t)((float)x[i] + (float)y[i]);
+}
+
 void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     if (ggml_nrows(dst->src[1]) == 1 && dst->src[0]->ne[0] == dst->src[1]->ne[0] && dst->type == GGML_TYPE_F32 &&
         dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32 &&
@@ -332,6 +341,45 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
                 (const float *)dst->src[0]->data, (const float *)dst->src[1]->data, (float *)dst->data);
         return;
     }
+    if (ggml_is_contiguous(dst->src[0]) && ggml_are_same_shape(dst->src[0], dst->src[1]) && ggml_is_contiguous(dst)) {
+        constexpr int kBlockSize = 256;
+        auto nelem = ggml_nelements(dst);
+        int nblocks = (nelem + kBlockSize - 1)/kBlockSize;
+        if (dst->type == GGML_TYPE_F16) {
+            if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F16) {
+                k_fast_add_2<<>>(dst->ne[0], nelem,
+                        (const half *)dst->src[0]->data, (const half *)dst->src[1]->data, (half *)dst->data);
+            }
+            else if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F32) {
+                k_fast_add_2<<>>(dst->ne[0], nelem,
+                        (const half *)dst->src[0]->data, (const float *)dst->src[1]->data, (half *)dst->data);
+            }
+            else if (dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32) {
+                k_fast_add_2<<>>(dst->ne[0], nelem,
+                        (const float *)dst->src[0]->data, (const float *)dst->src[1]->data, (half *)dst->data);
+            } else {
+                k_fast_add_2<<>>(dst->ne[0], nelem,
+                        (const float *)dst->src[0]->data, (const half *)dst->src[1]->data, (half *)dst->data);
+            }
+        } else {
+            if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F16) {
+                k_fast_add_2<<>>(dst->ne[0], nelem,
+                        (const half *)dst->src[0]->data, (const half *)dst->src[1]->data, (float *)dst->data);
+            }
+            else if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F32) {
+                k_fast_add_2<<>>(dst->ne[0], nelem,
+                        (const half *)dst->src[0]->data, (const float *)dst->src[1]->data, (float *)dst->data);
+            }
+            else if (dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32) {
+                k_fast_add_2<<>>(dst->ne[0], nelem,
+                        (const float *)dst->src[0]->data, (const float *)dst->src[1]->data, (float *)dst->data);
+            } else {
+                k_fast_add_2<<>>(dst->ne[0], nelem,
+                        (const float *)dst->src[0]->data, (const half *)dst->src[1]->data, (float *)dst->data);
+            }
+        }
+        return;
+    }
     ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index c1b83cfc..b5fe2d87 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -542,7 +542,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     char ** dest_ptrs_d = nullptr;
     int graph_cpynode_index = -1;
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
+    if(!disable_indirection_for_this_node && ctx.cuda_graph && ctx.cuda_graph->use_cpy_indirection) {
         dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
     }
@@ -651,7 +651,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
     }
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
-    if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
+    if(!disable_indirection_for_this_node && ctx.cuda_graph && ctx.cuda_graph->use_cpy_indirection) {
         ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
     }
 #else
diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
index c4619e60..26a21088 100644
--- a/ggml/src/ggml-cuda/norm.cu
+++ b/ggml/src/ggml-cuda/norm.cu
@@ -1,14 +1,14 @@
 #include "norm.cuh"

-template
-static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
+template
+static __global__ void norm_f32(const T * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

     float2 mean_var = make_float2(0.f, 0.f);

     for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
+        const float xi = (float)x[row*ncols + col];
         mean_var.x += xi;
         mean_var.y += xi * xi;
     }
@@ -32,7 +32,7 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
     const float inv_std = rsqrtf(var + eps);

     for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+        dst[row*ncols + col] = (T)(((float)x[row*ncols + col] - mean) * inv_std);
     }
 }
@@ -261,14 +261,15 @@ static __global__ void fused_rms_norm_f32_nc(
     }
 }

-static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
+template
+static void norm_f32_cuda(const T * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
-        norm_f32<<>>(x, dst, ncols, eps);
+        norm_f32<<>>(x, dst, ncols, eps);
     } else {
         const dim3 block_dims(1024, 1, 1);
-        norm_f32<1024><<>>(x, dst, ncols, eps);
+        norm_f32<1024, T><<>>(x, dst, ncols, eps);
     }
 }

@@ -364,7 +365,7 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

     GGML_ASSERT(ggml_is_contiguous(src0));

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

     const int64_t ne00 = src0->ne[0];
@@ -373,7 +374,11 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));

-    norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
+    if (src0->type == GGML_TYPE_F32) {
+        norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
+    } else {
+        norm_f32_cuda((const half *)src0_d, dst_d, ne00, nrows, eps, stream);
+    }
 }

 void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 983c84a8..938d19de 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -7232,7 +7232,12 @@ static struct ggml_tensor * ggml_norm_impl(
         is_node = true;
     }

-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    if (inplace && a->type != GGML_TYPE_F32) {
+        GGML_ABORT("Fatal error");
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : a->type == GGML_TYPE_F32 ? ggml_dup_tensor(ctx, a)
+        : ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], a->ne[1], a->ne[2], a->ne[3]);

     ggml_set_op_params(result, &eps, sizeof(eps));
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index bf7a141c..ba2ffc32 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1511,9 +1511,12 @@ std::tuple llm_build_context::llm_buil
         ggml_tensor * wq, ggml_tensor * bq,
         ggml_tensor * wk, ggml_tensor * bk,
         ggml_tensor * wv, ggml_tensor * bv,
-        float attention_scale, int il) const {
+        float attention_scale, int il, bool add_graph_split) const {
     auto Qcur = llm_build_lora_mm(lctx, ctx0, wq, cur);
     cb(Qcur, "Qcur", il);
+    if (add_graph_split) {
+        Qcur->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] = 0xff;
+    }
     auto Kcur = llm_build_lora_mm(lctx, ctx0, wk, cur);
     cb(Kcur, "Kcur", il);
     auto Vcur = llm_build_lora_mm(lctx, ctx0, wv, cur);
@@ -1550,11 +1553,14 @@ std::tuple llm_build_context::llm_buil
         ggml_tensor * wq, ggml_tensor * bq,
         ggml_tensor * wk, ggml_tensor * bk,
         ggml_tensor * wv, ggml_tensor * bv,
-        ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il) const {
+        ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il, bool add_graph_split) const {
     const int64_t n_embd_head = hparams.n_embd_head_v;
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
     if (wqkv) {
         auto qkv = llm_build_lora_mm(lctx, ctx0, wqkv, cur);
+        if (add_graph_split) {
+            qkv->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] = 0xff;
+        }
         cb(qkv, "qkv", il);
         if (bqkv) {
             qkv = ggml_add(ctx0, qkv, bqkv);
         }
@@ -1586,6 +1592,9 @@ std::tuple llm_build_context::llm_buil
     if (wqk) {
         auto qk = llm_build_lora_mm(lctx, ctx0, wqk, cur);
+        if (add_graph_split) {
+            qk->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] = 0xff;
+        }
         cb(qk, "qkv", il);
         if (bqk) {
             qk = ggml_add(ctx0, qk, bqk);
@@ -1618,7 +1627,7 @@ std::tuple llm_build_context::llm_buil
     }

-    auto [Q, K, V] = llm_build_mul_mat_qkv(gf, cur, wq, bq, wk, bk, wv, bv, attention_scale, il);
+    auto [Q, K, V] = llm_build_mul_mat_qkv(gf, cur, wq, bq, wk, bk, wv, bv, attention_scale, il, add_graph_split);
     auto Qcur = ggml_reshape_3d(ctx0, Q, n_embd_head, Q->ne[0]/n_embd_head, n_tokens);
     if (q_norm) {
         Qcur = llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il);
     }
@@ -1743,7 +1752,8 @@ ggml_cgraph * llm_build_context::build_llama() {

         // self-attention
         if (use_rope) {
-            cur = build_std_attention(gf, inpL, inp_pos, nullptr, this_KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il);
+            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr,
+                    this_KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il);
         }
         else {
@@ -1935,7 +1945,8 @@ ggml_cgraph * llm_build_context::build_mistral3() {

             auto rope_factors = build_rope_factors(il);

-            cur = build_std_attention(gf, inpL, inp_pos, rope_factors, KQ_mask, nullptr, inp_attn_scale, kq_scale, hparams.f_attention_scale, 0, il);
+            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, rope_factors, KQ_mask,
+                    nullptr, inp_attn_scale, kq_scale, hparams.f_attention_scale, 0, il);

             if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
@@ -3927,7 +3938,7 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
             //cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
             //cb(cur, "attn_norm", il);

-            cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il);
+            cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il);

             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
@@ -6806,7 +6817,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {

             // self-attention
             if (rope_cache == nullptr) {
-                cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il);
+                cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il);
             } else {
                 // Pre-attention norm
                 cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
@@ -7218,60 +7229,47 @@ ggml_cgraph * llm_build_context::build_cohere2() {
         struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;

         // norm
-        cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il);
+        auto attn_norm = model.layers[il].attn_norm;
+        int id = -1;
+        if (attn_norm->extra) {
+            auto extra = (ggml_split_tensor_t *)attn_norm->extra;
+            for (int i = extra->n_device-1; i >= 0; --i) {
+                if (extra->splits[i]) {
+                    attn_norm = extra->splits[i];
+                    id = i;
+                    break;
+                }
+            }
+        }
+        cur = llm_build_norm(ctx0, inpL, hparams, attn_norm, NULL, LLM_NORM, cb, il);
+        if (id >= 0) {
+            ggml_backend_sched_set_tensor_backend(lctx.sched, cur->src[0], ggml_backend_sched_get_backend(lctx.sched, id));
+        }
         cb(cur, "attn_norm", il);
-        struct ggml_tensor * ffn_inp = cur;
+        auto ffn_inp = cur;

         // self-attention
-        {
-            // rope freq factors for 128k context
-            struct ggml_tensor * rope_factors = build_rope_factors(il);
-
-            auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur,
-                    model.layers[il].wqkv, model.layers[il].bqkv,
-                    model.layers[il].wqk, model.layers[il].bqk,
-                    model.layers[il].wq, model.layers[il].bq,
-                    model.layers[il].wk, model.layers[il].bk,
-                    model.layers[il].wv, model.layers[il].bv, nullptr, nullptr, 0.f, il);
-
-            if (is_sliding) {
-                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
-                        beta_fast, beta_slow);
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos,
-                        rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
-                        attn_factor, beta_fast, beta_slow);
-                cb(Kcur, "Kcur", il);
-            };
-
-            cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur,
-                    KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il, nullptr,
-                    is_sliding ? hparams.n_swa : 0);
-        }
+        auto attn_out = build_std_attention(gf, nullptr, cur, inp_pos, nullptr, KQ_mask_l, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), 0.f,
+                is_sliding ? hparams.n_swa : 0, il, is_sliding, true);
+        cb(attn_out, "attn_out", il);

         if (il == n_layer - 1) {
             // skip computing output for unused tokens
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            attn_out = ggml_get_rows(ctx0, attn_out, inp_out_ids);
             ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
         }

-        struct ggml_tensor * attn_out = cur;
-
         // feed-forward network
-        {
-            cur = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
+        cur = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
                 NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
-                cb, il);
-            cb(cur, "ffn_out", il);
-        }
+                cb, il, gf);
+        cb(cur, "ffn_out", il);

         // add together residual + FFN + self-attention
-        cur = ggml_add(ctx0, cur, inpL);
         cur = ggml_add(ctx0, cur, attn_out);
+        cur = ggml_add(ctx0, cur, inpL);

         cur = lctx.cvec.apply_to(ctx0, cur, il);
         cb(cur, "l_out", il);
@@ -7280,18 +7278,20 @@ ggml_cgraph * llm_build_context::build_cohere2() {
     }

     cur = inpL;
+    //if (cur->type != GGML_TYPE_F32) {
+    //    cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
+    //}
     cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1);
     cb(cur, "result_norm", -1);

-    // lm_head
-    cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-    cb(cur, "output", -1);
-
     if (f_logit_scale) {
         cur = ggml_scale(ctx0, cur, f_logit_scale);
+        cb(cur, "result_norm_scaled", -1);
     }

+    // lm_head
+    cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
     cb(cur, "result_output", -1);

     ggml_build_forward_expand(gf, cur);
@@ -9308,12 +9308,14 @@ ggml_cgraph * llm_build_context::llama_build_graph(
     return result;
 }

-ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tensor * input, ggml_tensor * inp_pos, ggml_tensor * rope_factors_in,
-        ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale, int n_swa, int il) {
+ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tensor * the_attn_norm,
+        ggml_tensor * input, ggml_tensor * inp_pos, ggml_tensor * rope_factors_in,
+        ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale,
+        int n_swa, int il, bool do_rope, bool add_graph_split) {
     if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn &&
         model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) {
         if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) {
-            ggml_split_tensor_t * attn_norm = model.layers[il].attn_norm ? (ggml_split_tensor_t *)model.layers[il].attn_norm->extra : nullptr;
+            ggml_split_tensor_t * attn_norm = the_attn_norm ? (ggml_split_tensor_t *)the_attn_norm->extra : nullptr;
             auto wq = (ggml_split_tensor_t *)model.layers[il].wq->extra;
             auto wk = (ggml_split_tensor_t *)model.layers[il].wk->extra;
             auto wv = (ggml_split_tensor_t *)model.layers[il].wv->extra;
@@ -9368,16 +9370,18 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                     split_wq, bq ? bq->splits[id] : nullptr,
                     split_wk, bk ? bk->splits[id] : nullptr,
                     split_wv, bv ? bv->splits[id] : nullptr,
-                    the_q_norm, the_k_norm, f_attn_scale, il_cb);
+                    the_q_norm, the_k_norm, f_attn_scale, il_cb, add_graph_split);
             auto rope_factors = rope_factors_in;
             if (!rope_factors && model.layers[il].rope_freqs && model.layers[il].rope_freqs->extra) {
                 auto extra = (ggml_split_tensor_t *)model.layers[il].rope_freqs->extra;
                 rope_factors = extra->splits[id];
             }
-            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
-            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
+            if (do_rope) {
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }
             cb(Qcur, "Qcur", il_cb);
             cb(Kcur, "Kcur", il_cb);
             if (inp_attn_scale) {
@@ -9485,6 +9489,21 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
         }
         GGML_ASSERT(!attn.empty());
         if (attn.size() == 1) return attn.front();
+        //if (attn.size() > 2 && attn.size()%2 == 0) {
+        //    for (int id = 0; id < int(attn.size()/2); ++id) {
+        //        attn[id] = ggml_add(ctx0, attn[2*id+0], attn[2*id+1]);
+        //        attn[id]->op_params[0] = 0xff;
+        //    }
+        //    attn.resize(attn.size()/2);
+        //    auto cur = ggml_add(ctx0, attn[0], attn[1]);
+        //    cur->op_params[0] = 0xff;
+        //    cur->op_params[0] = 0xff;
+        //    for (int id = 2; id < (int)attn.size(); ++id) {
+        //        cur = ggml_add(ctx0, cur, attn[id]);
+        //        cb(cur, "combine_attn", il);
+        //    }
+        //    return cur;
+        //}
         auto cur = ggml_add(ctx0, attn[0], attn[1]);
         cb(cur, "combine_attn", il);
         cur->op_params[0] = 0xff;
@@ -9492,13 +9511,16 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
             cur = ggml_add(ctx0, cur, attn[id]);
             cb(cur, "combine_attn", il);
         }
+        if (attn.size() > 2) {
+            cur->op_params[0] = 0xff;
+        }
         return cur;
         }
     }

     auto cur = input;
-    if (model.layers[il].attn_norm) {
-        cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
+    if (the_attn_norm) {
+        cur = llm_build_norm(ctx0, cur, hparams, the_attn_norm, NULL, LLM_NORM_RMS, cb, il);
         cb(cur, "attn_norm", il);
     }

@@ -9508,10 +9530,12 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
             model.layers[il].wq, model.layers[il].bq,
             model.layers[il].wk, model.layers[il].bk,
             model.layers[il].wv, model.layers[il].bv,
             model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, f_attn_scale, il);
-    Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors_in, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+    if (do_rope) {
+        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors_in, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
             ext_factor, attn_factor, beta_fast, beta_slow);
-    Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, rope_factors_in, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+        Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, rope_factors_in, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
             ext_factor, attn_factor, beta_fast, beta_slow);
+    }
     cb(Qcur, "Qcur", il);
     cb(Kcur, "Kcur", il);
diff --git a/src/llama-build-context.h b/src/llama-build-context.h
index c5a27654..7d8a0931 100644
--- a/src/llama-build-context.h
+++ b/src/llama-build-context.h
@@ -149,7 +149,7 @@ struct llm_build_context {
             ggml_tensor * wq, ggml_tensor * bq,
             ggml_tensor * wk, ggml_tensor * bk,
             ggml_tensor * wv, ggml_tensor * bv,
-            float attention_scale, int il) const;
+            float attention_scale, int il, bool add_graph_split = false) const;

     std::tuple llm_build_mul_mat_qkv(ggml_cgraph * gf, ggml_tensor * cur,
             ggml_tensor * wqkv, ggml_tensor * bqkv,
@@ -157,7 +157,7 @@ struct llm_build_context {
            ggml_tensor * wq, ggml_tensor * bq,
            ggml_tensor * wk, ggml_tensor * bk,
            ggml_tensor * wv, ggml_tensor * bv,
-           ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il) const;
+           ggml_tensor * q_norm, ggml_tensor * k_norm, float attention_scale, int il, bool add_graph_split = false) const;

     ggml_cgraph * build_llama();

@@ -407,7 +407,8 @@ llm_expert_gating_func_type gating_op,

     static ggml_cgraph * llama_build_graph(llama_context & lctx, const llama_batch & batch, bool worst_case);

-    ggml_tensor * build_std_attention(ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * inp_pos, ggml_tensor * rope_factors,
-            ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale, int n_swa, int il);
+    ggml_tensor * build_std_attention(ggml_cgraph * gf, ggml_tensor * attn_norm, ggml_tensor * cur, ggml_tensor * inp_pos, ggml_tensor * rope_factors,
+            ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale,
+            int n_swa, int il, bool do_rope = true, bool add_graph_split = false);
 };
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index 2f8794cc..6da4ab7a 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -2259,20 +2259,15 @@ bool create_tensors_helper::create_chatglm_tensors(const LLM_TN & tn) {
 bool create_tensors_helper::create_cohere2_tensors(const LLM_TN & tn) {
     LOADING_PRELUDE

-    model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-    // output
-    model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-    // init output from the input tok embed
-    model.output = create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
-            llama_model_loader::TENSOR_DUPLICATED);
+    create_embd_output(tn, n_embd, n_vocab, true, false); //true);

     for (int i = 0; i < n_layer; ++i) {
         auto & layer = model.layers[i];
-        ggml_context * ctx_layer = ctx_for_layer(i);
         ggml_context * ctx_split = ctx_for_layer_split(i);
+        ggml_context * ctx_layer = ctx_for_layer(i);

-        layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+        layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);

         create_std_attn(i, tn, layer, n_embd, n_embd_gqa, ctx_split);
         create_std_ffn (i, tn, layer, n_ff, n_embd, ctx_split);
@@ -2963,6 +2958,23 @@ bool create_tensors_helper::create_tensors() {
     int gqa_ratio = hparams.n_head() / hparams.n_head_kv();
     auto cur_splits = model.splits;
     int adjust_step = std::max(1, int(model.layers.size() / (2*model.splits.size())));
+    if (model.max_gpu > 1 && model.max_gpu < int(cur_splits.size())) {
+        bool equal_split = true;
+        for (int i = 0; i < int(cur_splits.size()); ++i) {
+            float p = i > 0 ? cur_splits[i] - cur_splits[i-1] : cur_splits[i];
+            if (std::abs(p*cur_splits.size() - 1.f) > 0.02f) {
+                equal_split = false; break;
+            }
+        }
+        if (equal_split) {
+            if (cur_splits.size() % model.max_gpu == 0) {
+                int nadj = cur_splits.size()/model.max_gpu;
+                adjust_step = (model.layers.size() + nadj - 1) / nadj;
+            } else {
+                adjust_step = (model.layers.size() + cur_splits.size() - 1)/cur_splits.size();
+            }
+        }
+    }
     for (int il = 0; il < int(model.layers.size()); ++il) {
         if (ggml_backend_buft_is_host(model.buft_layer[il].buft_matrix)) {
             LLAMA_LOG_INFO("%s: not splitting layer %d because buffer type is host\n", __func__, il);
diff --git a/src/llama.cpp b/src/llama.cpp
index 53dba75a..6b8b2887 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1729,6 +1729,7 @@ static bool is_model_split_supported(const llama_model & model) {
         LLM_ARCH_QWEN3MOE,
         LLM_ARCH_GLM4_MOE,
         LLM_ARCH_MISTRAL3,
+        LLM_ARCH_COHERE2,
     };
     auto it = k_supported.find(model.arch);
     return it != k_supported.end();
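
Note (not part of the patch): the disabled `if (false && ...)` branch in ggml_backend_cuda_cpy_tensor_async describes an f16-staged peer copy: downcast f32 to f16 on the source GPU, move half as many bytes across the link with cudaMemcpyPeerAsync, then upcast back to f32 on the destination GPU. As a reference for reviewers, here is a minimal self-contained sketch of that data flow using plain CUDA runtime calls. The function and kernel names are illustrative only, and it assumes the staging buffers and streams already live on their respective devices.

// Minimal sketch of the f16-staged peer copy idea (illustrative; assumes tmp_src is
// allocated on src_dev, tmp_dst on dst_dev, and each stream belongs to the matching device).
#include <cuda_fp16.h>
#include <cuda_runtime.h>

__global__ void k_f32_to_f16(const float * x, __half * y, int64_t n) {
    int64_t i = blockIdx.x*(int64_t)blockDim.x + threadIdx.x;
    if (i < n) y[i] = __float2half(x[i]);
}

__global__ void k_f16_to_f32(const __half * x, float * y, int64_t n) {
    int64_t i = blockIdx.x*(int64_t)blockDim.x + threadIdx.x;
    if (i < n) y[i] = __half2float(x[i]);
}

static void peer_copy_f32_via_f16(int src_dev, cudaStream_t src_stream, const float * src, __half * tmp_src,
                                  int dst_dev, cudaStream_t dst_stream, float * dst, __half * tmp_dst,
                                  int64_t n, cudaEvent_t copy_done) {
    const int block = 256;
    const int grid  = (int)((n + block - 1)/block);

    // 1) downcast on the source GPU, then ship the f16 payload peer-to-peer (half the bytes of f32)
    cudaSetDevice(src_dev);
    k_f32_to_f16<<<grid, block, 0, src_stream>>>(src, tmp_src, n);
    cudaMemcpyPeerAsync(tmp_dst, dst_dev, tmp_src, src_dev, n*sizeof(__half), src_stream);
    cudaEventRecord(copy_done, src_stream);

    // 2) make the destination stream wait for the copy, then upcast back to f32 on the destination GPU
    cudaSetDevice(dst_dev);
    cudaStreamWaitEvent(dst_stream, copy_done, 0);
    k_f16_to_f32<<<grid, block, 0, dst_stream>>>(tmp_dst, dst, n);
}

The patch wires this same ordering through the existing copy_event: the f16 downcast and the peer memcpy run on the source stream, while the f16-to-f32 upcast is deferred until after cudaStreamWaitEvent on the destination stream, which is why dst_f16 is only materialized inside the needs_f16_f32_copy block. The author still reports this path as slower than the plain f32 peer copy, so it remains disabled behind if (false && ...).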