diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 900934f9..8753f5fc 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3471,6 +3471,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
             needs_f16_f32_copy = true;
         } else {
+            printf("%s on device %d -> %s on device %d\n", src->name, cuda_ctx_src->device, dst->name, cuda_ctx_dst->device);
             ggml_cuda_set_device(cuda_ctx_src->device);
             CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
         }
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index e19cf821..41c04c45 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -663,7 +663,8 @@ ggml_tensor * llm_build_context::llm_build_ffn(
             GGML_ASSERT((!split_u && !iextra->splits[id]) || (split_u && iextra->splits[id]));
         }
         if (!split_u) continue;
-        auto cur = iextra ? iextra->splits[id] : input;
+        //auto cur = iextra ? iextra->splits[id] : input;
+        auto cur = input;
         if (ffn_norm && ffn_norm->extra) {
             auto norm = (ggml_split_tensor_t *)ffn_norm->extra;
             GGML_ASSERT(norm->splits[id]);
@@ -671,7 +672,7 @@ ggml_tensor * llm_build_context::llm_build_ffn(
             cb(cur, "ffn_inp_normed", il_cb);
         }
         if (cur->type != GGML_TYPE_F32) {
-            cur = ggml_cast(ctx, input, GGML_TYPE_F32);
+            cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
         }
         cur = ggml_fused_up_gate(ctx, split_u, split_g, cur, unary_op);
         cb(cur, "ffn_up_gate", il_cb);
@@ -9398,7 +9399,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
             GGML_ASSERT((!split_wq && !iextra->splits[id]) || (split_wq && iextra->splits[id]));
         }
         if (!split_wq) continue;
-        auto cur = iextra ? iextra->splits[id] : input;
+        //auto cur = iextra ? iextra->splits[id] : input;
+        auto cur = input;
         if (attn_norm) {
             auto split_norm = attn_norm->splits[id];
             cur = llm_build_norm(ctx0, cur, hparams, split_norm, NULL, LLM_NORM_RMS, cb, il);
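
For context (not part of the patch): the first hunk adds a debug printf immediately before a device-to-device copy done with cudaMemcpyPeerAsync() on the source device's stream. Below is a minimal standalone sketch of that same copy pattern. Only the CUDA runtime calls mirror what the diff uses; the device IDs, buffer size, and variable names are illustrative assumptions, and the CUDA_CHECK macro here is a simplified stand-in for ggml's error-checking macro.

// peer_copy_sketch.cu -- illustrative only; compile with: nvcc peer_copy_sketch.cu
#include <cstdio>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                       \
                    cudaGetErrorString(err_), __FILE__, __LINE__);            \
            return 1;                                                         \
        }                                                                     \
    } while (0)

int main() {
    int n_devices = 0;
    CUDA_CHECK(cudaGetDeviceCount(&n_devices));
    if (n_devices < 2) {
        printf("need at least 2 GPUs for a peer copy\n");
        return 0;
    }

    const size_t nbytes = 1 << 20; // 1 MiB, arbitrary for the sketch
    void * src = nullptr;
    void * dst = nullptr;

    // Allocate a source buffer on device 0 and a destination buffer on device 1.
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaMalloc(&src, nbytes));
    CUDA_CHECK(cudaSetDevice(1));
    CUDA_CHECK(cudaMalloc(&dst, nbytes));

    // Optionally enable direct peer access from device 0 to device 1;
    // cudaMemcpyPeerAsync still works without it (staged through the host).
    CUDA_CHECK(cudaSetDevice(0));
    cudaError_t perr = cudaDeviceEnablePeerAccess(1, 0);
    if (perr != cudaSuccess && perr != cudaErrorPeerAccessAlreadyEnabled) {
        fprintf(stderr, "direct peer access unavailable: %s\n", cudaGetErrorString(perr));
    }
    cudaGetLastError(); // clear any sticky error left by the call above

    // As in the patched code path: with the *source* device current, enqueue
    // the peer copy on one of its streams, then wait for it to finish.
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    CUDA_CHECK(cudaMemcpyPeerAsync(dst, /*dstDevice=*/1, src, /*srcDevice=*/0, nbytes, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    printf("copied %zu bytes from device 0 to device 1\n", nbytes);

    CUDA_CHECK(cudaFree(src));
    CUDA_CHECK(cudaSetDevice(1));
    CUDA_CHECK(cudaFree(dst));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}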