mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-03 02:20:01 +00:00
Bug fixes from mainline (#439)
* Add __syncthreads() to the new FA kernel * Clearing padding --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -22,7 +22,7 @@ extern "C" {
|
|||||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||||
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
||||||
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||||
|
|
||||||
// buffer
|
// buffer
|
||||||
@@ -39,7 +39,7 @@ extern "C" {
|
|||||||
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||||
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
|
||||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
|||||||
return SIZE_MAX;
|
return SIZE_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
|
||||||
// get_alloc_size is optional, defaults to ggml_nbytes
|
// get_alloc_size is optional, defaults to ggml_nbytes
|
||||||
if (buft->iface.get_alloc_size) {
|
if (buft->iface.get_alloc_size) {
|
||||||
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
||||||
@@ -114,7 +114,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
|
|||||||
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
|
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
|
||||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2101,13 +2101,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
|||||||
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
||||||
|
|
||||||
|
// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
|
||||||
|
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
|
||||||
|
// Therefore, in such cases use cuBLAS.
|
||||||
|
const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
|
||||||
|
&& ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
|
||||||
|
|
||||||
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
|
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
|
||||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||||
&& src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
|
&& src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
|
||||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
|
||||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||||
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||||
bool use_mul_mat_q = ggml_is_quantized(src0->type)
|
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
|
||||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||||
|
|
||||||
// if mmvq is available it's a better choice than dmmv:
|
// if mmvq is available it's a better choice than dmmv:
|
||||||
|
|||||||
@@ -1093,6 +1093,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// do we really need this?
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
// Write back combined meta data:
|
// Write back combined meta data:
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int imeta = 0; imeta < nmeta; ++imeta) {
|
for (int imeta = 0; imeta < nmeta; ++imeta) {
|
||||||
@@ -1112,6 +1115,11 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
|||||||
float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols;
|
float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols;
|
||||||
dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);
|
dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);
|
||||||
}
|
}
|
||||||
|
} else if (np > 1) {
|
||||||
|
// Warps with threadIdx.y % np == 0 execute a __syncthreads() in the if branch.
|
||||||
|
// Therefore, all other warps also need to execute a __syncthreads().
|
||||||
|
// Otherwise the points at which warps synchronize with each other would become misaligned.
|
||||||
|
__syncthreads();
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
|
|||||||
Reference in New Issue
Block a user