mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-05-01 03:41:53 +00:00
TG tweak
This commit is contained in:
@@ -810,7 +810,12 @@ void launch_fattn(
|
|||||||
cudaStream_t main_stream = ctx.stream();
|
cudaStream_t main_stream = ctx.stream();
|
||||||
const int id = ggml_cuda_get_device();
|
const int id = ggml_cuda_get_device();
|
||||||
const int cc = ggml_cuda_info().devices[id].cc;
|
const int cc = ggml_cuda_info().devices[id].cc;
|
||||||
const int nsm = ggml_cuda_info().devices[id].nsm;
|
const int nsm_actual = ggml_cuda_info().devices[id].nsm;
|
||||||
|
int nsm = nsm_actual;
|
||||||
|
if (Q->ne[1] == 1) {
|
||||||
|
nsm = 1; while (nsm*2 <= nsm_actual) nsm *= 2;
|
||||||
|
if (K->ne[1] <= 16384 && nsm > 32) nsm /= 2;
|
||||||
|
}
|
||||||
|
|
||||||
ggml_cuda_pool_alloc<half> K_f16(pool);
|
ggml_cuda_pool_alloc<half> K_f16(pool);
|
||||||
ggml_cuda_pool_alloc<half> V_f16(pool);
|
ggml_cuda_pool_alloc<half> V_f16(pool);
|
||||||
|
|||||||
Reference in New Issue
Block a user