diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 3603c040..16c8c24f 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1235,7 +1235,9 @@ void launch_fattn_mma( const int nblocks_stream_k = max_blocks; - const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75; + //const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75; + // On my RTX-4080 the above is slightly slower for PP. It would be useful to try and see what happens on Blackwell + const bool use_stream_k = tiles_efficiency_percent < 75; blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total; blocks_num.y = 1;