mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 19:31:48 +00:00
Better CPU prompt processing performance for SWA models (#696)
* This does the trick for PP
* Compute mask bounds when creating the mask
* Set mask bounds for all supported SWA models

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -2043,6 +2043,10 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * sinks);
|
||||
|
||||
// Takes a tensor `a` and a `bounds` tensor; returns nothing.
// NOTE(review): presumably `a` is a node produced by ggml_flash_attn_ext and
// `bounds` carries the precomputed mask bounds mentioned in the commit message
// ("Compute mask bounds when creating the mask") - confirm against the
// implementation, which is not visible in this hunk.
GGML_API void ggml_flash_attn_ext_add_bounds(
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * bounds);
|
||||
|
||||
// TODO: needs to be adapted to ggml_flash_attn_ext
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
||||
struct ggml_context * ctx,
|
||||
|
||||
Reference in New Issue
Block a user