Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-04-25 00:49:34 +00:00)
* FA: provide work buffer for K repacking
* Add header to avoid compiler warnings
* WIP
* WIP
* WIP
* WIP
* Slightly better
* WIP (Zen4)
* WIP
* Try to improve for unusual number of heads/number of threads
* Use mul_mat_qX_0_q8_2_Tx for q6_0 in FA
* Use mul_mat_qX_0_q8_2_Tx for q4_0 in FA
* Use Sum4q4 for q4_0
* WIP
* WIP
* Much better FA TG with q8_0 KV cache: just repack it even for TG, but do the repacking for k_step rows, not the whole K tensor.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
34 lines · 1.8 KiB · C++
//
// Copyright (C) 2024-2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//

#pragma once

#include <cstdint>

bool iqk_flash_attn_impl(int type_k,            // type of k
                         int type_v,            // type of v
                         int Dk,                // K head size
                         int Dv,                // V head size
                         int nq,                // number of columns in q
                         int nk,                // number of rows in k
                         int stride_q,          // distance between q columns in bytes
                         int stride_k,          // distance between k rows in bytes
                         int stride_v,          // distance between v rows in bytes
                         int stride_m,          // distance between mask rows in bytes
                         int stride_qkv,        // distance between qkv rows in bytes
                         const float * q,       // q matrix
                         const void  * k,       // k matrix. Assumed to be fp16, nq x nk elements
                         const void  * v,       // v matrix. Assumed to be fp16, nq x nk elements
                         const void  * mask,    // mask. If not null, assumed to be fp16, nq x nk elements
                         float scale,           // scale applied before softmax
                         float softcap,         // if > 0, a "soft-cap" operation is applied before softmax
                         float * qkv,           // output: v*softmax(scale*(k*q))
                         float * M,             // per-row softmax max (for combining partial results)
                         float * S);            // per-row softmax sum (for combining partial results)
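
// Illustrative only, not part of the original header: a minimal sketch of a
// call for single-token generation (nq = 1) with an fp16 K/V cache and no mask.
// The GGML_TYPE_F16 enum value and the exact byte strides are assumptions about
// the calling convention, not something this header specifies.
//
//   std::vector<float>    q(128);          // one query column, Dk = 128
//   std::vector<uint16_t> k(128*512);      // raw fp16 keys,   nk = 512 rows
//   std::vector<uint16_t> v(128*512);      // raw fp16 values
//   std::vector<float>    qkv(128);        // output row: v*softmax(scale*(k*q))
//   std::vector<float>    M(1), S(1);      // per-row softmax max/sum
//   bool ok = iqk_flash_attn_impl(GGML_TYPE_F16, GGML_TYPE_F16, /*Dk*/ 128, /*Dv*/ 128,
//                                 /*nq*/ 1, /*nk*/ 512,
//                                 /*stride_q*/ 128*sizeof(float),
//                                 /*stride_k*/ 128*sizeof(uint16_t),
//                                 /*stride_v*/ 128*sizeof(uint16_t),
//                                 /*stride_m*/ 0, /*stride_qkv*/ 128*sizeof(float),
//                                 q.data(), k.data(), v.data(), /*mask*/ nullptr,
//                                 /*scale*/ 1.0f/std::sqrt(128.0f), /*softcap*/ 0.0f,
//                                 qkv.data(), M.data(), S.data());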

// Repacks (part of) the K tensor into the caller-provided work buffer.
// nek0..nek3 are the K tensor dimensions and nbk1..nbk3 its byte strides
// (ggml conventions); ith/nth identify the calling thread. On return,
// repacked_type and row_size describe the repacked data; the returned
// pointer addresses the repacked K rows.
void * iqk_repack_k(int type_k, int nek0, int nek1, int nek2, int nek3, long nbk1, long nbk2, long nbk3,
                    const void * k, void * work, int ith, int nth, int& repacked_type, uint64_t& row_size);
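
The commit message above credits the TG speedup to repacking K in k_step-row chunks rather than repacking the whole tensor up front. Below is a minimal sketch of that chunking pattern under assumed names: repack_rows, attend_chunk, and the k_step value are hypothetical stand-ins, and the real logic lives inside the FA implementation, not in this header.

#include <algorithm>
#include <cstddef>

// Hypothetical helpers standing in for the real repacking and attention kernels.
void repack_rows(const char * k_rows, int n_rows, void * work);
void attend_chunk(const void * packed_k, int n_rows);

// Sketch: repack K in k_step-row chunks and consume each chunk immediately,
// so single-token generation never pays for repacking the whole K tensor.
void flash_attn_over_k(const char * k, int nk, std::size_t stride_k, void * work) {
    constexpr int k_step = 32;  // chunk size; assumed here, the actual value is internal
    for (int i0 = 0; i0 < nk; i0 += k_step) {
        const int n = std::min(k_step, nk - i0);
        repack_rows(k + (std::size_t)i0*stride_k, n, work);  // repack just these rows
        attend_chunk(work, n);                               // attention over the chunk
    }
}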