Adding ability to have metadata per tensor row (#61)

* POC: per row scale

This is a POC of how to work around opinionated ggml so that scales can be
stored per row rather than per block (a simplified sketch of the idea follows
the commit message below).
Only implemented for Zen4 and only for iq2_tn.

* POC per row scale: iq2_tn on NEON

* POC per row scale: iq2_tn on Metal

* Per row scale Metal templates

* iq1_tn: shrink to 1.625 bpw (NEON and Metal)

* POC per row scale: CUDA

* POC per row scale: add CUDA TODOs

There are two places left in ggml-cuda.cu where it is assumed that
type_size * n_per_row / block_size is the way to compute and handle
row sizes (see the row-stride sketch below the commit metadata).
This does not affect simple usage, but it will lead to issues when
tensors are split between GPUs.

* Per row scales - CUDA

The only place left where unnecessary assumptions are being made is the
Flash Attention code. As we are not using any quants with per-row scales
for the quantized KV cache, it should be OK for now.

* Update IQ1_TN and IQ2_TN bpw shown to user

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
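
To make the per-row-scale idea concrete, here is a minimal sketch written for
this description (not code from the patch): one float stored at the head of
each row replaces the per-block fp16 scales, and every value is encoded as
0/1/2, i.e. {-1, 0, +1} shifted by +1, the same mapping the quantize lambda
below uses. The real block_iq2_tn additionally packs four 2-bit values per
byte, which is omitted here; all names (quantize_row_ternary,
dequantize_row_ternary) are illustrative, not part of the ggml API.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Quantize one row: store the row-wide scale once, then encode each value
// as 0/1/2 relative to the row maximum.
void quantize_row_ternary(const float * x, float * row_scale, uint8_t * q, int64_t n) {
    float max = 0.0f;
    for (int64_t j = 0; j < n; ++j) max = std::max(max, std::fabs(x[j]));
    *row_scale = max;
    for (int64_t j = 0; j < n; ++j) {
        q[j] = x[j] < -0.5f*max ? 0 : x[j] < 0.5f*max ? 1 : 2;
    }
}

// Dequantize: a single multiply by the row scale, no per-block scale lookups.
void dequantize_row_ternary(const float * row_scale, const uint8_t * q, float * y, int64_t n) {
    const float d = *row_scale;
    for (int64_t j = 0; j < n; ++j) y[j] = d * (int(q[j]) - 1);
}
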
Author: Kawrakow
Date: 2024-09-27 08:16:06 +03:00
Committed by: GitHub
Parent: 546f3ef349
Commit: 6dec4af4b6
12 changed files with 171 additions and 138 deletions
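
Regarding the CUDA TODO above: a hedged sketch (not actual ggml-cuda code;
the function names are made up for illustration) of why
type_size * n_per_row / block_size is no longer the right row stride once a
per-row float precedes the blocks, e.g. when computing the byte offset of the
first row assigned to a GPU.

#include <cstddef>
#include <cstdint>

// Old assumption: every byte of a quantized row belongs to a block, so the
// byte offset of row r is just r * type_size * n_per_row / block_size.
size_t row_offset_blocks_only(int64_t r, int64_t n_per_row,
                              size_t type_size, int64_t block_size) {
    return (size_t)r * (type_size * (size_t)n_per_row / (size_t)block_size);
}

// With a per-row scale the stride gains sizeof(float) per row, so code that
// splits a tensor between GPUs at row boundaries must use this stride instead.
size_t row_offset_with_scale(int64_t r, int64_t n_per_row,
                             size_t block_bytes, int64_t block_size) {
    const size_t row_stride = sizeof(float) + (size_t)(n_per_row / block_size) * block_bytes;
    return (size_t)r * row_stride;
}
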


@@ -1972,15 +1972,15 @@ void quantize_row_iq2_tn_ref(const float * x, block_iq2_tn * y, int64_t k) {
    auto quantize = [] (float xmax, float x) {
        return x < -0.5f*xmax ? 0 : x < 0.5f*xmax ? 1 : 2;
    };
    int n = k;
    float max = x[0];
    for (int j = 1; j < n; ++j) max = std::max(max, fabsf(x[j]));
    *(float *)y = max;
    y = (block_iq2_tn *)((float *)y + 1);
    for (int ibl = 0; ibl < nb; ++ibl) {
        auto xb = x + QK_K*ibl;
        float max = xb[0];
        for (int j = 0; j < QK_K; ++j) {
            float ax = fabsf(xb[j]);
            max = std::max(ax, max);
        }
        y[ibl].d = GGML_FP32_TO_FP16(max);
        auto qs = y[ibl].qs;
        for (int l = 0; l < QK_K/128; ++l) {
            for (int j = 0; j < 32; ++j) {
@@ -1992,7 +1992,7 @@ void quantize_row_iq2_tn_ref(const float * x, block_iq2_tn * y, int64_t k) {
    }
}
void quantize_row_iq2_tn(const float * x, void * y, int64_t k) {
void quantize_row_iq2_tn(const float * x, void * y, int64_t k) {
    quantize_row_iq2_tn_ref(x, (block_iq2_tn *)y, k);
}
@@ -2009,9 +2009,11 @@ size_t quantize_iq2_tn(const float * src, void * dst, int64_t nrows, int64_t n_p
void dequantize_row_iq2_tn(const block_iq2_tn * x, float * y, int64_t k) {
    GGML_ASSERT(k%QK_K == 0);
    const float * dptr = (const float *)x;
    float d = *dptr;
    x = (const block_iq2_tn *)(dptr + 1);
    int nb = k/QK_K;
    for (int ibl = 0; ibl < nb; ++ibl) {
        float d = GGML_FP16_TO_FP32(x[ibl].d);
        auto qs = x[ibl].qs;
        for (int l = 0; l < QK_K/128; ++l) {
            for (int j = 0; j < 32; ++j) {
@@ -2039,13 +2041,14 @@ void vec_dot_iq2_tn_q8_k(int n, float * s, size_t bs, const void * vx, size_t
    const int nb = n / QK_K;
    const block_iq2_tn * x = (const block_iq2_tn *)vx;
    const float * dptr = (const float *)vx;
    const float d = *dptr;
    const block_iq2_tn * x = (const block_iq2_tn *)(dptr + 1);
    const block_q8_K * y = (const block_q8_K *)vy;
    float sumf = 0;
    for (int i = 0; i < nb; i++) {
        float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        auto qs = x[i].qs;
        auto q8 = y[i].qs;
        int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;