Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-01-26 17:20:01 +00:00)
Check for NaNs while loading the model. (#727)
* Check for NaNs while loading the model.
* Also tell which experts have NaNs.
* Add a command line option to validate quants.
* Add checks for more quantization types.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
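A NaN in a single quantization scale propagates into every dot product that touches the affected blocks, so a model that loads without complaint can still produce pure garbage output. The checks below scan the scales as tensors are loaded and, for MoE tensors, narrow the damage down to the experts that contain it; the command line option that enables the scan is added elsewhere in the commit.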
@@ -9868,3 +9868,175 @@ void vec_dot_iq4_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
#endif
}

namespace {
template <typename Block>
inline int check_row_for_blocks_256_fp16(int nblock, const Block * x) {
    int nbad = 0;
    for (int ib = 0; ib < nblock; ++ib) {
        float d = GGML_FP16_TO_FP32(x[ib].d);
        if (isnan(d)) ++nbad;
    }
    return nbad;
}
template <typename Block>
bool check_tensor_for_blocks_256_fp16(const ggml_tensor * tensor) {
    int nblock = tensor->ne[0]/QK_K;
    int nbad = 0;
    for (int row = 0; row < ggml_nrows(tensor); ++row) {
        auto x = (const Block *)((const char *)tensor->data + tensor->nb[1]*row);
        nbad += check_row_for_blocks_256_fp16(nblock, x);
    }
    if (nbad > 0) {
        fprintf(stderr, "%s: found %d NaN block scales out of %ld blocks in tensor %s\n", __func__,
                nbad, ggml_nrows(tensor)*nblock, tensor->name);
        if (tensor->ne[2] > 1) {
            int nb = tensor->ne[0]/QK_K;
            for (int64_t i02 = 0; i02 < tensor->ne[2]; ++i02) {
                int nbad_expert = 0;
                auto xex = (const char *)tensor->data + i02*tensor->nb[2];
                for (int64_t i01 = 0; i01 < tensor->ne[1]; ++i01) {
                    auto xr = (const Block *)(xex + i01*tensor->nb[1]);
                    nbad_expert += check_row_for_blocks_256_fp16(nb, xr);
                }
                if (nbad_expert > 0) fprintf(stderr, "  there are %d NaN block scales for expert %ld\n", nbad_expert, i02);
            }
        }
        return false;
    }
    return true;
}
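// A minimal sketch of the block shape the template above assumes. The name is
// hypothetical (the real block_iq2_k, block_iq4_xs, ... are defined in the
// project's headers), but all check_row_for_blocks_256_fp16 requires is an
// fp16 member `d` holding the super-block scale for QK_K = 256 weights:
struct block_example {
    ggml_half d;          // fp16 super-block scale; the value the check inspects
    uint8_t   qs[QK_K/2]; // packed quants; layout varies per type, never inspected here
};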
template <typename Block>
inline int check_row_for_blocks_256_fp16(int nblock, const Block * x, int nr) {
    int nbad = 0;
    for (int ib = 0; ib < nblock; ++ib) {
        for (int j = 0; j < nr; ++j) {
            if (!isfinite(GGML_FP16_TO_FP32(x[ib].d[j]))) ++nbad;
        }
    }
    return nbad;
}
template <typename Block, int nr>
bool check_tensor_for_blocks_256_fp16_repacked(const ggml_tensor * tensor) {
    int nblock = tensor->ne[0]/QK_K;
    int nbad = 0;
    for (int row = 0; row < ggml_nrows(tensor); row += nr) {
        auto x = (const Block *)((const char *)tensor->data + tensor->nb[1]*row);
        nbad += check_row_for_blocks_256_fp16(nblock, x, nr);
    }
    if (nbad > 0) {
        fprintf(stderr, "%s: found %d NaN block scales out of %ld blocks in tensor %s\n", __func__,
                nbad, ggml_nrows(tensor)*nblock, tensor->name);
        if (tensor->ne[2] > 1) {
            int nb = tensor->ne[0]/QK_K;
            for (int64_t i02 = 0; i02 < tensor->ne[2]; ++i02) {
                int nbad_expert = 0;
                auto xex = (const char *)tensor->data + i02*tensor->nb[2];
                for (int64_t i01 = 0; i01 < tensor->ne[1]; i01 += nr) {
                    auto xr = (const Block *)(xex + i01*tensor->nb[1]);
                    nbad_expert += check_row_for_blocks_256_fp16(nb, xr, nr);
                }
                if (nbad_expert > 0) fprintf(stderr, "  there are %d NaN block scales for expert %ld\n", nbad_expert, i02);
            }
        }
        return false;
    }
    return true;
}
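// The repacked ("_r4"/"_r8") types interleave nr rows into each block, which
// is why the scale becomes an array and the loops above step by nr rows at a
// time. A hypothetical sketch of the assumed shape (the real block_iq4_k_r4
// and friends are defined elsewhere in the project):
template <int nr>
struct block_example_rx {
    ggml_half d[nr];         // one fp16 scale per interleaved row
    uint8_t   qs[nr*QK_K/2]; // quants of nr rows, interleaved; not inspected here
};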
struct F32Scale {
    static inline int check_row(const char * data) {
        float d = *(const float *)data;
        return isfinite(d) ? 0 : 1;
    }
};
struct F16Scale {
    static inline int check_row(const char * data) {
        float d = GGML_FP16_TO_FP32(*(const ggml_half *)data);
        return isfinite(d) ? 0 : 1;
    }
};
template <int nr>
struct F32ScaleRX {
    static inline int check_row(const char * data) {
        auto d = (const float *)data;
        int nbad = 0;
        for (int i = 0; i < nr; ++i) {
            if (!isfinite(d[i])) ++nbad;
        }
        return nbad;
    }
};
template <int nr>
struct F16ScaleRX {
    static inline int check_row(const char * data) {
        auto d = (const ggml_half *)data;
        int nbad = 0;
        for (int i = 0; i < nr; ++i) {
            if (!isfinite(GGML_FP16_TO_FP32(d[i]))) ++nbad;
        }
        return nbad;
    }
};
template <typename RS>
bool check_tensor_row_scales(const ggml_tensor * tensor) {
    auto row_size = ggml_row_size(tensor->type, tensor->ne[0]);
    int num_rows = ggml_nrows(tensor);
    auto data = (const char *)tensor->data;
    int nbad = 0;
    for (int row = 0; row < num_rows; ++row) {
        nbad += RS::check_row(data);
        data += row_size;
    }
    if (nbad > 0) {
        fprintf(stderr, "%s: found %d NaN row scales out of %d rows in tensor %s\n", __func__,
                nbad, num_rows, tensor->name);
        return false;
    }
    return true;
}
}

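// Note on the row-scale types (IQ4_KS, IQ2_BN, ...): instead of one fp16 scale
// per 256-weight block they keep a single f32 or f16 scale per row, stored at
// the start of the row; the Scale/ScaleRX policies above read exactly that
// leading value, which is the layout check_tensor_row_scales assumes.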
bool iqk_validate_tensor(const ggml_tensor * tensor) {
    if (!tensor) return true;
    if (!ggml_is_contiguous(tensor)) return true;

    switch (tensor->type) {
        case GGML_TYPE_IQ2_K:      return check_tensor_for_blocks_256_fp16<block_iq2_k>(tensor);
        case GGML_TYPE_IQ3_K:      return check_tensor_for_blocks_256_fp16<block_iq3_k>(tensor);
        case GGML_TYPE_IQ4_K:      return check_tensor_for_blocks_256_fp16<block_iq4_k>(tensor);
        case GGML_TYPE_IQ5_K:      return check_tensor_for_blocks_256_fp16<block_iq5_k>(tensor);
        case GGML_TYPE_IQ6_K:      return check_tensor_for_blocks_256_fp16<block_iq6_k>(tensor);
        case GGML_TYPE_IQ2_XXS:    return check_tensor_for_blocks_256_fp16<block_iq2_xxs>(tensor);
        case GGML_TYPE_IQ2_XS:     return check_tensor_for_blocks_256_fp16<block_iq2_xs>(tensor);
        case GGML_TYPE_IQ2_S:      return check_tensor_for_blocks_256_fp16<block_iq2_s>(tensor);
        case GGML_TYPE_IQ3_XXS:    return check_tensor_for_blocks_256_fp16<block_iq3_xxs>(tensor);
        case GGML_TYPE_IQ3_S:      return check_tensor_for_blocks_256_fp16<block_iq3_s>(tensor);
        case GGML_TYPE_IQ4_XS:     return check_tensor_for_blocks_256_fp16<block_iq4_xs>(tensor);
        case GGML_TYPE_IQ2_K_R4:   return check_tensor_for_blocks_256_fp16_repacked<block_iq2_k_r4, 4>(tensor);
        case GGML_TYPE_IQ3_K_R4:   return check_tensor_for_blocks_256_fp16_repacked<block_iq3_k_r4, 4>(tensor);
        case GGML_TYPE_IQ4_K_R4:   return check_tensor_for_blocks_256_fp16_repacked<block_iq4_k_r4, 4>(tensor);
        case GGML_TYPE_IQ5_K_R4:   return check_tensor_for_blocks_256_fp16_repacked<block_iq5_k_r4, 4>(tensor);
        case GGML_TYPE_IQ2_XXS_R4: return check_tensor_for_blocks_256_fp16_repacked<block_iq2_xxs_r4, 4>(tensor);
        case GGML_TYPE_IQ2_XS_R4:  return check_tensor_for_blocks_256_fp16_repacked<block_iq2_xs_r4, 4>(tensor);
        case GGML_TYPE_IQ2_S_R4:   return check_tensor_for_blocks_256_fp16_repacked<block_iq2_s_r4, 4>(tensor);
        case GGML_TYPE_IQ3_XXS_R4: return check_tensor_for_blocks_256_fp16_repacked<block_iq3_xxs_r4, 4>(tensor);
        case GGML_TYPE_IQ3_S_R4:   return check_tensor_for_blocks_256_fp16_repacked<block_iq3_s_r4, 4>(tensor);
        case GGML_TYPE_IQ4_XS_R8:  return check_tensor_for_blocks_256_fp16_repacked<block_iq4_xs_r8, 8>(tensor);
        case GGML_TYPE_IQ2_BN:
        case GGML_TYPE_IQ4_KSS:
        case GGML_TYPE_IQ4_KS:
        case GGML_TYPE_IQ5_KS:     return check_tensor_row_scales<F32Scale>(tensor);
        case GGML_TYPE_IQ2_BN_R4:
        case GGML_TYPE_IQ4_KS_R4:
        case GGML_TYPE_IQ5_KS_R4:  return check_tensor_row_scales<F32ScaleRX<4>>(tensor);
        case GGML_TYPE_IQ1_BN:
        case GGML_TYPE_IQ2_KS:
        case GGML_TYPE_IQ2_KL:
        case GGML_TYPE_IQ3_KS:     return check_tensor_row_scales<F16Scale>(tensor);
        case GGML_TYPE_IQ1_S_R4:
        case GGML_TYPE_IQ1_M_R4:   return check_tensor_row_scales<F16ScaleRX<4>>(tensor);

        default: break;
    }
    return true;
}
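// A hedged sketch of how a loader could drive the validator. This call site is
// an illustration, not part of the commit; only iqk_validate_tensor and the
// ggml types are real. Because the function returns true for tensor types it
// has no check for, it is safe to run over every weight unconditionally:
static bool validate_model_weights(ggml_tensor ** weights, int n_weights) {
    bool ok = true;
    for (int i = 0; i < n_weights; ++i) {
        if (!iqk_validate_tensor(weights[i])) ok = false; // keep scanning so every bad tensor gets reported
    }
    return ok;
}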
@@ -326,6 +326,8 @@ void iqk_quantize_any(int from_type, int to_type,
                       const void * GGML_RESTRICT x, void * GGML_RESTRICT y, void * work_buffer,
                       to_float_t to_float, from_float_t from_float, int ith, int nth);

bool iqk_validate_tensor(const struct ggml_tensor * src);

#ifdef __cplusplus
}
#endif