Better model info (#84)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
@@ -17,6 +17,7 @@
 #include <vector>
 #include <thread>
 #include <mutex>
+#include <array>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -227,6 +228,31 @@ static void test_roundtrip_on_layer(
     }
 }
 
+static void print_fp_stats(const char * msg, const uint64_t * counts) {
+    printf("===== %s\n", msg);
+    uint64_t tot = 0; for (int i = 0; i < 32; ++i) tot += counts[i];
+    double norm = 1./tot;
+    for (int i = 0; i < 32; ++i) {
+        if (!counts[i]) continue;
+        uint16_t val = i << 10;
+        float f = ggml_fp16_to_fp32(val);
+        printf("%2d %f %g\n", i, norm*counts[i], f);
+    }
+}
+
+static void analyze_tensor_fp(const ggml_tensor * t, uint64_t * H) {
+    if (t->type != GGML_TYPE_F16) return;
+    if (!ggml_is_contiguous(t)) return;
+    int n = ggml_nelements(t);
+    const uint16_t * x = (const uint16_t *)t->data;
+    std::array<uint64_t, 32> counts = {};
+    for (int j = 0; j < n; ++j) {
+        ++counts[(x[j] >> 10) & 31];
+    }
+    for (int i = 0; i < 32; ++i) H[i] += counts[i];
+    print_fp_stats(t->name, counts.data());
+}
+
 int main(int argc, char ** argv) {
     ggml_time_init();
 
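For context (not part of the commit): analyze_tensor_fp buckets every F16 value of a contiguous tensor by its 5-bit exponent field (bits 10-14 of the IEEE-754 half-precision encoding), and print_fp_stats prints each non-empty bucket's fraction of all values next to the half value with that exponent and a zero mantissa, i.e. the lower edge of the bucket. A minimal self-contained sketch of the same bucketing, with hypothetical names and no ggml dependency:

    // fp16_exponent_hist.cpp -- standalone illustration, hypothetical names.
    #include <array>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Count fp16 bit patterns by their 5-bit exponent field (bits 10..14).
    static std::array<uint64_t, 32> exponent_histogram(const std::vector<uint16_t> & bits) {
        std::array<uint64_t, 32> counts = {};
        for (uint16_t v : bits) {
            ++counts[(v >> 10) & 31];   // same bucketing as analyze_tensor_fp
        }
        return counts;
    }

    int main() {
        // A few fp16 bit patterns: 1.0 (0x3C00), 0.5 (0x3800), 2.0 (0x4000), 0.0 (0x0000).
        std::vector<uint16_t> bits = {0x3C00, 0x3800, 0x4000, 0x0000};
        auto counts = exponent_histogram(bits);
        double norm = 1.0 / bits.size();
        for (int i = 0; i < 32; ++i) {
            if (!counts[i]) continue;
            // Bucket i (1 <= i <= 30) holds magnitudes in [2^(i-15), 2^(i-14));
            // bucket 0 is zeros/subnormals, bucket 31 is inf/NaN.
            double edge;
            if      (i == 0)  edge = 0.0;
            else if (i == 31) edge = INFINITY;
            else              edge = std::ldexp(1.0, i - 15);
            printf("%2d %f %g\n", i, norm * counts[i], edge);
        }
        return 0;
    }

The resulting histogram shows how a model's F16 weights are spread across power-of-two magnitude ranges, which is useful when judging how much dynamic range a quantization type needs to cover.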
@@ -236,6 +262,7 @@ int main(int argc, char ** argv) {
 
     int max_thread = 0;
     bool invalid_param = false;
+    bool analyze_fp = false;
     std::string arg;
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -249,6 +276,8 @@ int main(int argc, char ** argv) {
             params.verbose = true;
         } else if (arg == "-p" || arg == "--per-layer-stats") {
            params.per_layer_stats = true;
+        } else if (arg == "-afp" || arg == "--analyze-fp") {
+            analyze_fp = true;
         } else if (arg == "--histogram") {
            params.print_histogram = true;
         } else if (arg == "-m" || arg == "--model") {
@@ -375,6 +404,22 @@ int main(int argc, char ** argv) {
     std::vector<char> quantized_scratch;
     std::vector<float> output_scratch;
 
+    if (analyze_fp) {
+        for (const auto& kv_tensor : tensors) {
+            if (!layer_included(params, kv_tensor.first)) {
+                continue;
+            }
+            if (kv_tensor.second->ne[0] == 1 || kv_tensor.second->ne[1] == 1) {
+                // we never quantize those
+                continue;
+            }
+            std::array<uint64_t, 32> H = {};
+            analyze_tensor_fp(kv_tensor.second, H.data());
+            print_fp_stats("Total", H.data());
+        }
+        return 0;
+    }
+
     // loop throught quantization types
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         const ggml_type type = (ggml_type) i;
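With this in place, the F16 exponent analysis can be run instead of the usual quantization round-trip, roughly as follows (binary and model names are placeholders; the exact executable name depends on the build):

    ./llama-quantize-stats -m model-f16.gguf --analyze-fp

Because the branch ends with return 0;, the per-quantization-type loop below it is skipped entirely when the flag is given.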
@@ -5882,18 +5882,40 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     if (ml.n_elements >= 1e12) {
-        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
+        LLAMA_LOG_INFO("%s: model params = %.3f T\n", __func__, ml.n_elements*1e-12);
     } else if (ml.n_elements >= 1e9) {
-        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+        LLAMA_LOG_INFO("%s: model params = %.3f B\n", __func__, ml.n_elements*1e-9);
     } else if (ml.n_elements >= 1e6) {
-        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
+        LLAMA_LOG_INFO("%s: model params = %.3f M\n", __func__, ml.n_elements*1e-6);
     } else {
-        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
+        LLAMA_LOG_INFO("%s: model params = %.3f K\n", __func__, ml.n_elements*1e-3);
     }
     if (ml.n_bytes < GiB) {
-        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size = %.3f MiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
-        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size = %.3f GiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
+    {
+        auto n_bytes = ml.n_bytes;
+        auto n_elements = ml.n_elements;
+        auto meta_tke = ml.get_tensor_meta("token_embd.weight");
+        auto meta_out = ml.get_tensor_meta("output.weight");
+        if (meta_tke && meta_out) {
+            n_bytes -= ggml_nbytes(meta_tke);
+            n_elements -= ggml_nelements(meta_tke);
+            n_bytes -= ggml_nbytes(meta_out);
+            n_elements -= ggml_nelements(meta_out);
+            if (n_bytes < GiB) {
+                LLAMA_LOG_INFO("%s: repeating layers = %.3f MiB (%.3f BPW", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
+            } else {
+                LLAMA_LOG_INFO("%s: repeating layers = %.3f GiB (%.3f BPW", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+            }
+            if (ml.n_elements >= 1e9) {
+                printf(", %.3f B parameters)\n", n_elements*1e-9);
+            } else {
+                printf(", %.3f M parameters)\n", n_elements*1e-6);
+            }
+        }
+    }
 
     // general kv
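The new block reports the size and bits-per-weight (BPW) of the repeating transformer layers alone, by subtracting token_embd.weight and output.weight from the totals; BPW is simply total bits divided by total elements, as in the log lines above. A small self-contained sketch of that arithmetic (hypothetical helper name, made-up numbers, not the llama.cpp code):

    // bpw_sketch.cpp -- illustrative only; the figures below are invented.
    #include <cstdint>
    #include <cstdio>

    // Bits per weight = total size in bits divided by the number of elements.
    static double bits_per_weight(uint64_t n_bytes, uint64_t n_elements) {
        return n_elements ? 8.0 * n_bytes / n_elements : 0.0;
    }

    int main() {
        // Hypothetical 7B-class model at ~4.5 BPW overall, with F16 (2 bytes/element)
        // token_embd.weight and output.weight left unquantized.
        uint64_t n_bytes    = 3'940'000'000ull, n_elements    = 7'000'000'000ull;
        uint64_t embd_bytes =   525'000'000ull, embd_elements =   262'500'000ull;

        printf("model size       = %.3f GiB (%.3f BPW)\n",
               n_bytes / 1024.0 / 1024.0 / 1024.0,
               bits_per_weight(n_bytes, n_elements));
        printf("repeating layers = %.3f GiB (%.3f BPW)\n",
               (n_bytes - embd_bytes) / 1024.0 / 1024.0 / 1024.0,
               bits_per_weight(n_bytes - embd_bytes, n_elements - embd_elements));
        return 0;
    }

Separating out the repeating layers is useful because the embedding and output tensors are often kept at higher precision, so the overall BPW can understate how aggressively the rest of the model was quantized.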