mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-12 06:50:08 +00:00
Merge branch 'main' into andrewkchan/try_trellis
This commit is contained in:
@@ -51,5 +51,6 @@ else()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(sweep-bench)
|
||||
add_subdirectory(tokenize)
|
||||
endif()
|
||||
|
||||
@@ -19,6 +19,8 @@
|
||||
#include <fstream>
|
||||
#include <unordered_map>
|
||||
#include <algorithm>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
@@ -39,6 +41,7 @@ struct Stats {
|
||||
std::vector<float> values;
|
||||
std::vector<int> counts;
|
||||
int ncall = 0;
|
||||
int n_as = 1;
|
||||
};
|
||||
|
||||
class IMatrixCollector {
|
||||
@@ -48,13 +51,59 @@ public:
|
||||
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
void save_imatrix(int ncall = -1) const;
|
||||
bool load_imatrix(const char * file_name);
|
||||
// Turns collection of layer-similarity statistics on or off by storing the flag in m_collect_lsim.
void set_collect_lsim(bool yes_or_no) { m_collect_lsim = yes_or_no; }
|
||||
void print_layer_importance();
|
||||
private:
|
||||
std::unordered_map<std::string, Stats> m_stats;
|
||||
gpt_params m_params;
|
||||
std::mutex m_mutex;
|
||||
int m_last_call = 0;
|
||||
int m_last_layer = 9999;
|
||||
int m_last_ffn = -1;
|
||||
std::vector<float> m_src1_data;
|
||||
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
|
||||
std::vector<float> m_last_input;
|
||||
std::vector<float> m_ffn_input;
|
||||
std::vector<std::pair<double,int>> m_layer_sim;
|
||||
std::vector<std::pair<double,int>> m_attn_sim;
|
||||
std::vector<std::pair<double,int>> m_ffn_sim;
|
||||
bool m_collect_lsim = false;
|
||||
|
||||
std::optional<int> layer_index(const std::string& name) const {
|
||||
if (name == m_params.output_tensor_name && m_last_layer < 199) {
|
||||
return m_last_layer + 1;
|
||||
}
|
||||
if (auto pos = name.find("blk."); pos == 0) {
|
||||
pos += 4;
|
||||
if (auto pos1 = name.find('.', pos); pos1 != std::string::npos) {
|
||||
auto index_str = name.substr(pos, pos1 - pos);
|
||||
std::istringstream str(index_str);
|
||||
int index; str >> index;
|
||||
if (!str.fail()) return index;
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Cosine similarity of the two n-element vectors x and y.
// The products are formed in float and accumulated in double.
// Returns 0 when either vector has a zero squared norm.
static inline double cosine_similarity(int n, const float * x, const float * y) {
    double dot = 0, norm2_x = 0, norm2_y = 0;
    for (int k = 0; k < n; ++k) {
        const float xk = x[k];
        const float yk = y[k];
        dot     += xk*yk;
        norm2_x += xk*xk;
        norm2_y += yk*yk;
    }
    if (!(norm2_x > 0 && norm2_y > 0)) {
        return 0;
    }
    return dot/sqrt(norm2_x*norm2_y);
}
|
||||
|
||||
static inline void collect_cos_similarity(int nrow, int n, const float * x, const float * y, std::pair<double, int>& p) {
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
p.first += cosine_similarity(n, x, y);
|
||||
p.second += 1;
|
||||
x += n;
|
||||
y += n;
|
||||
}
|
||||
}
|
||||
|
||||
static void print_layer_importance(const char * msg, const std::vector<std::pair<double, int>>& sim);
|
||||
};
|
||||
|
||||
// remove any prefix and suffixes from the name
|
||||
@@ -76,6 +125,45 @@ static std::string filter_tensor_name(const char * name) {
|
||||
return wname;
|
||||
}
|
||||
|
||||
// Prints msg followed by one line per layer, ordered by increasing |mean cosine similarity|.
// sim[i] holds (sum of cosine similarities, number of samples) for layer i; layers without
// samples are left out. Nothing is printed when no layer has samples.
void IMatrixCollector::print_layer_importance(const char * msg, const std::vector<std::pair<double, int>>& sim) {
    // (|mean cos_sim|, layer index) for every layer that has data
    std::vector<std::pair<float, int>> ranked;
    ranked.reserve(sim.size());
    int layer = 0;
    for (const auto& entry : sim) {
        if (entry.second > 0) {
            ranked.emplace_back(float(std::abs(entry.first/entry.second)), layer);
        }
        ++layer;
    }
    if (ranked.empty()) {
        return;
    }

    std::sort(ranked.begin(), ranked.end());

    printf("%s\n", msg);
    for (int rank = 0; rank < int(ranked.size()); ++rank) {
        const int    idx     = ranked[rank].second;
        const auto & samples = sim[idx];
        // The signed mean is printed, even though the ordering uses its absolute value
        printf("%3d: Layer %3d, <cos_sim> = %g\n", rank, idx, samples.first/samples.second);
    }
}
|
||||
|
||||
void IMatrixCollector::print_layer_importance() {
|
||||
print_layer_importance("\n======================== sorted layer importances", m_layer_sim);
|
||||
print_layer_importance("\n======================== sorted attention importances", m_attn_sim);
|
||||
print_layer_importance("\n======================== sorted ffn importances", m_ffn_sim);
|
||||
//printf("%s: have %d layers\n", __func__, int(m_layer_sim.size()));
|
||||
//if (m_layer_sim.empty()) return;
|
||||
//std::vector<std::pair<float, int>> layers;
|
||||
//layers.reserve(m_layer_sim.size());
|
||||
//for (int i = 0; i < int(m_layer_sim.size()); ++i) {
|
||||
// if (m_layer_sim[i].second > 0) layers.emplace_back(float(std::abs(m_layer_sim[i].first/m_layer_sim[i].second)), i);
|
||||
//}
|
||||
//if (layers.empty()) return;
|
||||
//std::sort(layers.begin(), layers.end());
|
||||
//printf("======================== sorted layer importances\n");
|
||||
//int j = 0;
|
||||
//for (auto& p : layers) {
|
||||
// int i = p.second;
|
||||
// printf("%3d: Layer %3d, <cos_sim> = %g\n", j++, i, m_layer_sim[i].first/m_layer_sim[i].second);
|
||||
//}
|
||||
}
|
||||
|
||||
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
GGML_UNUSED(user_data);
|
||||
|
||||
@@ -91,7 +179,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
// why are small batches ignored (<16 tokens)?
|
||||
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
|
||||
//printf("wname = %s\n", wname.c_str());
|
||||
if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == m_params.output_tensor_name))) return false;
|
||||
if (!(wname.substr(0, 4) == "blk." || ((m_params.process_output || m_collect_lsim) && wname == m_params.output_tensor_name))) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -107,6 +195,33 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
|
||||
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
|
||||
|
||||
if (m_collect_lsim) {
|
||||
if (wname.find(".ffn_") != std::string::npos) {
|
||||
if (auto index = layer_index(wname); index.has_value() && *index == m_last_layer && *index != m_last_ffn) {
|
||||
int n = src1->ne[0];
|
||||
int nrow = t->op == GGML_OP_MUL_MAT_ID ? src1->ne[2] : src1->ne[1];
|
||||
if (t->op == GGML_OP_MUL_MAT_ID) {
|
||||
GGML_ASSERT(src1->ne[1] == 1);
|
||||
}
|
||||
if (m_ffn_input.empty()) {
|
||||
m_ffn_input.resize(nrow*n);
|
||||
} else {
|
||||
if ((int)m_ffn_input.size() != nrow*n) {
|
||||
printf("Oops, inconsistent ffn size\n"); exit(1);
|
||||
}
|
||||
}
|
||||
std::memcpy(m_ffn_input.data(), data, nrow*n*sizeof(float));
|
||||
if (m_ffn_input.size() != m_last_input.size()) {
|
||||
printf("Oops, inconsistent ffn vs last_input size\n"); exit(1);
|
||||
}
|
||||
if (m_attn_sim.size() < *index + 1) m_attn_sim.resize(*index + 1);
|
||||
auto& p = m_attn_sim[*index];
|
||||
collect_cos_similarity(nrow, n, m_ffn_input.data(), m_last_input.data(), p);
|
||||
m_last_ffn = *index;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// this has been adapted to the new format of storing merged experts in a single 3d tensor
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/6387
|
||||
if (t->op == GGML_OP_MUL_MAT_ID) {
|
||||
@@ -132,11 +247,15 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0]*n_as, 0);
|
||||
e.counts.resize(src1->ne[0]*n_as, 0);
|
||||
e.n_as = n_as;
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
|
||||
exit(1); //GGML_ABORT("fatal error");
|
||||
}
|
||||
else if (e.n_as != n_as) {
|
||||
fprintf(stderr, "Oops: inconsistent n_as for %s (%d vs %d)\n", wname.c_str(), e.n_as, n_as);
|
||||
}
|
||||
if (m_params.verbosity > 1) {
|
||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
|
||||
}
|
||||
@@ -177,6 +296,39 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (m_collect_lsim) {
|
||||
// We only need to do it here and not in the MoE branch above because the first tensor in a layer
|
||||
// never is a MoE tensor
|
||||
if (auto index = layer_index(wname); index.has_value()) {
|
||||
if (*index != m_last_layer) {
|
||||
if (*index > 0) {
|
||||
if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
|
||||
printf("Oops: different size (%d vs %d). Tensor name was %s, m_last_layer = %d\n",
|
||||
(int)(src1->ne[0]*src1->ne[1]), (int)m_last_input.size(), src0->name, m_last_layer);
|
||||
exit(1);
|
||||
}
|
||||
if (*index > m_layer_sim.size()) m_layer_sim.resize(*index);
|
||||
auto& p = m_layer_sim[*index - 1];
|
||||
collect_cos_similarity(src1->ne[1], src1->ne[0], m_last_input.data(), (const float *)data, p);
|
||||
if (*index == m_last_ffn + 1) {
|
||||
if (*index > m_ffn_sim.size()) m_ffn_sim.resize(*index);
|
||||
auto& p1 = m_ffn_sim[*index-1];
|
||||
collect_cos_similarity(src1->ne[1], src1->ne[0], m_ffn_input.data(), (const float *)data, p1);
|
||||
}
|
||||
}
|
||||
m_last_layer = *index;
|
||||
if (m_last_input.empty()) {
|
||||
m_last_input.resize(src1->ne[0]*src1->ne[1]);
|
||||
} else {
|
||||
if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
|
||||
printf("Oops\n"); exit(1);
|
||||
}
|
||||
}
|
||||
//printf("Copying src1 to m_last_input\n");
|
||||
std::memcpy(m_last_input.data(), data, src1->ne[0]*src1->ne[1]*sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
auto & e = m_stats[wname];
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0], 0);
|
||||
@@ -190,7 +342,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
if (m_params.verbosity > 1) {
|
||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||
}
|
||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||
for (int row = 0; row < (int)(src1->ne[1]*src1->ne[2]); ++row) {
|
||||
const float * x = data + row * src1->ne[0];
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[j] += x[j]*x[j];
|
||||
@@ -258,8 +410,38 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
||||
}
|
||||
|
||||
if (n_zeros > 0) {
|
||||
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
|
||||
continue;
|
||||
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%)", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
|
||||
bool store_it = false;
|
||||
if (kv.second.n_as > 1) {
|
||||
int n_per_expert = n_all / kv.second.n_as;
|
||||
std::vector<int> bad_experts;
|
||||
bad_experts.reserve(kv.second.n_as);
|
||||
for (int i = 0; i < kv.second.n_as; ++i) {
|
||||
auto counts = kv.second.counts.data() + i*n_per_expert;
|
||||
int nz_i = 0;
|
||||
for (int j = 0; j < n_per_expert; ++j) {
|
||||
if (counts[j] == 0) ++nz_i;
|
||||
}
|
||||
if (nz_i > 0) bad_experts.push_back(i);
|
||||
}
|
||||
fprintf(stderr, " %d out of %d experts are missing data", int(bad_experts.size()), kv.second.n_as);
|
||||
if (bad_experts.size() < round(kv.second.n_as * 0.05)) {
|
||||
fprintf(stderr, " Storing **but be aware**\n");
|
||||
store_it = true;
|
||||
for (auto i : bad_experts) {
|
||||
auto counts = (int *)kv.second.counts.data() + i*n_per_expert;
|
||||
auto values = (float *)kv.second.values.data() + i*n_per_expert;
|
||||
for (int j = 0; j < n_per_expert; ++j) {
|
||||
counts[j] = 1;
|
||||
values[j] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!store_it) {
|
||||
fprintf(stderr, " - skipping\n");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
n_entries++;
|
||||
@@ -587,7 +769,25 @@ int main(int argc, char ** argv) {
|
||||
params.logits_all = true;
|
||||
params.verbosity = 1;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
bool lsim = false;
|
||||
//
|
||||
// Do not pollute common with totally imatrix specific arguments as it was done in mainline.
|
||||
// Instead, parse imatrix specific args here, push unknown args into a new array of args,
|
||||
// and pass that to gpt_params_parse().
|
||||
//
|
||||
std::vector<char*> args;
|
||||
args.reserve(argc);
|
||||
args.push_back(argv[0]);
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
std::string arg{argv[i]};
|
||||
if (arg == "-lsim" || arg == "--layer-similarity") {
|
||||
lsim = true;
|
||||
} else {
|
||||
args.push_back(argv[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (!gpt_params_parse(args.size(), args.data(), params)) {
|
||||
print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
@@ -595,6 +795,7 @@ int main(int argc, char ** argv) {
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
|
||||
g_collector.set_params(params);
|
||||
g_collector.set_collect_lsim(lsim);
|
||||
|
||||
for (const auto & in_file : params.in_files) {
|
||||
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
|
||||
@@ -645,6 +846,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
g_collector.save_imatrix();
|
||||
g_collector.print_layer_importance();
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
//
|
||||
// Copyright (C) 2023-2025 The llama.cpp authors
|
||||
// Copyright (C) 2024-2025 Iwan Kawrakow
|
||||
// MIT license
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
@@ -41,6 +48,12 @@ static uint64_t get_time_ns() {
|
||||
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
|
||||
}
|
||||
|
||||
// Streams a std::pair as "{first, second}".
template <typename T1, typename T2>
std::ostream& operator<<(std::ostream& str, const std::pair<T1, T2>& item) {
    return str << '{' << item.first << ", " << item.second << '}';
}
|
||||
|
||||
template<class T>
|
||||
static std::string join(const std::vector<T> & values, const std::string & delim) {
|
||||
std::ostringstream str;
|
||||
@@ -215,6 +228,9 @@ static std::string pair_str(const std::pair<int, int> & p) {
|
||||
return buf;
|
||||
}
|
||||
|
||||
// Ser = Smart Expert Reduction
|
||||
using Ser = std::pair<int,float>;
|
||||
|
||||
struct cmd_params {
|
||||
std::vector<std::string> model;
|
||||
std::vector<int> n_prompt;
|
||||
@@ -225,21 +241,27 @@ struct cmd_params {
|
||||
std::vector<int> n_ubatch;
|
||||
std::vector<ggml_type> type_k;
|
||||
std::vector<ggml_type> type_v;
|
||||
std::vector<int> n_threads;
|
||||
std::vector<std::pair<int,int>> n_threads;
|
||||
std::vector<int> n_gpu_layers;
|
||||
std::vector<std::string> rpc_servers;
|
||||
std::vector<llama_split_mode> split_mode;
|
||||
std::vector<int> main_gpu;
|
||||
std::vector<bool> no_kv_offload;
|
||||
std::vector<bool> flash_attn;
|
||||
std::vector<int> mla_attn;
|
||||
std::vector<int> attn_max_batch;
|
||||
std::vector<Ser> ser;
|
||||
std::vector<std::vector<float>> tensor_split;
|
||||
std::vector<bool> use_mmap;
|
||||
std::vector<bool> embeddings;
|
||||
std::vector<llama_model_tensor_buft_override> buft_overrides;
|
||||
ggml_numa_strategy numa;
|
||||
int reps;
|
||||
bool verbose;
|
||||
bool warmup;
|
||||
bool repack = false;
|
||||
bool fmoe = false;
|
||||
bool use_thp = false;
|
||||
output_formats output_format;
|
||||
output_formats output_format_stderr;
|
||||
};
|
||||
@@ -254,21 +276,27 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* n_ubatch */ {512},
|
||||
/* type_k */ {GGML_TYPE_F16},
|
||||
/* type_v */ {GGML_TYPE_F16},
|
||||
/* n_threads */ {cpu_get_num_math()},
|
||||
/* n_threads */ {{cpu_get_num_math(), cpu_get_num_math()}},
|
||||
/* n_gpu_layers */ {99},
|
||||
/* rpc_servers */ {""},
|
||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||
/* main_gpu */ {0},
|
||||
/* no_kv_offload */ {false},
|
||||
/* flash_attn */ {false},
|
||||
/* mla_attn */ {0},
|
||||
/* attn_max_batch */ {0},
|
||||
/* ser */ {{-1,0.0f}},
|
||||
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
||||
/* use_mmap */ {true},
|
||||
/* embeddings */ {false},
|
||||
/* buft_overrides */ {},
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
/* reps */ 5,
|
||||
/* verbose */ false,
|
||||
/* warmup */ true,
|
||||
/* repack */ false,
|
||||
/* use_thp */ false,
|
||||
/* fmoe */ false,
|
||||
/* output_format */ MARKDOWN,
|
||||
/* output_format_stderr */ NONE,
|
||||
};
|
||||
@@ -288,12 +316,16 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
||||
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||
printf(" -tgb, --threads-gen-batch <n1,n2> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
||||
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
||||
printf(" -mla, --mla-attn <0|1|2> (default: %s)\n", join(cmd_params_defaults.mla_attn, ",").c_str());
|
||||
printf(" -amb, --attn-max-batch <i> (default: %s)\n", join(cmd_params_defaults.attn_max_batch, ",").c_str());
|
||||
printf(" -ser, --smart-expert-reduction <i,f>(default: %s)\n", join(cmd_params_defaults.attn_max_batch, ",").c_str());
|
||||
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
||||
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
||||
@@ -304,6 +336,9 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||
printf(" -w, --warmup <0|1> (default: %s)\n", cmd_params_defaults.warmup ? "1" : "0");
|
||||
printf(" -rtr, --run-time-repack <0|1> (default: %s)\n", cmd_params_defaults.repack ? "1" : "0");
|
||||
printf(" -thp, --transparent-huge-pages <0|1> (default: %s)\n", cmd_params_defaults.use_thp? "1" : "0");
|
||||
printf(" -ot, --override-tensor pattern (default: none)\n");
|
||||
printf(" -fmoe, --fused-moe <0|1> (default: %s)\n", cmd_params_defaults.fmoe? "1" : "0");
|
||||
printf("\n");
|
||||
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
||||
}
|
||||
@@ -336,10 +371,68 @@ static ggml_type ggml_type_from_name(const std::string & s) {
|
||||
if (s == "q6_0") {
|
||||
return GGML_TYPE_Q6_0;
|
||||
}
|
||||
if (s == "q8_KV") {
|
||||
return GGML_TYPE_Q8_KV;
|
||||
}
|
||||
|
||||
return GGML_TYPE_COUNT;
|
||||
}
|
||||
|
||||
namespace {
|
||||
bool parse_buft_overrides(const std::string& value, std::vector<llama_model_tensor_buft_override>& overrides) {
|
||||
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
|
||||
if (buft_list.empty()) {
|
||||
// enumerate all the devices and add their buffer types to the list
|
||||
for (size_t i = 0; i < ggml_backend_reg_get_count(); ++i) {
|
||||
//auto * dev = ggml_backend_reg_get_name(i);
|
||||
auto * buft = ggml_backend_reg_get_default_buffer_type(i);
|
||||
if (buft) {
|
||||
buft_list[ggml_backend_buft_name(buft)] = buft;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (const auto & override : string_split<std::string>(value, ',')) {
|
||||
std::string::size_type pos = override.find('=');
|
||||
if (pos == std::string::npos) {
|
||||
fprintf(stderr, "Invalid buft override argument %s\n", value.c_str());
|
||||
return false;
|
||||
}
|
||||
std::string tensor_name = override.substr(0, pos);
|
||||
std::string buffer_type = override.substr(pos + 1);
|
||||
if (buft_list.find(buffer_type) == buft_list.end()) {
|
||||
fprintf(stderr, "Available buffer types:\n");
|
||||
for (const auto & it : buft_list) {
|
||||
fprintf(stderr, " %s\n", ggml_backend_buft_name(it.second));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// Splits str at delim and groups consecutive tokens into (T1, T2) pairs:
// tokens 0,1 form the first pair, tokens 2,3 the second, and so on.
// Each token is parsed with operator>> into the corresponding type.
// Returns an empty vector when any token fails to parse, or when the number of tokens is
// odd (a first value without its partner), so malformed input is never partially accepted.
template<class T1, class T2>
std::vector<std::pair<T1,T2>> string_split_pairs(const std::string & str, char delim) {
    std::vector<std::pair<T1,T2>> values;
    std::istringstream str_stream(str);
    std::string token;
    T1 first_value{};
    bool have_first = false; // true while a first value is waiting for its second value
    while (std::getline(str_stream, token, delim)) {
        std::istringstream token_stream(token);
        if (!have_first) {
            token_stream >> first_value;
            if (token_stream.fail()) return {};
            have_first = true;
        } else {
            T2 second_value{};
            token_stream >> second_value;
            if (token_stream.fail()) return {};
            values.emplace_back(first_value, second_value);
            have_first = false;
        }
    }
    // A dangling first value means the input was malformed; report it like a parse failure
    // instead of silently dropping the value.
    if (have_first) return {};
    return values;
}
|
||||
}
|
||||
|
||||
static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
cmd_params params;
|
||||
@@ -459,7 +552,23 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
||||
params.n_threads.reserve(params.n_threads.size() + p.size());
|
||||
for (auto t : p) params.n_threads.push_back({t, t});
|
||||
//params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
||||
} else if (arg == "-tgb" || arg == "--threads-gen-batch") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto ps = string_split<std::string>(argv[i], ';');
|
||||
for (auto& s : ps) {
|
||||
auto p = string_split<int>(s.c_str(), ',');
|
||||
if (p.size() != 2) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads.push_back({p[0], p[1]});
|
||||
}
|
||||
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -526,6 +635,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
}
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
|
||||
} else if (arg == "-mla" || arg == "--mla-attn") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.mla_attn.insert(params.mla_attn.end(), p.begin(), p.end());
|
||||
} else if (arg == "-amb" || arg == "--attn-max-batch") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.attn_max_batch.insert(params.attn_max_batch.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ser" || arg == "--smart-expert-reduction") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split_pairs<int,float>(argv[i], split_delim);
|
||||
params.ser.insert(params.ser.end(), p.begin(), p.end());
|
||||
} else if (arg == "-mmp" || arg == "--mmap") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -594,6 +724,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
params.repack = std::stoi(argv[i]);
|
||||
} else if (arg == "-thp" || arg == "--transparent-huge-pages") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.use_thp = std::stoi(argv[i]);
|
||||
} else if (arg == "-fmoe" || arg == "--fused-moe") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.fmoe = std::stoi(argv[i]);
|
||||
} else if (arg == "-ot" || arg == "--override-tensor") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if (!parse_buft_overrides(std::string{argv[i]}, params.buft_overrides)) {
|
||||
fprintf(stderr, "error: Invalid tensor buffer type override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
invalid_param = true;
|
||||
break;
|
||||
@@ -621,10 +773,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
|
||||
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
|
||||
if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
|
||||
if (params.mla_attn.empty()) { params.mla_attn = cmd_params_defaults.mla_attn; }
|
||||
if (params.attn_max_batch.empty()){ params.attn_max_batch = cmd_params_defaults.attn_max_batch; }
|
||||
if (params.ser.empty()) { params.ser = cmd_params_defaults.ser; }
|
||||
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
||||
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
||||
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
||||
if (!params.buft_overrides.empty()) params.buft_overrides.emplace_back(llama_model_tensor_buft_override{nullptr, nullptr});
|
||||
|
||||
return params;
|
||||
}
|
||||
@@ -649,17 +805,23 @@ struct cmd_params_instance {
|
||||
int n_ubatch;
|
||||
ggml_type type_k;
|
||||
ggml_type type_v;
|
||||
int n_threads;
|
||||
std::pair<int,int> n_threads;
|
||||
int n_gpu_layers;
|
||||
std::string rpc_servers;
|
||||
llama_split_mode split_mode;
|
||||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
bool flash_attn;
|
||||
int mla_attn;
|
||||
int attn_max_batch;
|
||||
Ser ser;
|
||||
std::vector<float> tensor_split;
|
||||
bool use_mmap;
|
||||
bool embeddings;
|
||||
bool repack = false;
|
||||
bool fmoe = false;
|
||||
bool use_thp = false;
|
||||
const llama_model_tensor_buft_override* buft_overrides;
|
||||
|
||||
llama_model_params to_llama_mparams() const {
|
||||
llama_model_params mparams = llama_model_default_params();
|
||||
@@ -673,6 +835,8 @@ struct cmd_params_instance {
|
||||
mparams.tensor_split = tensor_split.data();
|
||||
mparams.use_mmap = use_mmap;
|
||||
mparams.repack_tensors = repack;
|
||||
mparams.use_thp = use_thp;
|
||||
mparams.tensor_buft_overrides = buft_overrides;
|
||||
|
||||
return mparams;
|
||||
}
|
||||
@@ -685,6 +849,7 @@ struct cmd_params_instance {
|
||||
main_gpu == other.main_gpu &&
|
||||
use_mmap == other.use_mmap &&
|
||||
repack == other.repack &&
|
||||
use_thp == other.use_thp &&
|
||||
tensor_split == other.tensor_split;
|
||||
}
|
||||
|
||||
@@ -698,6 +863,11 @@ struct cmd_params_instance {
|
||||
cparams.type_v = type_v;
|
||||
cparams.offload_kqv = !no_kv_offload;
|
||||
cparams.flash_attn = flash_attn;
|
||||
cparams.mla_attn = mla_attn;
|
||||
cparams.attn_max_batch = attn_max_batch;
|
||||
cparams.fused_moe_up_gate = fmoe;
|
||||
cparams.min_experts = ser.first;
|
||||
cparams.thresh_experts = ser.second;
|
||||
cparams.embeddings = embeddings;
|
||||
|
||||
return cparams;
|
||||
@@ -722,6 +892,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
for (const auto & tv : params.type_v)
|
||||
for (const auto & nkvo : params.no_kv_offload)
|
||||
for (const auto & fa : params.flash_attn)
|
||||
for (const auto & mla : params.mla_attn)
|
||||
for (const auto & amb : params.attn_max_batch)
|
||||
for (const auto & ser : params.ser)
|
||||
for (const auto & nt : params.n_threads) {
|
||||
for (const auto & n_prompt : params.n_prompt) {
|
||||
if (n_prompt == 0) {
|
||||
@@ -743,10 +916,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .main_gpu = */ mg,
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .flash_attn = */ fa,
|
||||
/* .mla_attn = */ mla,
|
||||
/* .attn_max_b = */ amb,
|
||||
/* .ser = */ ser,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
/* .repack = */ params.repack,
|
||||
/* .fmoe = */ params.fmoe,
|
||||
/* .use_thp = */ params.use_thp,
|
||||
/* .buft_overrides=*/ params.buft_overrides.data(),
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -771,10 +950,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .main_gpu = */ mg,
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .flash_attn = */ fa,
|
||||
/* .mla_attn = */ mla,
|
||||
/* .attn_max_b = */ amb,
|
||||
/* .ser = */ ser,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
/* .repack = */ params.repack,
|
||||
/* .fmoe = */ params.fmoe,
|
||||
/* .use_thp = */ params.use_thp,
|
||||
/* .buft_overrides=*/ params.buft_overrides.data(),
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -799,10 +984,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .main_gpu = */ mg,
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .flash_attn = */ fa,
|
||||
/* .mla_attn = */ mla,
|
||||
/* .attn_max_b = */ amb,
|
||||
/* .ser = */ ser,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
/* .repack = */ params.repack,
|
||||
/* .fmoe = */ params.fmoe,
|
||||
/* .use_thp = */ params.use_thp,
|
||||
/* .buft_overrides=*/ params.buft_overrides.data(),
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -827,10 +1018,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .main_gpu = */ mg,
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .flash_attn = */ fa,
|
||||
/* .mla_attn = */ mla,
|
||||
/* .attn_max_b = */ amb,
|
||||
/* .ser = */ ser,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .use_mmap = */ mmp,
|
||||
/* .embeddings = */ embd,
|
||||
/* .repack = */ params.repack,
|
||||
/* .fmoe = */ params.fmoe,
|
||||
/* .use_thp = */ params.use_thp,
|
||||
/* .buft_overrides=*/ params.buft_overrides.data(),
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -857,7 +1054,7 @@ struct test {
|
||||
uint64_t model_n_params;
|
||||
int n_batch;
|
||||
int n_ubatch;
|
||||
int n_threads;
|
||||
std::pair<int,int> n_threads;
|
||||
bool has_rpc;
|
||||
ggml_type type_k;
|
||||
ggml_type type_v;
|
||||
@@ -866,10 +1063,15 @@ struct test {
|
||||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
bool flash_attn;
|
||||
int mla_attn;
|
||||
int attn_max_batch;
|
||||
Ser ser;
|
||||
std::vector<float> tensor_split;
|
||||
bool use_mmap;
|
||||
bool embeddings;
|
||||
bool repack = false;
|
||||
bool fmoe = false;
|
||||
bool use_thp = false;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
std::string test_time;
|
||||
@@ -895,10 +1097,15 @@ struct test {
|
||||
main_gpu = inst.main_gpu;
|
||||
no_kv_offload = inst.no_kv_offload;
|
||||
flash_attn = inst.flash_attn;
|
||||
mla_attn = inst.mla_attn;
|
||||
attn_max_batch = inst.attn_max_batch;
|
||||
ser = inst.ser;
|
||||
tensor_split = inst.tensor_split;
|
||||
use_mmap = inst.use_mmap;
|
||||
embeddings = inst.embeddings;
|
||||
repack = inst.repack;
|
||||
fmoe = inst.fmoe;
|
||||
use_thp = inst.use_thp;
|
||||
n_prompt = inst.n_prompt;
|
||||
n_gen = inst.n_gen;
|
||||
test_kind = inst.test_kind;
|
||||
@@ -988,8 +1195,8 @@ struct test {
|
||||
"n_batch", "n_ubatch",
|
||||
"n_threads", "type_k", "type_v",
|
||||
"n_gpu_layers", "split_mode",
|
||||
"main_gpu", "no_kv_offload", "flash_attn",
|
||||
"tensor_split", "use_mmap", "embeddings", "repack",
|
||||
"main_gpu", "no_kv_offload", "flash_attn", "mla_attn", "attn_max_batch", "ser",
|
||||
"tensor_split", "use_mmap", "embeddings", "repack", "fused_moe", "use_thp",
|
||||
"n_prompt", "n_gen", "test_time",
|
||||
"avg_ns", "stddev_ns",
|
||||
"avg_ts", "stddev_ts", "test",
|
||||
@@ -1004,13 +1211,14 @@ struct test {
|
||||
field == "n_threads" ||
|
||||
field == "model_size" || field == "model_n_params" ||
|
||||
field == "n_gpu_layers" || field == "main_gpu" ||
|
||||
field == "n_prompt" || field == "n_gen" ||
|
||||
field == "n_prompt" || field == "n_gen" || field == "mla_attn" || field == "attn_max_batch" ||
|
||||
field == "avg_ns" || field == "stddev_ns") {
|
||||
return INT;
|
||||
}
|
||||
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
||||
field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack") {
|
||||
field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack" || field == "use_thp" ||
|
||||
field == "fused_moe") {
|
||||
return BOOL;
|
||||
}
|
||||
if (field == "avg_ts" || field == "stddev_ts") {
|
||||
@@ -1035,6 +1243,12 @@ struct test {
|
||||
tensor_split_str += "/";
|
||||
}
|
||||
}
|
||||
auto ser_to_string = [] (const Ser& ser) {
|
||||
std::ostringstream str;
|
||||
str << ser.first << ',' << ser.second;
|
||||
return str.str();
|
||||
};
|
||||
bool is_gen = n_gen > 0;
|
||||
std::vector<std::string> values = {
|
||||
build_commit, std::to_string(build_number),
|
||||
std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
|
||||
@@ -1042,10 +1256,12 @@ struct test {
|
||||
cpu_info, gpu_info,
|
||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||
std::to_string(n_batch), std::to_string(n_ubatch),
|
||||
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||
std::to_string(is_gen ? n_threads.first : n_threads.second), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
||||
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
|
||||
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), std::to_string(repack),
|
||||
std::to_string(mla_attn), std::to_string(attn_max_batch), ser_to_string(ser),
|
||||
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
|
||||
std::to_string(repack), std::to_string(fmoe), std::to_string(use_thp),
|
||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
||||
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
||||
std::to_string(avg_ts()), std::to_string(stdev_ts()),
|
||||
@@ -1208,12 +1424,27 @@ struct markdown_printer : public printer {
|
||||
if (field == "flash_attn") {
|
||||
return 2;
|
||||
}
|
||||
if (field == "mla_attn") {
|
||||
return 3;
|
||||
}
|
||||
if (field == "attn_max_batch") {
|
||||
return 5;
|
||||
}
|
||||
if (field == "ser") {
|
||||
return 10;
|
||||
}
|
||||
if (field == "use_mmap") {
|
||||
return 4;
|
||||
}
|
||||
if (field == "repack") {
|
||||
return 3;
|
||||
}
|
||||
if (field == "use_thp") {
|
||||
return 3;
|
||||
}
|
||||
if (field == "fused_moe") {
|
||||
return 4;
|
||||
}
|
||||
if (field == "test") {
|
||||
return 13;
|
||||
}
|
||||
@@ -1242,12 +1473,27 @@ struct markdown_printer : public printer {
|
||||
if (field == "flash_attn") {
|
||||
return "fa";
|
||||
}
|
||||
if (field == "mla_attn") {
|
||||
return "mla";
|
||||
}
|
||||
if (field == "attn_max_batch") {
|
||||
return "amb";
|
||||
}
|
||||
// Short column header for the "ser" field. The condition previously repeated the
// "attn_max_batch" test from the branch above, which made this branch unreachable.
if (field == "ser") {
    return "ser";
}
|
||||
if (field == "use_mmap") {
|
||||
return "mmap";
|
||||
}
|
||||
if (field == "repack") {
|
||||
return "rtr";
|
||||
}
|
||||
if (field == "use_thp") {
|
||||
return "thp";
|
||||
}
|
||||
if (field == "fused_moe") {
|
||||
return "fmoe";
|
||||
}
|
||||
if (field == "embeddings") {
|
||||
return "embd";
|
||||
}
|
||||
@@ -1294,6 +1540,15 @@ struct markdown_printer : public printer {
|
||||
if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
|
||||
fields.emplace_back("flash_attn");
|
||||
}
|
||||
if (params.mla_attn.size() > 1 || params.mla_attn != cmd_params_defaults.mla_attn) {
|
||||
fields.emplace_back("mla_attn");
|
||||
}
|
||||
// Show the attn_max_batch column when several values are swept or the value differs from
// its own default. The comparison previously used the mla_attn default by mistake.
if (params.attn_max_batch.size() > 1 || params.attn_max_batch != cmd_params_defaults.attn_max_batch) {
    fields.emplace_back("attn_max_batch");
}
|
||||
if (params.ser.size() > 1 || params.ser != cmd_params_defaults.ser) {
|
||||
fields.emplace_back("ser");
|
||||
}
|
||||
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
||||
fields.emplace_back("tensor_split");
|
||||
}
|
||||
@@ -1306,6 +1561,12 @@ struct markdown_printer : public printer {
|
||||
if (params.repack != cmd_params_defaults.repack) {
|
||||
fields.emplace_back("repack");
|
||||
}
|
||||
if (params.use_thp != cmd_params_defaults.use_thp) {
|
||||
fields.emplace_back("use_thp");
|
||||
}
|
||||
if (params.fmoe != cmd_params_defaults.fmoe) {
|
||||
fields.emplace_back("fused_moe");
|
||||
}
|
||||
fields.emplace_back("test");
|
||||
fields.emplace_back("t/s");
|
||||
|
||||
@@ -1557,10 +1818,10 @@ int main(int argc, char ** argv) {
|
||||
if (params.warmup) {
|
||||
if (t.n_prompt > 0) {
|
||||
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
||||
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
||||
test_prompt(ctx, 1, 0, t.n_batch, t.n_threads.second);
|
||||
}
|
||||
if (t.n_gen > 0) {
|
||||
test_gen(ctx, 1, 0, t.n_threads);
|
||||
test_gen(ctx, 1, 0, t.n_threads.first);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1570,11 +1831,11 @@ int main(int argc, char ** argv) {
|
||||
uint64_t t_start = get_time_ns();
|
||||
|
||||
if (t.n_prompt > 0) {
|
||||
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
||||
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads.second);
|
||||
}
|
||||
if (t.test_kind == TEST_KIND_GP) t_start = get_time_ns();
|
||||
if (t.n_gen > 0) {
|
||||
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
|
||||
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads.first);
|
||||
}
|
||||
|
||||
uint64_t t_ns = get_time_ns() - t_start;
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
//
|
||||
// Copyright (C) 2023-2025 The llama.cpp authors
|
||||
// Copyright (C) 2024-2025 Iwan Kawrakow
|
||||
// MIT license
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -126,7 +133,7 @@ static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob
|
||||
max_logit = std::max(max_logit, logits[i]);
|
||||
min_logit = std::min(min_logit, logits[i]);
|
||||
}
|
||||
min_logit = std::max(min_logit, max_logit - 16);
|
||||
min_logit = std::max(min_logit, max_logit - 24);
|
||||
double sum_exp = 0.0;
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
sum_exp += expf(logits[i] - max_logit);
|
||||
@@ -166,7 +173,7 @@ static void process_logits(
|
||||
break;
|
||||
}
|
||||
lock.unlock();
|
||||
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
|
||||
const results_log_softmax results = log_softmax(n_vocab, logits + int64_t(i)*n_vocab, tokens[i+1]);
|
||||
const double v = -results.log_softmax;
|
||||
local_nll += v;
|
||||
local_nll2 += v*v;
|
||||
@@ -200,7 +207,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
|
||||
break;
|
||||
}
|
||||
lock.unlock();
|
||||
const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
|
||||
// Widen i before multiplying for BOTH offsets, as the other process_logits overload does:
// i*n_vocab can overflow a 32-bit product for large vocabularies and long token runs.
const double v = log_softmax(n_vocab, logits + int64_t(i)*n_vocab, log_probs.data() + int64_t(i)*nv, tokens[i+1]);
|
||||
local_nll += v;
|
||||
local_nll2 += v*v;
|
||||
}
|
||||
@@ -618,7 +625,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
|
||||
if (num_batches > 1 && n_outputs > 0) {
|
||||
const auto * batch_logits = llama_get_logits(ctx);
|
||||
logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
|
||||
logits.insert(logits.end(), batch_logits, batch_logits + int64_t(n_outputs) * n_vocab);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
//
|
||||
// Copyright (C) 2023-2025 The llama.cpp authors
|
||||
// Copyright (C) 2024-2025 Iwan Kawrakow
|
||||
// MIT license
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
//
|
||||
// Copyright (C) 2023-2025 The llama.cpp authors
|
||||
// Copyright (C) 2024-2025 Iwan Kawrakow
|
||||
// MIT license
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -58,6 +65,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q5_0_R4", LLAMA_FTYPE_MOSTLY_Q5_0_R4, " 5.50 bpw quantization", },
|
||||
{ "Q6_0_R4", LLAMA_FTYPE_MOSTLY_Q6_0_R4, " 6.50 bpw quantization", },
|
||||
{ "Q8_0_R8", LLAMA_FTYPE_MOSTLY_Q8_0_R8, " 8.50 bpw quantization", },
|
||||
{ "Q8_KV", LLAMA_FTYPE_MOSTLY_Q8_KV, " 8.00 bpw quantization", },
|
||||
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
|
||||
{ "IQ4_KS", LLAMA_FTYPE_MOSTLY_IQ4_KS, " 4.25 bpw non-linear quantization", },
|
||||
{ "IQ4_KS_R4",LLAMA_FTYPE_MOSTLY_IQ4_KS_R4,"IQ4_KS repacked", },
|
||||
@@ -85,6 +93,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q6_K_R4", LLAMA_FTYPE_MOSTLY_Q6_K_R4, "Q6_K repacked", },
|
||||
{ "Q8_K_R8", LLAMA_FTYPE_MOSTLY_Q8_K_R8, "Q8_K repacked", },
|
||||
{ "Q8_KV_R8", LLAMA_FTYPE_MOSTLY_Q8_KV_R8, "Q8_KV repacked", },
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||
@@ -136,15 +145,19 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
||||
//
|
||||
[[noreturn]]
|
||||
static void usage(const char * executable) {
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
|
||||
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
|
||||
printf(" --hide-imatrix: do not store imatrix details in the quantized model\n");
|
||||
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
||||
// --exclude-weights is the opposite of --include-weights: the listed tensors do NOT use the imatrix.
printf(" --exclude-weights tensor_name: do not use importance matrix for this/these tensor(s)\n");
|
||||
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor.\n");
|
||||
printf(" --token-embedding-type ggml_type: use this ggml_type for the token_embd.weight tensor.\n\n");
|
||||
printf(" --custom-q regex1=type1,regex2=type2...: use this to specify custom quantization type rules.\n\n");
|
||||
printf(" --repack Repack all tensors to the corresponding _r4/8 variant if available.\n\n");
|
||||
printf(" --repack-pattern Comma separated list of regexs to use for matching tensor names to be repacked.\n\n");
|
||||
printf("Additional specific tensor quantization types used in the custom quant scheme 'CQS (default is Q2_K):\n");
|
||||
printf(" --attn-q-type ggml_type: use this ggml_type for the attn_q.weight tensor.\n");
|
||||
printf(" --attn-k-type ggml_type: use this ggml_type for the attn_k.weight tensor.\n");
|
||||
@@ -291,6 +304,28 @@ static ggml_type parse_ggml_type(const char * arg) {
|
||||
return result;
|
||||
}
|
||||
|
||||
using CustomQ = std::pair<std::string, ggml_type>;
|
||||
|
||||
static bool parse_custom_quants(const std::string& arg, std::vector<CustomQ>& custom_quants) {
|
||||
for (const auto & item : string_split<std::string>(arg, ',')) {
|
||||
auto pos = item.find('=');
|
||||
if (pos == std::string::npos) {
|
||||
fprintf(stderr, "Invalid custom quantization input %s\n", arg.c_str());
|
||||
return false;
|
||||
}
|
||||
auto pattern = item.substr(0, pos);
|
||||
auto type_as_string = item.substr(pos + 1);
|
||||
auto type = parse_ggml_type(type_as_string.c_str());
|
||||
if (type == GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "Invalid quantization type '%s' in custom quantization input %s\n", type_as_string.c_str(), item.c_str());
|
||||
return false;
|
||||
}
|
||||
printf("Adding custom rule %s -> %s\n", pattern.c_str(), ggml_type_name(type));
|
||||
custom_quants.emplace_back(std::move(pattern), type);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 3) {
|
||||
usage(argv[0]);
|
||||
@@ -302,12 +337,26 @@ int main(int argc, char ** argv) {
|
||||
std::string imatrix_file;
|
||||
std::vector<std::string> included_weights, excluded_weights;
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
std::vector<CustomQ> custom_quants;
|
||||
|
||||
std::vector<std::string> repack_patterns;
|
||||
|
||||
bool hide_imatrix = false;
|
||||
|
||||
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
|
||||
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
|
||||
params.quantize_output_tensor = false;
|
||||
} else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) {
|
||||
params.ignore_imatrix_rules = true;
|
||||
} else if (strcmp(argv[arg_idx], "--repack") == 0) {
|
||||
params.only_repack = true;
|
||||
} else if (strcmp(argv[arg_idx], "--repack-pattern") == 0) {
|
||||
if (arg_idx < argc-1) {
|
||||
auto p = string_split(argv[++arg_idx], ',');
|
||||
repack_patterns.insert(repack_patterns.end(), p.begin(), p.end());
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
|
||||
if (arg_idx < argc-1) {
|
||||
params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
|
||||
@@ -372,6 +421,10 @@ int main(int argc, char ** argv) {
|
||||
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--custom-q") == 0) {
|
||||
if (arg_idx == argc-1 || !parse_custom_quants(argv[++arg_idx], custom_quants)) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
|
||||
params.allow_requantize = true;
|
||||
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
|
||||
@@ -382,6 +435,8 @@ int main(int argc, char ** argv) {
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--hide-imatrix") == 0) {
|
||||
hide_imatrix = true;
|
||||
} else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
|
||||
if (arg_idx < argc-1) {
|
||||
included_weights.emplace_back(argv[++arg_idx]);
|
||||
@@ -401,6 +456,10 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
if (!repack_patterns.empty()) {
|
||||
params.repack_pattern = &repack_patterns;
|
||||
}
|
||||
|
||||
if (argc - arg_idx < 2) {
|
||||
printf("%s: bad arguments\n", argv[0]);
|
||||
usage(argv[0]);
|
||||
@@ -418,7 +477,11 @@ int main(int argc, char ** argv) {
|
||||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
||||
strncpy(kvo.val_str, imatrix_file.c_str(), 127);
|
||||
if (hide_imatrix) {
|
||||
strncpy(kvo.val_str, "top_secret", 127);
|
||||
} else {
|
||||
strncpy(kvo.val_str, imatrix_file.c_str(), 127);
|
||||
}
|
||||
kvo.val_str[127] = '\0';
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
@@ -426,7 +489,11 @@ int main(int argc, char ** argv) {
|
||||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
||||
strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
|
||||
if (hide_imatrix) {
|
||||
strncpy(kvo.val_str, "top_secret", 127);
|
||||
} else {
|
||||
strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
|
||||
}
|
||||
kvo.val_str[127] = '\0';
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
@@ -435,7 +502,11 @@ int main(int argc, char ** argv) {
|
||||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.val_i64 = imatrix_data.size();
|
||||
if (hide_imatrix) {
|
||||
kvo.val_i64 = 0;
|
||||
} else {
|
||||
kvo.val_i64 = imatrix_data.size();
|
||||
}
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
|
||||
@@ -443,7 +514,11 @@ int main(int argc, char ** argv) {
|
||||
llama_model_kv_override kvo;
|
||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.val_i64 = m_last_call;
|
||||
if (hide_imatrix) {
|
||||
kvo.val_i64 = 0;
|
||||
} else {
|
||||
kvo.val_i64 = m_last_call;
|
||||
}
|
||||
kv_overrides.emplace_back(std::move(kvo));
|
||||
}
|
||||
}
|
||||
@@ -452,6 +527,9 @@ int main(int argc, char ** argv) {
|
||||
kv_overrides.back().key[0] = 0;
|
||||
params.kv_overrides = &kv_overrides;
|
||||
}
|
||||
if (!custom_quants.empty()) {
|
||||
params.custom_quants = &custom_quants;
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
|
||||
5
examples/sweep-bench/CMakeLists.txt
Normal file
5
examples/sweep-bench/CMakeLists.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
# Builds the llama-sweep-bench example from sweep-bench.cpp, installs it as a runtime
# target, links it against common, llama and the thread libraries, and requires C++17.
set(TARGET llama-sweep-bench)
|
||||
add_executable(${TARGET} sweep-bench.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
65
examples/sweep-bench/README.md
Normal file
65
examples/sweep-bench/README.md
Normal file
@@ -0,0 +1,65 @@
|
||||
# ik_llama.cpp/examples/sweep-bench
|
||||
|
||||
Benchmark the prompt processing and token generation performance of `ik_llama.cpp`
|
||||
by doing a sweep over a whole context size and gathering performance metrics
|
||||
in each ubatch-sized window. Only a single token sequence is used.
|
||||
|
||||
The benchmark steps are:
|
||||
|
||||
for each ubatch-sized window in context:
|
||||
|
||||
1. generate ubatch/4 tokens (not the whole window to save some time)
|
||||
2. measure generation performance
|
||||
3. remove generated tokens from KV cache
|
||||
4. prepare a ubatch-sized batch of random tokens
|
||||
4. process the prepared batch
|
||||
5. measure prompt processing performance
|
||||
|
||||
The purpose of the benchmark is to visualize how the performance changes with
|
||||
the context size without averaging the metrics values over the whole context.
|
||||
|
||||
## Usage
|
||||
|
||||
./llama-sweep-bench -c 8704 -ub 512 -m models/Meta-Llama-3.2-3B-Instruct-Q8_0.gguf
|
||||
|
||||
## Sample results
|
||||
|
||||
- `PP` - prompt tokens per ubatch
|
||||
- `TG` - generated tokens per ubatch
|
||||
- `N_KV` - current KV cache size
|
||||
- `T_PP` - prompt processing time (i.e. time to first token)
|
||||
- `S_PP` - prompt processing speed (`PP/T_PP`)
|
||||
- `T_TG` - time to generate the `TG` tokens
|
||||
- `S_TG` - text generation speed (`TG/T_TG`)
|
||||
|
||||
| PP | TG | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s |
|
||||
|-------|--------|--------|----------|----------|----------|----------|
|
||||
| 512 | 128 | 0 | 1.100 | 465.51 | 2.311 | 55.38 |
|
||||
| 512 | 128 | 512 | 1.183 | 432.97 | 1.895 | 67.55 |
|
||||
| 512 | 128 | 1024 | 1.305 | 392.38 | 2.071 | 61.81 |
|
||||
| 512 | 128 | 1536 | 1.279 | 400.42 | 2.164 | 59.14 |
|
||||
| 512 | 128 | 2048 | 1.571 | 325.96 | 2.280 | 56.14 |
|
||||
| 512 | 128 | 2560 | 1.431 | 357.87 | 2.418 | 52.94 |
|
||||
| 512 | 128 | 3072 | 1.515 | 337.93 | 2.566 | 49.88 |
|
||||
| 512 | 128 | 3584 | 1.588 | 322.34 | 2.722 | 47.03 |
|
||||
| 512 | 128 | 4096 | 1.675 | 305.70 | 2.864 | 44.69 |
|
||||
| 512 | 128 | 4608 | 1.769 | 289.50 | 2.999 | 42.68 |
|
||||
| 512 | 128 | 5120 | 1.845 | 277.48 | 3.102 | 41.26 |
|
||||
| 512 | 128 | 5632 | 1.893 | 270.46 | 3.219 | 39.76 |
|
||||
| 512 | 128 | 6144 | 1.953 | 262.20 | 3.348 | 38.23 |
|
||||
| 512 | 128 | 6656 | 2.018 | 253.71 | 3.474 | 36.84 |
|
||||
| 512 | 128 | 7168 | 2.078 | 246.34 | 3.589 | 35.66 |
|
||||
| 512 | 128 | 7680 | 2.140 | 239.22 | 3.717 | 34.43 |
|
||||
| 512 | 128 | 8192 | 2.196 | 233.15 | 3.854 | 33.21 |
|
||||
|
||||
### JSONL output
|
||||
|
||||
Pass `--output-format jsonl` to output JSONL instead of Markdown, for example:
|
||||
|
||||
```json lines
|
||||
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 0, "t_pp": 1.093814, "speed_pp": 468.086884, "t_tg": 1.780312, "speed_tg": 71.897514 }
|
||||
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 512, "t_pp": 1.169302, "speed_pp": 437.868073, "t_tg": 1.897474, "speed_tg": 67.458099 }
|
||||
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 1024, "t_pp": 1.183700, "speed_pp": 432.542053, "t_tg": 2.059179, "speed_tg": 62.160694 }
|
||||
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 1536, "t_pp": 1.428625, "speed_pp": 358.386566, "t_tg": 2.160639, "speed_tg": 59.241734 }
|
||||
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 2048, "t_pp": 1.360647, "speed_pp": 376.291595, "t_tg": 2.274003, "speed_tg": 56.288403 }
|
||||
```
|
||||
118
examples/sweep-bench/sweep-bench-plot.py
Executable file
118
examples/sweep-bench/sweep-bench-plot.py
Executable file
@@ -0,0 +1,118 @@
|
||||
"""Plot llama-sweep-bench results.

Every positional argument is a file holding the Markdown table printed by
llama-sweep-bench. Rows are grouped by file name and n_kv, and two figures are
written to the current directory:

  performance_comparison_pp.png  - prompt processing speed vs. context length
  performance_comparison_tg.png  - token generation speed vs. context length
"""
import argparse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def read_markdown_table(md_file):
    """Read one Markdown results table into a DataFrame.

    The returned frame has numeric 'n_kv', 'speed_pp' and 'speed_tg' columns
    (plus the remaining table columns) and a 'label' column set to md_file.
    """
    # Split on '|' with optional surrounding whitespace; skiprows=[1] drops the
    # |------|------| separator row under the header.
    df_part = pd.read_csv(md_file, sep=r'\s*\|\s*', engine='python',
                          header=0, skiprows=[1])

    # The leading and trailing '|' of each row yield empty first/last columns.
    df_part = df_part.iloc[:, 1:-1]
    df_part.columns = [col.strip() for col in df_part.columns]

    # Rename columns to the names used by the rest of the script
    # (these are also the key names of the JSONL output).
    df_part = df_part.rename(columns={
        'N_KV': 'n_kv',
        'S_PP t/s': 'speed_pp',
        'S_TG t/s': 'speed_tg'
    })

    for column in ('n_kv', 'speed_pp', 'speed_tg'):
        df_part[column] = pd.to_numeric(df_part[column])

    df_part['label'] = md_file
    return df_part


def plot_metric(df_grouped, labels, colors, x_ticks, metric, marker,
                xlabel, ylabel, title, out_file):
    """Draw one error-bar line per label for `metric` and save the figure.

    `metric` is the column prefix ('speed_pp' or 'speed_tg'); the
    '<metric>_mean' and '<metric>_std' columns of df_grouped are plotted
    against 'n_kv'.
    """
    plt.figure(figsize=(10, 6))
    ax = plt.gca()
    plt.grid()
    ax.set_xticks(x_ticks)

    for label, color in zip(labels, colors):
        label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv')
        ax.errorbar(label_data['n_kv'], label_data[f'{metric}_mean'],
                    yerr=label_data[f'{metric}_std'], color=color,
                    marker=marker, linestyle='-', label=label)

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.title(title)
    ax.legend(loc='upper right')

    plt.tight_layout()
    plt.savefig(out_file, bbox_inches='tight')
    plt.close()


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('file', nargs='+')
    args = parser.parse_args()

    # Only Markdown tables are read here. JSONL output could be loaded with
    # pd.read_json(path, lines=True) instead, since it uses the same column names.
    df = pd.concat([read_markdown_table(md_file) for md_file in args.file])

    # Group by label and n_kv, calculate mean and std for both speed metrics
    df_grouped = df.groupby(['label', 'n_kv']).agg({
        'speed_pp': ['mean', 'std'],
        'speed_tg': ['mean', 'std']
    }).reset_index()

    # Flatten multi-index columns
    df_grouped.columns = ['label', 'n_kv', 'speed_pp_mean', 'speed_pp_std',
                          'speed_tg_mean', 'speed_tg_std']

    # Replace NaN with 0 (std for a single sample is NaN)
    df_grouped['speed_pp_std'] = df_grouped['speed_pp_std'].fillna(0)
    df_grouped['speed_tg_std'] = df_grouped['speed_tg_std'].fillna(0)

    # Tick values for the X axis. Sort first so that pruning every other tick
    # stays evenly spread when the input files cover different n_kv ranges.
    x_ticks = np.sort(df['n_kv'].unique())
    while len(x_ticks) > 16:
        x_ticks = x_ticks[::2]

    # Get unique labels and color map
    labels = df_grouped['label'].unique()
    colors = plt.cm.rainbow(np.linspace(0, 1, len(labels)))

    plot_metric(df_grouped, labels, colors, x_ticks,
                metric='speed_pp', marker='o',
                xlabel='Context Length (tokens)',
                ylabel='Prompt Processing Rate (t/s)',
                title='Prompt Processing Performance Comparison',
                out_file='performance_comparison_pp.png')

    plot_metric(df_grouped, labels, colors, x_ticks,
                metric='speed_tg', marker='s',
                xlabel='Context Length (n_kv)',
                ylabel='Token Generation Rate (t/s)',
                title='Token Generation Performance Comparison',
                out_file='performance_comparison_tg.png')


if __name__ == '__main__':
    main()
|
||||
189
examples/sweep-bench/sweep-bench.cpp
Normal file
189
examples/sweep-bench/sweep-bench.cpp
Normal file
@@ -0,0 +1,189 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "llama-vocab.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Prints a one-line example invocation via LOG_TEE; the argument count is not used.
static void print_usage(int /*argc*/, char ** argv) {
    const char * exe = argv[0];

    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n %s -m model.gguf -c 8192 -b 2048 -ub 512\n", exe);
    LOG_TEE("\n");
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// init LLM
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// initialize the model
|
||||
|
||||
llama_model_params model_params = llama_model_params_from_gpt_params(params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const unsigned int n_kv_max = llama_n_ctx(ctx);
|
||||
|
||||
|
||||
const llama_vocab * vocab = llama_get_vocab(ctx);
|
||||
llama_token bos = llama_token_bos_impl(*vocab);
|
||||
//llama_token eos = llama_token_eos_impl(*vocab);
|
||||
|
||||
const unsigned int n_vocab = llama_n_vocab(model);
|
||||
|
||||
// decode in batches of ctx_params.n_batch tokens
|
||||
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
};
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
if (ret != 0) {
|
||||
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
llama_synchronize(ctx);
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
const unsigned int pp = params.n_ubatch;
|
||||
const unsigned int tg = params.n_ubatch / 4;
|
||||
|
||||
if (!params.sweep_bench_output_jsonl) {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("|%6s | %6s | %6s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s");
|
||||
LOG_TEE("|%6s-|-%6s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "------", "--------", "--------", "--------", "--------");
|
||||
}
|
||||
|
||||
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
|
||||
|
||||
// warm up
|
||||
{
|
||||
llama_batch_add(batch, bos, 0, { 0 }, false);
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch_clear(batch);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) {
|
||||
// clean up KV cache before generation
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
|
||||
|
||||
// first measure token generation performance at this context size
|
||||
const auto t_tg_start = ggml_time_us();
|
||||
|
||||
for (unsigned int i = 0; i < tg; ++i) {
|
||||
llama_batch_clear(batch);
|
||||
llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const auto t_tg_end = ggml_time_us();
|
||||
|
||||
// clean up KV cache after generation
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
|
||||
|
||||
// prepare batch of pp size for prompt processing performance measurement
|
||||
llama_batch_clear(batch);
|
||||
|
||||
for (unsigned int i = 0; i < pp; ++i) {
|
||||
llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false);
|
||||
}
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
|
||||
// measure prompt processing performance
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto t_pp_end = ggml_time_us();
|
||||
|
||||
// calculate and print metrics
|
||||
const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
|
||||
const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
|
||||
|
||||
const float speed_pp = pp / t_pp;
|
||||
const float speed_tg = tg / t_tg;
|
||||
|
||||
if(params.sweep_bench_output_jsonl) {
|
||||
LOG_TEE(
|
||||
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
|
||||
"\"pp\": %d, \"tg\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f }\n",
|
||||
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
|
||||
pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg
|
||||
);
|
||||
} else {
|
||||
LOG_TEE("|%6d | %6d | %6d | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg);
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user