Merge branch 'main' into andrewkchan/try_trellis

This commit is contained in:
Andrew Keen Chan
2025-05-20 06:48:14 +00:00
109 changed files with 20602 additions and 6765 deletions

View File

@@ -51,5 +51,6 @@ else()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(sweep-bench)
add_subdirectory(tokenize)
endif()

View File

@@ -19,6 +19,8 @@
#include <algorithm>
#include <cmath>
#include <fstream>
#include <optional>
#include <sstream>
#include <unordered_map>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -39,6 +41,7 @@ struct Stats {
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
int n_as = 1;
};
class IMatrixCollector {
@@ -48,13 +51,59 @@ public:
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * file_name);
void set_collect_lsim(bool yes_or_no) { m_collect_lsim = yes_or_no; }
void print_layer_importance();
private:
std::unordered_map<std::string, Stats> m_stats;
gpt_params m_params;
std::mutex m_mutex;
int m_last_call = 0;
int m_last_layer = 9999;
int m_last_ffn = -1;
std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
std::vector<float> m_last_input;
std::vector<float> m_ffn_input;
std::vector<std::pair<double,int>> m_layer_sim;
std::vector<std::pair<double,int>> m_attn_sim;
std::vector<std::pair<double,int>> m_ffn_sim;
bool m_collect_lsim = false;
// Extract the layer index encoded in a tensor name of the form "blk.<i>....".
// The model's output tensor is treated as belonging to the layer following the
// last one seen (the m_last_layer < 199 check is a sanity cap on layer count).
// Returns std::nullopt when no layer index can be derived from the name.
std::optional<int> layer_index(const std::string& name) const {
    if (name == m_params.output_tensor_name && m_last_layer < 199) {
        return m_last_layer + 1;
    }
    if (name.compare(0, 4, "blk.") != 0) return std::nullopt;
    auto dot = name.find('.', 4);
    if (dot == std::string::npos) return std::nullopt;
    std::istringstream parser(name.substr(4, dot - 4));
    int index;
    parser >> index;
    if (parser.fail()) return std::nullopt;
    return index;
}
// Cosine similarity between two n-dimensional float vectors x and y,
// accumulated in double precision. Returns 0 when either vector has zero
// norm, which avoids a division by zero for all-zero activations.
static inline double cosine_similarity(int n, const float * x, const float * y) {
    double sumxy = 0, sumx2 = 0, sumy2 = 0;
    for (int j = 0; j < n; ++j) {
        sumxy += x[j]*y[j]; sumx2 += x[j]*x[j]; sumy2 += y[j]*y[j];
    }
    // std::sqrt instead of plain sqrt: <cmath> is only guaranteed to declare
    // the std-qualified name.
    double cos_sim = sumx2 > 0 && sumy2 > 0 ? sumxy/std::sqrt(sumx2*sumy2) : 0;
    return cos_sim;
}
// Accumulate the cosine similarity of nrow row pairs (each row of length n)
// into p: p.first is the running sum of similarities, p.second the sample
// count. x and y point at the first rows of two row-major matrices.
static inline void collect_cos_similarity(int nrow, int n, const float * x, const float * y, std::pair<double, int>& p) {
    for (int row = 0; row < nrow; ++row) {
        const float * xr = x + (size_t)row * n;
        const float * yr = y + (size_t)row * n;
        p.first += cosine_similarity(n, xr, yr);
        ++p.second;
    }
}
static void print_layer_importance(const char * msg, const std::vector<std::pair<double, int>>& sim);
};
// remove any prefix and suffixes from the name
@@ -76,6 +125,45 @@ static std::string filter_tensor_name(const char * name) {
return wname;
}
// Print one importance table. Entries with at least one sample are ranked
// ascending by the absolute average cosine similarity, so the layers whose
// output differs most from their input (lowest similarity, i.e. the most
// "important" layers) are listed first. Does nothing if no data was gathered.
void IMatrixCollector::print_layer_importance(const char * msg, const std::vector<std::pair<double, int>>& sim) {
    if (sim.empty()) return;
    std::vector<std::pair<float, int>> ranked;
    ranked.reserve(sim.size());
    for (int layer = 0; layer < int(sim.size()); ++layer) {
        const auto& [sum, count] = sim[layer];
        if (count > 0) ranked.emplace_back(float(std::abs(sum/count)), layer);
    }
    if (ranked.empty()) return;
    std::sort(ranked.begin(), ranked.end());
    printf("%s\n", msg);
    int rank = 0;
    for (const auto& entry : ranked) {
        const int layer = entry.second;
        printf("%3d: Layer %3d, <cos_sim> = %g\n", rank++, layer, sim[layer].first/sim[layer].second);
    }
}
// Print the per-layer importance tables gathered while collecting the imatrix
// (enabled via -lsim / --layer-similarity): whole-layer, attention-only, and
// ffn-only cosine-similarity rankings. Tables with no data are skipped by the
// helper. The previously commented-out duplicate of the helper's body has
// been removed as dead code.
void IMatrixCollector::print_layer_importance() {
    print_layer_importance("\n======================== sorted layer importances", m_layer_sim);
    print_layer_importance("\n======================== sorted attention importances", m_attn_sim);
    print_layer_importance("\n======================== sorted ffn importances", m_ffn_sim);
}
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
GGML_UNUSED(user_data);
@@ -91,7 +179,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
//printf("wname = %s\n", wname.c_str());
if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == m_params.output_tensor_name))) return false;
if (!(wname.substr(0, 4) == "blk." || ((m_params.process_output || m_collect_lsim) && wname == m_params.output_tensor_name))) return false;
return true;
}
@@ -107,6 +195,33 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
if (m_collect_lsim) {
if (wname.find(".ffn_") != std::string::npos) {
if (auto index = layer_index(wname); index.has_value() && *index == m_last_layer && *index != m_last_ffn) {
int n = src1->ne[0];
int nrow = t->op == GGML_OP_MUL_MAT_ID ? src1->ne[2] : src1->ne[1];
if (t->op == GGML_OP_MUL_MAT_ID) {
GGML_ASSERT(src1->ne[1] == 1);
}
if (m_ffn_input.empty()) {
m_ffn_input.resize(nrow*n);
} else {
if ((int)m_ffn_input.size() != nrow*n) {
printf("Oops, inconsistent ffn size\n"); exit(1);
}
}
std::memcpy(m_ffn_input.data(), data, nrow*n*sizeof(float));
if (m_ffn_input.size() != m_last_input.size()) {
printf("Oops, inconsistent ffn vs last_input size\n"); exit(1);
}
if (m_attn_sim.size() < *index + 1) m_attn_sim.resize(*index + 1);
auto& p = m_attn_sim[*index];
collect_cos_similarity(nrow, n, m_ffn_input.data(), m_last_input.data(), p);
m_last_ffn = *index;
}
}
}
// this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggerganov/llama.cpp/pull/6387
if (t->op == GGML_OP_MUL_MAT_ID) {
@@ -132,11 +247,15 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (e.values.empty()) {
e.values.resize(src1->ne[0]*n_as, 0);
e.counts.resize(src1->ne[0]*n_as, 0);
e.n_as = n_as;
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ABORT("fatal error");
}
else if (e.n_as != n_as) {
fprintf(stderr, "Oops: inconsistent n_as for %s (%d vs %d)\n", wname.c_str(), e.n_as, n_as);
}
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
}
@@ -177,6 +296,39 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
}
} else {
if (m_collect_lsim) {
// We only need to do it here and not in the MoE branch above because the first tensor in a layer
// never is a MoE tensor
if (auto index = layer_index(wname); index.has_value()) {
if (*index != m_last_layer) {
if (*index > 0) {
if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
printf("Oops: different size (%d vs %d). Tensor name was %s, m_last_layer = %d\n",
(int)(src1->ne[0]*src1->ne[1]), (int)m_last_input.size(), src0->name, m_last_layer);
exit(1);
}
if (*index > m_layer_sim.size()) m_layer_sim.resize(*index);
auto& p = m_layer_sim[*index - 1];
collect_cos_similarity(src1->ne[1], src1->ne[0], m_last_input.data(), (const float *)data, p);
if (*index == m_last_ffn + 1) {
if (*index > m_ffn_sim.size()) m_ffn_sim.resize(*index);
auto& p1 = m_ffn_sim[*index-1];
collect_cos_similarity(src1->ne[1], src1->ne[0], m_ffn_input.data(), (const float *)data, p1);
}
}
m_last_layer = *index;
if (m_last_input.empty()) {
m_last_input.resize(src1->ne[0]*src1->ne[1]);
} else {
if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
printf("Oops\n"); exit(1);
}
}
//printf("Copying src1 to m_last_input\n");
std::memcpy(m_last_input.data(), data, src1->ne[0]*src1->ne[1]*sizeof(float));
}
}
}
auto & e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
@@ -190,7 +342,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
for (int row = 0; row < (int)src1->ne[1]; ++row) {
for (int row = 0; row < (int)(src1->ne[1]*src1->ne[2]); ++row) {
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
@@ -258,8 +410,38 @@ void IMatrixCollector::save_imatrix(int ncall) const {
}
if (n_zeros > 0) {
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
continue;
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%)", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
bool store_it = false;
if (kv.second.n_as > 1) {
int n_per_expert = n_all / kv.second.n_as;
std::vector<int> bad_experts;
bad_experts.reserve(kv.second.n_as);
for (int i = 0; i < kv.second.n_as; ++i) {
auto counts = kv.second.counts.data() + i*n_per_expert;
int nz_i = 0;
for (int j = 0; j < n_per_expert; ++j) {
if (counts[j] == 0) ++nz_i;
}
if (nz_i > 0) bad_experts.push_back(i);
}
fprintf(stderr, " %d out of %d experts are missing data", int(bad_experts.size()), kv.second.n_as);
if (bad_experts.size() < round(kv.second.n_as * 0.05)) {
fprintf(stderr, " Storing **but be aware**\n");
store_it = true;
for (auto i : bad_experts) {
auto counts = (int *)kv.second.counts.data() + i*n_per_expert;
auto values = (float *)kv.second.values.data() + i*n_per_expert;
for (int j = 0; j < n_per_expert; ++j) {
counts[j] = 1;
values[j] = 1;
}
}
}
}
if (!store_it) {
fprintf(stderr, " - skipping\n");
continue;
}
}
n_entries++;
@@ -587,7 +769,25 @@ int main(int argc, char ** argv) {
params.logits_all = true;
params.verbosity = 1;
if (!gpt_params_parse(argc, argv, params)) {
bool lsim = false;
//
// Do not pollute common with totally imatrix specific arguments as it was done in mainline.
// Instead, parse imatrix specific args here, push unknown args into a new array of args,
// and pass that to gpt_params_parse().
//
std::vector<char*> args;
args.reserve(argc);
args.push_back(argv[0]);
for (int i = 1; i < argc; ++i) {
std::string arg{argv[i]};
if (arg == "-lsim" || arg == "--layer-similarity") {
lsim = true;
} else {
args.push_back(argv[i]);
}
}
if (!gpt_params_parse(args.size(), args.data(), params)) {
print_usage(argc, argv, params);
return 1;
}
@@ -595,6 +795,7 @@ int main(int argc, char ** argv) {
params.n_batch = std::min(params.n_batch, params.n_ctx);
g_collector.set_params(params);
g_collector.set_collect_lsim(lsim);
for (const auto & in_file : params.in_files) {
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
@@ -645,6 +846,7 @@ int main(int argc, char ** argv) {
}
g_collector.save_imatrix();
g_collector.print_layer_importance();
llama_print_timings(ctx);

View File

@@ -1,3 +1,10 @@
//
// Copyright (C) 2023-2025 The llama.cpp authors
// Copyright (C) 2024-2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#include <algorithm>
#include <array>
#include <cassert>
@@ -41,6 +48,12 @@ static uint64_t get_time_ns() {
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
}
// Stream a std::pair as "{first, second}" (used by join() for printing
// thread-count pairs and similar parameter values).
template <typename T1, typename T2>
std::ostream& operator<<(std::ostream& out, const std::pair<T1, T2>& item) {
    return out << '{' << item.first << ", " << item.second << '}';
}
template<class T>
static std::string join(const std::vector<T> & values, const std::string & delim) {
std::ostringstream str;
@@ -215,6 +228,9 @@ static std::string pair_str(const std::pair<int, int> & p) {
return buf;
}
// Ser = Smart Expert Reduction
using Ser = std::pair<int,float>;
struct cmd_params {
std::vector<std::string> model;
std::vector<int> n_prompt;
@@ -225,21 +241,27 @@ struct cmd_params {
std::vector<int> n_ubatch;
std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
std::vector<std::pair<int,int>> n_threads;
std::vector<int> n_gpu_layers;
std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
std::vector<bool> flash_attn;
std::vector<int> mla_attn;
std::vector<int> attn_max_batch;
std::vector<Ser> ser;
std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap;
std::vector<bool> embeddings;
std::vector<llama_model_tensor_buft_override> buft_overrides;
ggml_numa_strategy numa;
int reps;
bool verbose;
bool warmup;
bool repack = false;
bool fmoe = false;
bool use_thp = false;
output_formats output_format;
output_formats output_format_stderr;
};
@@ -254,21 +276,27 @@ static const cmd_params cmd_params_defaults = {
/* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()},
/* n_threads */ {{cpu_get_num_math(), cpu_get_num_math()}},
/* n_gpu_layers */ {99},
/* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
/* flash_attn */ {false},
/* mla_attn */ {0},
/* attn_max_batch */ {0},
/* ser */ {{-1,0.0f}},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
/* embeddings */ {false},
/* buft_overrides */ {},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* verbose */ false,
/* warmup */ true,
/* repack */ false,
/* use_thp */ false,
/* fmoe */ false,
/* output_format */ MARKDOWN,
/* output_format_stderr */ NONE,
};
@@ -288,12 +316,16 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -tgb, --threads-gen-batch <n1,n2> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mla, --mla-attn <0|1|2> (default: %s)\n", join(cmd_params_defaults.mla_attn, ",").c_str());
printf(" -amb, --attn-max-batch <i> (default: %s)\n", join(cmd_params_defaults.attn_max_batch, ",").c_str());
printf(" -ser, --smart-expert-reduction <i,f>(default: %s)\n", join(cmd_params_defaults.attn_max_batch, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
@@ -304,6 +336,9 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf(" -w, --warmup <0|1> (default: %s)\n", cmd_params_defaults.warmup ? "1" : "0");
printf(" -rtr, --run-time-repack <0|1> (default: %s)\n", cmd_params_defaults.repack ? "1" : "0");
printf(" -thp, --transparent-huge-pages <0|1> (default: %s)\n", cmd_params_defaults.use_thp? "1" : "0");
printf(" -ot, --override-tensor pattern (default: none)\n");
printf(" -fmoe, --fused-moe <0|1> (default: %s)\n", cmd_params_defaults.fmoe? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
}
@@ -336,10 +371,68 @@ static ggml_type ggml_type_from_name(const std::string & s) {
if (s == "q6_0") {
return GGML_TYPE_Q6_0;
}
if (s == "q8_KV") {
return GGML_TYPE_Q8_KV;
}
return GGML_TYPE_COUNT;
}
namespace {
// Parse a comma separated list of "tensor_name=buffer_type" entries and append
// them to `overrides`. The available buffer types are enumerated once from the
// registered backends. Returns false (after printing diagnostics to stderr) on
// a malformed entry or an unknown buffer type name.
bool parse_buft_overrides(const std::string& value, std::vector<llama_model_tensor_buft_override>& overrides) {
    /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
    if (buft_list.empty()) {
        // enumerate all the devices and add their buffer types to the list
        for (size_t i = 0; i < ggml_backend_reg_get_count(); ++i) {
            if (auto * buft = ggml_backend_reg_get_default_buffer_type(i)) {
                buft_list[ggml_backend_buft_name(buft)] = buft;
            }
        }
    }
    for (const auto & entry : string_split<std::string>(value, ',')) {
        auto eq = entry.find('=');
        if (eq == std::string::npos) {
            fprintf(stderr, "Invalid buft override argument %s\n", value.c_str());
            return false;
        }
        auto tensor_name = entry.substr(0, eq);
        auto buffer_type = entry.substr(eq + 1);
        auto it = buft_list.find(buffer_type);
        if (it == buft_list.end()) {
            fprintf(stderr, "Available buffer types:\n");
            for (const auto & kv : buft_list) {
                fprintf(stderr, " %s\n", ggml_backend_buft_name(kv.second));
            }
            return false;
        }
        // NOTE(review): the strdup'ed name appears to be intentionally leaked;
        // the override list presumably lives for the lifetime of the program.
        overrides.push_back({strdup(tensor_name.c_str()), it->second});
    }
    return true;
}
template<class T1, class T2>
std::vector<std::pair<T1,T2>> string_split_pairs(const std::string & str, char delim) {
std::vector<std::pair<T1,T2>> values;
std::istringstream str_stream(str);
std::string token;
T1 first_value;
int i = 0;
while (std::getline(str_stream, token, delim)) {
std::istringstream token_stream(token);
if (i%2 == 0) {
token_stream >> first_value;
if (token_stream.fail()) return {};
} else {
T2 value;
token_stream >> value;
if (token_stream.fail()) return {};
values.emplace_back(first_value, value);
}
i++;
}
return values;
}
}
static cmd_params parse_cmd_params(int argc, char ** argv) {
cmd_params params;
@@ -459,7 +552,23 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
params.n_threads.reserve(params.n_threads.size() + p.size());
for (auto t : p) params.n_threads.push_back({t, t});
//params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-tgb" || arg == "--threads-gen-batch") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto ps = string_split<std::string>(argv[i], ';');
for (auto& s : ps) {
auto p = string_split<int>(s.c_str(), ',');
if (p.size() != 2) {
invalid_param = true;
break;
}
params.n_threads.push_back({p[0], p[1]});
}
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
@@ -526,6 +635,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<bool>(argv[i], split_delim);
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
} else if (arg == "-mla" || arg == "--mla-attn") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.mla_attn.insert(params.mla_attn.end(), p.begin(), p.end());
} else if (arg == "-amb" || arg == "--attn-max-batch") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.attn_max_batch.insert(params.attn_max_batch.end(), p.begin(), p.end());
} else if (arg == "-ser" || arg == "--smart-expert-reduction") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split_pairs<int,float>(argv[i], split_delim);
params.ser.insert(params.ser.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
@@ -594,6 +724,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
params.repack = std::stoi(argv[i]);
} else if (arg == "-thp" || arg == "--transparent-huge-pages") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.use_thp = std::stoi(argv[i]);
} else if (arg == "-fmoe" || arg == "--fused-moe") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.fmoe = std::stoi(argv[i]);
} else if (arg == "-ot" || arg == "--override-tensor") {
if (++i >= argc) {
invalid_param = true;
break;
}
if (!parse_buft_overrides(std::string{argv[i]}, params.buft_overrides)) {
fprintf(stderr, "error: Invalid tensor buffer type override: %s\n", argv[i]);
invalid_param = true;
break;
}
} else {
invalid_param = true;
break;
@@ -621,10 +773,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
if (params.mla_attn.empty()) { params.mla_attn = cmd_params_defaults.mla_attn; }
if (params.attn_max_batch.empty()){ params.attn_max_batch = cmd_params_defaults.attn_max_batch; }
if (params.ser.empty()) { params.ser = cmd_params_defaults.ser; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
if (!params.buft_overrides.empty()) params.buft_overrides.emplace_back(llama_model_tensor_buft_override{nullptr, nullptr});
return params;
}
@@ -649,17 +805,23 @@ struct cmd_params_instance {
int n_ubatch;
ggml_type type_k;
ggml_type type_v;
int n_threads;
std::pair<int,int> n_threads;
int n_gpu_layers;
std::string rpc_servers;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool flash_attn;
int mla_attn;
int attn_max_batch;
Ser ser;
std::vector<float> tensor_split;
bool use_mmap;
bool embeddings;
bool repack = false;
bool fmoe = false;
bool use_thp = false;
const llama_model_tensor_buft_override* buft_overrides;
llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params();
@@ -673,6 +835,8 @@ struct cmd_params_instance {
mparams.tensor_split = tensor_split.data();
mparams.use_mmap = use_mmap;
mparams.repack_tensors = repack;
mparams.use_thp = use_thp;
mparams.tensor_buft_overrides = buft_overrides;
return mparams;
}
@@ -685,6 +849,7 @@ struct cmd_params_instance {
main_gpu == other.main_gpu &&
use_mmap == other.use_mmap &&
repack == other.repack &&
use_thp == other.use_thp &&
tensor_split == other.tensor_split;
}
@@ -698,6 +863,11 @@ struct cmd_params_instance {
cparams.type_v = type_v;
cparams.offload_kqv = !no_kv_offload;
cparams.flash_attn = flash_attn;
cparams.mla_attn = mla_attn;
cparams.attn_max_batch = attn_max_batch;
cparams.fused_moe_up_gate = fmoe;
cparams.min_experts = ser.first;
cparams.thresh_experts = ser.second;
cparams.embeddings = embeddings;
return cparams;
@@ -722,6 +892,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & tv : params.type_v)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn)
for (const auto & mla : params.mla_attn)
for (const auto & amb : params.attn_max_batch)
for (const auto & ser : params.ser)
for (const auto & nt : params.n_threads) {
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
@@ -743,10 +916,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .mla_attn = */ mla,
/* .attn_max_b = */ amb,
/* .ser = */ ser,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .repack = */ params.repack,
/* .fmoe = */ params.fmoe,
/* .use_thp = */ params.use_thp,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
}
@@ -771,10 +950,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .mla_attn = */ mla,
/* .attn_max_b = */ amb,
/* .ser = */ ser,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .repack = */ params.repack,
/* .fmoe = */ params.fmoe,
/* .use_thp = */ params.use_thp,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
}
@@ -799,10 +984,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .mla_attn = */ mla,
/* .attn_max_b = */ amb,
/* .ser = */ ser,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .repack = */ params.repack,
/* .fmoe = */ params.fmoe,
/* .use_thp = */ params.use_thp,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
}
@@ -827,10 +1018,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .mla_attn = */ mla,
/* .attn_max_b = */ amb,
/* .ser = */ ser,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
/* .repack = */ params.repack,
/* .fmoe = */ params.fmoe,
/* .use_thp = */ params.use_thp,
/* .buft_overrides=*/ params.buft_overrides.data(),
};
instances.push_back(instance);
}
@@ -857,7 +1054,7 @@ struct test {
uint64_t model_n_params;
int n_batch;
int n_ubatch;
int n_threads;
std::pair<int,int> n_threads;
bool has_rpc;
ggml_type type_k;
ggml_type type_v;
@@ -866,10 +1063,15 @@ struct test {
int main_gpu;
bool no_kv_offload;
bool flash_attn;
int mla_attn;
int attn_max_batch;
Ser ser;
std::vector<float> tensor_split;
bool use_mmap;
bool embeddings;
bool repack = false;
bool fmoe = false;
bool use_thp = false;
int n_prompt;
int n_gen;
std::string test_time;
@@ -895,10 +1097,15 @@ struct test {
main_gpu = inst.main_gpu;
no_kv_offload = inst.no_kv_offload;
flash_attn = inst.flash_attn;
mla_attn = inst.mla_attn;
attn_max_batch = inst.attn_max_batch;
ser = inst.ser;
tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap;
embeddings = inst.embeddings;
repack = inst.repack;
fmoe = inst.fmoe;
use_thp = inst.use_thp;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
test_kind = inst.test_kind;
@@ -988,8 +1195,8 @@ struct test {
"n_batch", "n_ubatch",
"n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn",
"tensor_split", "use_mmap", "embeddings", "repack",
"main_gpu", "no_kv_offload", "flash_attn", "mla_attn", "attn_max_batch", "ser",
"tensor_split", "use_mmap", "embeddings", "repack", "fused_moe", "use_thp",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts", "test",
@@ -1004,13 +1211,14 @@ struct test {
field == "n_threads" ||
field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" ||
field == "n_prompt" || field == "n_gen" || field == "mla_attn" || field == "attn_max_batch" ||
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack") {
field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack" || field == "use_thp" ||
field == "fused_moe") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
@@ -1035,6 +1243,12 @@ struct test {
tensor_split_str += "/";
}
}
auto ser_to_string = [] (const Ser& ser) {
std::ostringstream str;
str << ser.first << ',' << ser.second;
return str.str();
};
bool is_gen = n_gen > 0;
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
@@ -1042,10 +1256,12 @@ struct test {
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch),
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(is_gen ? n_threads.first : n_threads.second), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), std::to_string(repack),
std::to_string(mla_attn), std::to_string(attn_max_batch), ser_to_string(ser),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
std::to_string(repack), std::to_string(fmoe), std::to_string(use_thp),
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts()),
@@ -1208,12 +1424,27 @@ struct markdown_printer : public printer {
if (field == "flash_attn") {
return 2;
}
if (field == "mla_attn") {
return 3;
}
if (field == "attn_max_batch") {
return 5;
}
if (field == "ser") {
return 10;
}
if (field == "use_mmap") {
return 4;
}
if (field == "repack") {
return 3;
}
if (field == "use_thp") {
return 3;
}
if (field == "fused_moe") {
return 4;
}
if (field == "test") {
return 13;
}
@@ -1242,12 +1473,27 @@ struct markdown_printer : public printer {
if (field == "flash_attn") {
return "fa";
}
if (field == "mla_attn") {
return "mla";
}
if (field == "attn_max_batch") {
return "amb";
}
if (field == "attn_max_batch") {
return "ser";
}
if (field == "use_mmap") {
return "mmap";
}
if (field == "repack") {
return "rtr";
}
if (field == "use_thp") {
return "thp";
}
if (field == "fused_moe") {
return "fmoe";
}
if (field == "embeddings") {
return "embd";
}
@@ -1294,6 +1540,15 @@ struct markdown_printer : public printer {
if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
fields.emplace_back("flash_attn");
}
if (params.mla_attn.size() > 1 || params.mla_attn != cmd_params_defaults.mla_attn) {
fields.emplace_back("mla_attn");
}
if (params.attn_max_batch.size() > 1 || params.attn_max_batch != cmd_params_defaults.mla_attn) {
fields.emplace_back("attn_max_batch");
}
if (params.ser.size() > 1 || params.ser != cmd_params_defaults.ser) {
fields.emplace_back("ser");
}
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
fields.emplace_back("tensor_split");
}
@@ -1306,6 +1561,12 @@ struct markdown_printer : public printer {
if (params.repack != cmd_params_defaults.repack) {
fields.emplace_back("repack");
}
if (params.use_thp != cmd_params_defaults.use_thp) {
fields.emplace_back("use_thp");
}
if (params.fmoe != cmd_params_defaults.fmoe) {
fields.emplace_back("fused_moe");
}
fields.emplace_back("test");
fields.emplace_back("t/s");
@@ -1557,10 +1818,10 @@ int main(int argc, char ** argv) {
if (params.warmup) {
if (t.n_prompt > 0) {
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
test_prompt(ctx, 1, 0, t.n_batch, t.n_threads.second);
}
if (t.n_gen > 0) {
test_gen(ctx, 1, 0, t.n_threads);
test_gen(ctx, 1, 0, t.n_threads.first);
}
}
@@ -1570,11 +1831,11 @@ int main(int argc, char ** argv) {
uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) {
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads.second);
}
if (t.test_kind == TEST_KIND_GP) t_start = get_time_ns();
if (t.n_gen > 0) {
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads.first);
}
uint64_t t_ns = get_time_ns() - t_start;

View File

@@ -1,3 +1,10 @@
//
// Copyright (C) 2023-2025 The llama.cpp authors
// Copyright (C) 2024-2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#include "common.h"
#include "llama.h"
@@ -126,7 +133,7 @@ static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob
max_logit = std::max(max_logit, logits[i]);
min_logit = std::min(min_logit, logits[i]);
}
min_logit = std::max(min_logit, max_logit - 16);
min_logit = std::max(min_logit, max_logit - 24);
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
@@ -166,7 +173,7 @@ static void process_logits(
break;
}
lock.unlock();
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
const results_log_softmax results = log_softmax(n_vocab, logits + int64_t(i)*n_vocab, tokens[i+1]);
const double v = -results.log_softmax;
local_nll += v;
local_nll2 += v*v;
@@ -200,7 +207,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
break;
}
lock.unlock();
const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
const double v = log_softmax(n_vocab, logits + int64_t(i)*n_vocab, log_probs.data() + int64_t(i)*nv, tokens[i+1]);
local_nll += v;
local_nll2 += v*v;
}
@@ -618,7 +625,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
if (num_batches > 1 && n_outputs > 0) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
logits.insert(logits.end(), batch_logits, batch_logits + int64_t(n_outputs) * n_vocab);
}
}

View File

@@ -1,3 +1,10 @@
//
// Copyright (C) 2023-2025 The llama.cpp authors
// Copyright (C) 2024-2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#define LLAMA_API_INTERNAL
#include "common.h"
#include "ggml.h"

View File

@@ -1,3 +1,10 @@
//
// Copyright (C) 2023-2025 The llama.cpp authors
// Copyright (C) 2024-2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//
#include "common.h"
#include "llama.h"
@@ -58,6 +65,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_0_R4", LLAMA_FTYPE_MOSTLY_Q5_0_R4, " 5.50 bpw quantization", },
{ "Q6_0_R4", LLAMA_FTYPE_MOSTLY_Q6_0_R4, " 6.50 bpw quantization", },
{ "Q8_0_R8", LLAMA_FTYPE_MOSTLY_Q8_0_R8, " 8.50 bpw quantization", },
{ "Q8_KV", LLAMA_FTYPE_MOSTLY_Q8_KV, " 8.00 bpw quantization", },
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
{ "IQ4_KS", LLAMA_FTYPE_MOSTLY_IQ4_KS, " 4.25 bpw non-linear quantization", },
{ "IQ4_KS_R4",LLAMA_FTYPE_MOSTLY_IQ4_KS_R4,"IQ4_KS repacked", },
@@ -85,6 +93,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
{ "Q6_K_R4", LLAMA_FTYPE_MOSTLY_Q6_K_R4, "Q6_K repacked", },
{ "Q8_K_R8", LLAMA_FTYPE_MOSTLY_Q8_K_R8, "Q8_K repacked", },
{ "Q8_KV_R8", LLAMA_FTYPE_MOSTLY_Q8_KV_R8, "Q8_KV repacked", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
@@ -136,15 +145,19 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
printf(" --hide-imatrix: do not store imatrix details in the quantized model\n");
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --exclude-weights tensor_name: do not use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor.\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token_embd.weight tensor.\n\n");
printf(" --custom-q regex1=type1,regex2=type2...: use this to specify custom quantization type rules.\n\n");
printf(" --repack Repack all tensors to the corresponding _r4/8 variant if available.\n\n");
printf(" --repack-pattern Comma separated list of regexs to use for matching tensor names to be repacked.\n\n");
printf("Additional specific tensor quantization types used in the custom quant scheme 'CQS (default is Q2_K):\n");
printf(" --attn-q-type ggml_type: use this ggml_type for the attn_q.weight tensor.\n");
printf(" --attn-k-type ggml_type: use this ggml_type for the attn_k.weight tensor.\n");
@@ -291,6 +304,28 @@ static ggml_type parse_ggml_type(const char * arg) {
return result;
}
using CustomQ = std::pair<std::string, ggml_type>;
static bool parse_custom_quants(const std::string& arg, std::vector<CustomQ>& custom_quants) {
for (const auto & item : string_split<std::string>(arg, ',')) {
auto pos = item.find('=');
if (pos == std::string::npos) {
fprintf(stderr, "Invalid custom quantization input %s\n", arg.c_str());
return false;
}
auto pattern = item.substr(0, pos);
auto type_as_string = item.substr(pos + 1);
auto type = parse_ggml_type(type_as_string.c_str());
if (type == GGML_TYPE_COUNT) {
fprintf(stderr, "Invalid quantization type '%s' in custom quantization input %s\n", type_as_string.c_str(), item.c_str());
return false;
}
printf("Adding custom rule %s -> %s\n", pattern.c_str(), ggml_type_name(type));
custom_quants.emplace_back(std::move(pattern), type);
}
return true;
}
int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@@ -302,12 +337,26 @@ int main(int argc, char ** argv) {
std::string imatrix_file;
std::vector<std::string> included_weights, excluded_weights;
std::vector<llama_model_kv_override> kv_overrides;
std::vector<CustomQ> custom_quants;
std::vector<std::string> repack_patterns;
bool hide_imatrix = false;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) {
params.ignore_imatrix_rules = true;
} else if (strcmp(argv[arg_idx], "--repack") == 0) {
params.only_repack = true;
} else if (strcmp(argv[arg_idx], "--repack-pattern") == 0) {
if (arg_idx < argc-1) {
auto p = string_split(argv[++arg_idx], ',');
repack_patterns.insert(repack_patterns.end(), p.begin(), p.end());
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
if (arg_idx < argc-1) {
params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
@@ -372,6 +421,10 @@ int main(int argc, char ** argv) {
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--custom-q") == 0) {
if (arg_idx == argc-1 || !parse_custom_quants(argv[++arg_idx], custom_quants)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
@@ -382,6 +435,8 @@ int main(int argc, char ** argv) {
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--hide-imatrix") == 0) {
hide_imatrix = true;
} else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
if (arg_idx < argc-1) {
included_weights.emplace_back(argv[++arg_idx]);
@@ -401,6 +456,10 @@ int main(int argc, char ** argv) {
}
}
if (!repack_patterns.empty()) {
params.repack_pattern = &repack_patterns;
}
if (argc - arg_idx < 2) {
printf("%s: bad arguments\n", argv[0]);
usage(argv[0]);
@@ -418,7 +477,11 @@ int main(int argc, char ** argv) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
strncpy(kvo.val_str, imatrix_file.c_str(), 127);
if (hide_imatrix) {
strncpy(kvo.val_str, "top_secret", 127);
} else {
strncpy(kvo.val_str, imatrix_file.c_str(), 127);
}
kvo.val_str[127] = '\0';
kv_overrides.emplace_back(std::move(kvo));
}
@@ -426,7 +489,11 @@ int main(int argc, char ** argv) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
if (hide_imatrix) {
strncpy(kvo.val_str, "top_secret", 127);
} else {
strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
}
kvo.val_str[127] = '\0';
kv_overrides.emplace_back(std::move(kvo));
}
@@ -435,7 +502,11 @@ int main(int argc, char ** argv) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.val_i64 = imatrix_data.size();
if (hide_imatrix) {
kvo.val_i64 = 0;
} else {
kvo.val_i64 = imatrix_data.size();
}
kv_overrides.emplace_back(std::move(kvo));
}
@@ -443,7 +514,11 @@ int main(int argc, char ** argv) {
llama_model_kv_override kvo;
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
kvo.val_i64 = m_last_call;
if (hide_imatrix) {
kvo.val_i64 = 0;
} else {
kvo.val_i64 = m_last_call;
}
kv_overrides.emplace_back(std::move(kvo));
}
}
@@ -452,6 +527,9 @@ int main(int argc, char ** argv) {
kv_overrides.back().key[0] = 0;
params.kv_overrides = &kv_overrides;
}
if (!custom_quants.empty()) {
params.custom_quants = &custom_quants;
}
llama_backend_init();

View File

@@ -0,0 +1,5 @@
# Build and install the sweep-bench example binary.
set(TARGET llama-sweep-bench)
add_executable(${TARGET} sweep-bench.cpp)
# Link against the shared example code, the core library, and threads.
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
install(TARGETS ${TARGET} RUNTIME)

View File

@@ -0,0 +1,65 @@
# ik_llama.cpp/example/sweep-bench
Benchmark the prompt processing and token generation performance of `ik_llama.cpp`
by doing a sweep over a whole context size and gathering performance metrics
in each ubatch-sized window. Only a single token sequence is used.
The benchmark steps are:
for each ubatch-sized window in context:
1. generate ubatch/4 tokens (not the whole window to save some time)
2. measure generation performance
3. remove generated tokens from KV cache
4. prepare a ubatch-sized batch of random tokens
5. process prepared batch
6. measure prompt processing performance
The purpose of the benchmark is to visualize how the performance changes with
the context size without averaging the metrics values over the whole context.
## Usage
./llama-sweep-bench -c 8704 -ub 512 -m models/Meta-Llama-3.2-3B-Instruct-Q8_0.gguf
## Sample results
- `PP` - prompt tokens per ubatch
- `TG` - generated tokens per ubatch
- `N_KV` - current KV cache size
- `T_PP` - prompt processing time (i.e. time to first token)
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
- `T_TG` - time to generate all batches
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
| PP | TG | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s |
|-------|--------|--------|----------|----------|----------|----------|
| 512 | 128 | 0 | 1.100 | 465.51 | 2.311 | 55.38 |
| 512 | 128 | 512 | 1.183 | 432.97 | 1.895 | 67.55 |
| 512 | 128 | 1024 | 1.305 | 392.38 | 2.071 | 61.81 |
| 512 | 128 | 1536 | 1.279 | 400.42 | 2.164 | 59.14 |
| 512 | 128 | 2048 | 1.571 | 325.96 | 2.280 | 56.14 |
| 512 | 128 | 2560 | 1.431 | 357.87 | 2.418 | 52.94 |
| 512 | 128 | 3072 | 1.515 | 337.93 | 2.566 | 49.88 |
| 512 | 128 | 3584 | 1.588 | 322.34 | 2.722 | 47.03 |
| 512 | 128 | 4096 | 1.675 | 305.70 | 2.864 | 44.69 |
| 512 | 128 | 4608 | 1.769 | 289.50 | 2.999 | 42.68 |
| 512 | 128 | 5120 | 1.845 | 277.48 | 3.102 | 41.26 |
| 512 | 128 | 5632 | 1.893 | 270.46 | 3.219 | 39.76 |
| 512 | 128 | 6144 | 1.953 | 262.20 | 3.348 | 38.23 |
| 512 | 128 | 6656 | 2.018 | 253.71 | 3.474 | 36.84 |
| 512 | 128 | 7168 | 2.078 | 246.34 | 3.589 | 35.66 |
| 512 | 128 | 7680 | 2.140 | 239.22 | 3.717 | 34.43 |
| 512 | 128 | 8192 | 2.196 | 233.15 | 3.854 | 33.21 |
### JSONL output
Pass `--output-format jsonl` to output JSONL instead of Markdown, à la
```json lines
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 0, "t_pp": 1.093814, "speed_pp": 468.086884, "t_tg": 1.780312, "speed_tg": 71.897514 }
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 512, "t_pp": 1.169302, "speed_pp": 437.868073, "t_tg": 1.897474, "speed_tg": 67.458099 }
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 1024, "t_pp": 1.183700, "speed_pp": 432.542053, "t_tg": 2.059179, "speed_tg": 62.160694 }
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 1536, "t_pp": 1.428625, "speed_pp": 358.386566, "t_tg": 2.160639, "speed_tg": 59.241734 }
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 2048, "t_pp": 1.360647, "speed_pp": 376.291595, "t_tg": 2.274003, "speed_tg": 56.288403 }
```

View File

@@ -0,0 +1,118 @@
"""Plot sweep-bench results.

Reads one or more Markdown result tables produced by llama-sweep-bench
(passed as positional arguments), averages repeated runs per context
size, and writes two PNGs: performance_comparison_pp.png (prompt
processing) and performance_comparison_tg.png (token generation).
"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import argparse

parser = argparse.ArgumentParser()
# Each positional argument is one Markdown table file; its path is used
# as the series label in the plots.
parser.add_argument('file', nargs='+')

args = parser.parse_args()

df = None

# Alternative JSONL ingestion path, kept for reference (currently the
# script parses the Markdown table output instead).
#for jsonl_file in args.file:
#    # Read JSONL file into DataFrame
#    df_part = pd.read_json(jsonl_file, lines=True)
#    df_part['label'] = jsonl_file
#    if df is None:
#        df = df_part
#    else:
#        df = pd.concat([df, df_part])
#

for md_file in args.file:
    # Read markdown table file into DataFrame; the regex separator
    # swallows the surrounding whitespace of each '|' delimiter, and
    # skiprows=[1] drops the |---|---| alignment row.
    df_part = pd.read_csv(md_file, sep=r'\s*\|\s*', engine='python',
                          header=0, skiprows=[1])

    # Clean up columns (remove empty columns from markdown formatting,
    # i.e. the leading/trailing '|' of every row)
    df_part = df_part.iloc[:, 1:-1]
    df_part.columns = [col.strip() for col in df_part.columns]

    # Rename columns to match expected names
    df_part = df_part.rename(columns={
        'N_KV': 'n_kv',
        'S_PP t/s': 'speed_pp',
        'S_TG t/s': 'speed_tg'
    })

    # Convert to numeric types
    df_part['n_kv'] = pd.to_numeric(df_part['n_kv'])
    df_part['speed_pp'] = pd.to_numeric(df_part['speed_pp'])
    df_part['speed_tg'] = pd.to_numeric(df_part['speed_tg'])

    # Add label and append to main DataFrame
    df_part['label'] = md_file
    df = pd.concat([df, df_part]) if df is not None else df_part

# Group by label and n_kv, calculate mean and std for both speed metrics
# (multiple input files with the same label, or repeated n_kv rows, are
# averaged and get error bars).
df_grouped = df.groupby(['label', 'n_kv']).agg({
    'speed_pp': ['mean', 'std'],
    'speed_tg': ['mean', 'std']
}).reset_index()

# Flatten multi-index columns
df_grouped.columns = ['label', 'n_kv', 'speed_pp_mean', 'speed_pp_std',
                      'speed_tg_mean', 'speed_tg_std']

# Replace NaN with 0 (std for a single sample is NaN)
df_grouped['speed_pp_std'] = df_grouped['speed_pp_std'].fillna(0)
df_grouped['speed_tg_std'] = df_grouped['speed_tg_std'].fillna(0)

# Prepare ticks values for X axis (prune by halving until readable)
x_ticks = df['n_kv'].unique()
while len(x_ticks) > 16:
    x_ticks = x_ticks[::2]

# Get unique labels and color map (one color per input file)
labels = df_grouped['label'].unique()
colors = plt.cm.rainbow(np.linspace(0, 1, len(labels)))

# Create prompt processing plot
plt.figure(figsize=(10, 6))
ax1 = plt.gca()
plt.grid()
ax1.set_xticks(x_ticks)

# Plot each label's data
for label, color in zip(labels, colors):
    label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv')
    pp = ax1.errorbar(label_data['n_kv'], label_data['speed_pp_mean'],
                      yerr=label_data['speed_pp_std'], color=color,
                      marker='o', linestyle='-', label=label)

# Add labels and title
ax1.set_xlabel('Context Length (tokens)')
ax1.set_ylabel('Prompt Processing Rate (t/s)')
plt.title('Prompt Processing Performance Comparison')
ax1.legend(loc='upper right')

# Adjust layout and save
plt.tight_layout()
plt.savefig('performance_comparison_pp.png', bbox_inches='tight')
plt.close()

# Create token generation plot
plt.figure(figsize=(10, 6))
ax1 = plt.gca()
plt.grid()
ax1.set_xticks(x_ticks)

# Plot each model's data
for label, color in zip(labels, colors):
    label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv')
    tg = ax1.errorbar(label_data['n_kv'], label_data['speed_tg_mean'],
                      yerr=label_data['speed_tg_std'], color=color,
                      marker='s', linestyle='-', label=label)

# Add labels and title
ax1.set_xlabel('Context Length (n_kv)')
ax1.set_ylabel('Token Generation Rate (t/s)')
plt.title('Token Generation Performance Comparison')
ax1.legend(loc='upper right')

# Adjust layout and save
plt.tight_layout()
plt.savefig('performance_comparison_tg.png', bbox_inches='tight')
plt.close()

View File

@@ -0,0 +1,189 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "llama-vocab.h"
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <string>
#include <vector>
// Print a short usage example; called when command-line parsing fails.
static void print_usage(int, char ** argv) {
    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n %s -m model.gguf -c 8192 -b 2048 -ub 512\n", argv[0]);
    LOG_TEE("\n");
}
// Sweep benchmark: for each ubatch-sized window across the whole context,
// measure token-generation (TG) throughput, then prompt-processing (PP)
// throughput, and print one Markdown-table row (or JSONL record) per window.
int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        print_usage(argc, argv);
        return 1;
    }

    // init LLM
    llama_backend_init();
    llama_numa_init(params.numa);

    // initialize the model
    llama_model_params model_params = llama_model_params_from_gpt_params(params);

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

    const unsigned int n_kv_max = llama_n_ctx(ctx); // context size = sweep range

    const llama_vocab * vocab = llama_get_vocab(ctx);
    llama_token bos = llama_token_bos_impl(*vocab);
    //llama_token eos = llama_token_eos_impl(*vocab);

    const unsigned int n_vocab = llama_n_vocab(model);

    // decode in batches of ctx_params.n_batch tokens; a view avoids copying
    // the batch data for each sub-range
    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

            llama_batch batch_view = {
                n_tokens,
                batch.token    + i,
                nullptr,
                batch.pos      + i,
                batch.n_seq_id + i,
                batch.seq_id   + i,
                batch.logits   + i,
            };

            const int ret = llama_decode(ctx, batch_view);
            if (ret != 0) {
                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                return false;
            }

            // wait for the device so the timing below reflects finished work
            llama_synchronize(ctx);
        }

        return true;
    };

    const unsigned int pp = params.n_ubatch;
    // only generate ubatch/4 tokens per window to keep the run time down
    const unsigned int tg = params.n_ubatch / 4;

    if (!params.sweep_bench_output_jsonl) {
        // print the Markdown table header once up front
        LOG_TEE("\n");
        LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
        LOG_TEE("\n");
        LOG_TEE("|%6s | %6s | %6s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s");
        LOG_TEE("|%6s-|-%6s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "------", "--------", "--------", "--------", "--------");
    }

    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);

    // warm up: decode a single BOS token so one-time init cost is not measured
    {
        llama_batch_add(batch, bos, 0, { 0 }, false);

        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }
    }

    llama_batch_clear(batch);
    llama_kv_cache_clear(ctx);

    // sweep: at each step the KV cache already holds n_kv tokens from the
    // previous PP pass
    for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) {
        // clean up KV cache before generation
        llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);

        // first measure token generation performance at this context size
        const auto t_tg_start = ggml_time_us();

        for (unsigned int i = 0; i < tg; ++i) {
            // one random token per decode, logits requested (generation-style)
            llama_batch_clear(batch);
            llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);

            if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                LOG_TEE("%s: llama_decode() failed\n", __func__);
                return 1;
            }
        }

        const auto t_tg_end = ggml_time_us();

        // clean up KV cache after generation (discard the generated tokens)
        llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);

        // prepare batch of pp size for prompt processing performance measurement
        llama_batch_clear(batch);

        for (unsigned int i = 0; i < pp; ++i) {
            llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false);
        }
        // request logits only for the final token of the window
        batch.logits[batch.n_tokens - 1] = true;

        // measure prompt processing performance
        const auto t_pp_start = ggml_time_us();

        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }

        const auto t_pp_end = ggml_time_us();

        // calculate and print metrics
        const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
        const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;

        const float speed_pp = pp / t_pp;
        const float speed_tg = tg / t_tg;

        if(params.sweep_bench_output_jsonl) {
            LOG_TEE(
                "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
                "\"pp\": %d, \"tg\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f }\n",
                n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
                pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg
            );
        } else {
            LOG_TEE("|%6d | %6d | %6d | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg);
        }
    }

    llama_batch_free(batch);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    return 0;
}