// mirror of https://github.com/kvcache-ai/ktransformers.git
// synced 2026-03-14 18:37:23 +00:00
#ifndef CPUINFER_OPERATOR_COMMON_HPP
#define CPUINFER_OPERATOR_COMMON_HPP

#include "../cpu_backend/worker_pool.h"
#include "ggml.h"

#if defined(__aarch64__) && defined(CPU_USE_KML)
#include <arm_sve.h>
#endif

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <initializer_list>
#include <map>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>

// #define FORWARD_TIME_PROFILE
// #define FORWARD_TIME_REPORT
// Release-build assertion: stays active in optimized builds (unlike assert()).
// On failure, prints the failing expression and `text` to stderr, then throws
// std::runtime_error so callers can catch and propagate the error.
#define ASSERT_RELEASE(x, text) \
  do { \
    if (!(x)) { \
      fprintf(stderr, "Assertion failed: %s, file %s, line %d\n", #x, __FILE__, __LINE__); \
      fprintf(stderr, "Error message: %s\n", (text)); \
      throw std::runtime_error((text)); \
    } \
  } while (0)

// Register a memory request: records the address of `ptr` plus a byte size.
// Expects an object named `mem_requests` (with an append_pointer member) to be
// in scope at the expansion site.
#define PUSH_MEM_REQ(ptr, size) mem_requests.append_pointer(&(ptr), (size))

// Record the microseconds elapsed since the previous stamp under `name`.
// Expects `time_map` and `last` in scope at the expansion site (TimePerf below
// provides both as members).
#define PROFILE_RECORD_TIME_STAMP(name) \
  do { \
    auto end_time = std::chrono::high_resolution_clock::now(); \
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - last).count(); \
    time_map[(name)] = duration; \
    last = end_time; \
  } while (0)

// Run a per-NUMA-node job that copies the expert remap table into each
// node-local operator (`this->tps[numa_id]`) and loads its weights.
// Expects `this` (with a `tps` member), `config`, and the given pool to be
// valid at the expansion site.
#define DO_TPS_LOAD_WEIGHTS(pool) \
  (pool)->dispense_backend()->do_numa_job([this, pool, config](int numa_id) { \
    this->tps[numa_id]->config_.physical_to_logical_map = config.physical_to_logical_map; \
    this->tps[numa_id]->load_weights(); \
  })
// Translate an expert index through an optional remap table: returns m[x]
// when a table `m` is supplied, otherwise the identity mapping x.
// Fully parenthesized so argument expressions such as `expert_map(base + off, i)`
// expand correctly (the original left `m` bare).
#define expert_map(m, x) ((m) != nullptr ? (m)[(x)] : (x))
// Integer ceiling division: smallest q such that q * y >= x (x >= 0, y > 0).
// Implemented as x / y + (x % y != 0) instead of (x + y - 1) / y so the
// intermediate sum cannot overflow (signed) or wrap (unsigned) when x is
// close to the type's maximum value.
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
inline T div_up(T x, T y) {
  return x / y + (x % y != T(0) ? T(1) : T(0));
}
// Advance `ptr` by `byte_offset` bytes (not elements), preserving the element type.
template <typename T>
T* offset_pointer(T* ptr, size_t byte_offset) {
  char* raw = reinterpret_cast<char*>(ptr);
  raw += byte_offset;
  return reinterpret_cast<T*>(raw);
}
// Byte distance from `ptr` to `b` (i.e. b - ptr); assumes b >= ptr and that
// both point into the same allocation.
// Uses char* pointer subtraction, which is well-defined within one object,
// instead of round-tripping pointer values through size_t.
template <typename T>
size_t pointer_offset(T* ptr, T* b) {
  return static_cast<size_t>(reinterpret_cast<const char*>(b) - reinterpret_cast<const char*>(ptr));
}
// Const overload of offset_pointer: advance a read-only pointer by a byte count.
template <typename T>
const T* offset_pointer(const T* ptr, size_t byte_offset) {
  const char* raw = reinterpret_cast<const char*>(ptr) + byte_offset;
  return reinterpret_cast<const T*>(raw);
}
// Address of element (row, col) in a row-major matrix whose rows are `ld`
// bytes apart.  Note the mixed units: `ld` is a byte stride while `col` is an
// element index.
template <typename T>
T* offset_pointer_row_major(T* t, int row, int col, size_t ld) {
  char* row_start = reinterpret_cast<char*>(t) + static_cast<size_t>(row) * ld;
  return reinterpret_cast<T*>(row_start) + col;
}
// Address of element (row, col) in a column-major matrix whose columns are
// `ld` bytes apart.  `ld` is a byte stride; `row` indexes elements within the
// column.
template <typename T>
T* offset_pointer_col_major(T* t, int row, int col, size_t ld) {
  char* col_start = reinterpret_cast<char*>(t) + static_cast<size_t>(col) * ld;
  return reinterpret_cast<T*>(col_start) + row;
}
// Lightweight forward-pass profiler used by the FORWARD_TIME_PROFILE /
// FORWARD_TIME_REPORT builds.  Intended as a protected mix-in base:
// forward_perf_start() resets the clock, PROFILE_RECORD_TIME_STAMP(name)
// (macro above) stores per-stage durations into time_map, and perf_report()
// prints the total plus a per-stage breakdown to stdout.
class TimePerf {
 protected:
  std::string time_perf_name;            // label printed at the start of each report
  std::map<std::string, long> time_map;  // stage name -> elapsed microseconds
  // End of the previous stage (updated by PROFILE_RECORD_TIME_STAMP).
  std::chrono::time_point<std::chrono::high_resolution_clock> last;
  // Start of the whole measured forward pass.
  std::chrono::time_point<std::chrono::high_resolution_clock> start_time;

  // Begin a new measurement window; subsequent stamps are relative to `last`.
  void forward_perf_start() {
    start_time = std::chrono::high_resolution_clock::now();
    last = start_time;
  }

  // Print the total forward time and every recorded stage with its share (%).
  void perf_report() {
    auto end_time = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
    std::string output = time_perf_name + ", forward time: " + std::to_string(duration.count()) + " us";
    // for (auto [name, t] : time_map) {
    //   double p = 100.0 * t / duration.count();
    //   // if (p < 1.0) {
    //   //   continue; // Skip if the percentage is less than 1%
    //   // }
    //   output += ", " + name + ": " + std::to_string(t) + " us(" + std::to_string(size_t(round(p))) + "%)";
    // }
    // Traverse in reverse (descending key order of the ordered map).
    for (auto it = time_map.rbegin(); it != time_map.rend(); ++it) {
      const std::string& name = it->first;
      long t = it->second;
      double p = 100.0 * t / duration.count();
      // if (p < 1.0) {
      //   continue; // Skip if the percentage is less than 1%
      // }
      output += ", " + name + ": " + std::to_string(t) + " us(" + std::to_string(size_t(round(p))) + "%)";
    }
    printf("%s\n", output.c_str());
  }
};
// Mixed-radix index helper for flattening nested loops into one task id.
// fold[i] is the size of dimension i; card[i] is the product of the sizes of
// dimensions i..end (card.back() == 1), so a flat id in [0, count()) can be
// decomposed into per-dimension coordinates with at(id, dim).
struct TaskCounter {
  std::vector<size_t> fold = {}, card = {};

  // Build a counter from an initial list of dimension sizes.
  TaskCounter(std::initializer_list<size_t> i) {
    card.push_back(1);
    for (auto j : i) {
      push_back(j);
    }
  }

  // Append one more (innermost) dimension of size `i`.
  void push_back(size_t i) {
    fold.push_back(i);
    for (auto& c : card) {
      c *= i;
    }
    card.push_back(1);
  }

  // Append several dimensions at once (const ref: avoids copying the vector).
  void push_back(const std::vector<size_t>& i) {
    for (auto j : i) {
      push_back(j);
    }
  }

  // Total number of tasks: product of all dimension sizes.
  size_t count() const { return card[0]; }

  // Coordinate of flat task `id` along dimension `which`.
  size_t at(size_t id, size_t which) const { return id % card.at(which) / card.at(which + 1); }
};
struct GeneralConfig {
|
|
size_t vocab_size;
|
|
size_t hidden_size;
|
|
|
|
size_t num_experts_per_tok;
|
|
size_t n_routed_experts;
|
|
size_t n_shared_experts;
|
|
size_t max_qlen = 4096;
|
|
|
|
void* lm_heads_ptr;
|
|
ggml_type lm_heads_type;
|
|
void* norm_weights_ptr;
|
|
ggml_type norm_weights_type;
|
|
void* token_embd_ptr;
|
|
ggml_type token_embd_type;
|
|
WorkerPool* pool = nullptr;
|
|
GeneralConfig() {}
|
|
};
|
|
|
|
struct GeneralMLAConfig {
|
|
size_t hidden_size;
|
|
size_t q_lora_rank;
|
|
size_t num_heads;
|
|
size_t nope_size;
|
|
size_t rope_size;
|
|
size_t kv_lora_rank;
|
|
|
|
int layer_idx = 0;
|
|
WorkerPool* pool = nullptr;
|
|
size_t token_count_in_page = 256; // token count in a page
|
|
size_t max_qlen = 1024;
|
|
size_t max_kvlen = 4096;
|
|
|
|
// rope
|
|
size_t max_position_embeddings;
|
|
double rope_scaling_factor = 1.0;
|
|
double rope_theta = 10000.0;
|
|
double rope_scaling_beta_fast;
|
|
double rope_scaling_beta_slow;
|
|
double rope_scaling_mscale;
|
|
double rope_scaling_mscale_all_dim;
|
|
double rope_scaling_original_max_position_embeddings;
|
|
|
|
void* q_a_proj;
|
|
void* q_a_norm = nullptr;
|
|
void* q_b_proj;
|
|
void* kv_a_proj_with_mqa;
|
|
void* kv_a_norm = nullptr;
|
|
void* kv_b_proj;
|
|
void* o_proj;
|
|
|
|
// for llamafile
|
|
ggml_type q_a_proj_type;
|
|
ggml_type q_a_norm_type;
|
|
ggml_type q_b_proj_type;
|
|
ggml_type kv_a_proj_with_mqa_type;
|
|
ggml_type kv_a_norm_type;
|
|
ggml_type kv_b_proj_type;
|
|
ggml_type w_o_type;
|
|
|
|
ggml_type input_type = GGML_TYPE_F32;
|
|
ggml_type output_type = GGML_TYPE_F32;
|
|
|
|
size_t m_block = 4;
|
|
size_t n_block = 4;
|
|
// for kvcache
|
|
size_t page_count = 200; // page count for kv cache
|
|
|
|
GeneralMLAConfig() {}
|
|
GeneralMLAConfig(size_t hidden_size, size_t q_lora_rank, size_t kv_lora_rank, size_t num_heads, size_t nope_size,
|
|
size_t rope_size)
|
|
: hidden_size(hidden_size),
|
|
q_lora_rank(q_lora_rank),
|
|
kv_lora_rank(kv_lora_rank),
|
|
num_heads(num_heads),
|
|
nope_size(nope_size),
|
|
rope_size(rope_size) {}
|
|
};
|
|
|
|
// Weight-quantization parameters attached to a MoE operator's config.
struct QuantConfig {
  std::string quant_method = "";  // quantization scheme name; empty presumably means unquantized — confirm with loader
  int bits = 0;                   // bit width per weight
  int group_size = 0;             // weights per quantization group
  bool zero_point = false;        // true if an explicit zero-point is stored
  bool per_channel = false;       // Per-channel quantization (GLM-4.7-FP8 style)
};
struct GeneralMOEConfig {
|
|
// Basic Config
|
|
int expert_num;
|
|
int num_experts_per_tok;
|
|
int hidden_size;
|
|
int intermediate_size;
|
|
|
|
int layer_idx = 0;
|
|
WorkerPool* pool = nullptr;
|
|
|
|
// SGLang offload
|
|
int num_gpu_experts = 0; // Computed from gpu_experts_mask
|
|
uint8_t* gpu_experts_mask = nullptr; // Bool mask: true = expert on GPU
|
|
void* physical_to_logical_map = nullptr;
|
|
|
|
// Compute num_gpu_experts from gpu_experts_mask
|
|
void compute_num_gpu_experts() {
|
|
num_gpu_experts = 0;
|
|
if (gpu_experts_mask) {
|
|
for (int i = 0; i < expert_num; i++) {
|
|
if (gpu_experts_mask[i]) num_gpu_experts++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if expert should be skipped (invalid, out of range, or on GPU)
|
|
inline bool should_skip_expert(int64_t expert_id) const {
|
|
return expert_id < 0 || expert_id >= expert_num || (gpu_experts_mask && gpu_experts_mask[expert_id]);
|
|
}
|
|
|
|
void* gate_proj;
|
|
void* up_proj;
|
|
void* down_proj;
|
|
|
|
void* gate_scale;
|
|
void* up_scale;
|
|
void* down_scale;
|
|
|
|
void* gate_zero;
|
|
void* up_zero;
|
|
void* down_zero;
|
|
|
|
QuantConfig quant_config;
|
|
|
|
// for amx
|
|
int max_len = 0;
|
|
std::vector<std::vector<void*>> gate_projs;
|
|
std::vector<std::vector<void*>> up_projs;
|
|
std::vector<std::vector<void*>> down_projs;
|
|
std::vector<std::vector<void*>> gate_scales;
|
|
std::vector<std::vector<void*>> up_scales;
|
|
std::vector<std::vector<void*>> down_scales;
|
|
std::vector<std::vector<void*>> gate_zeros;
|
|
std::vector<std::vector<void*>> up_zeros;
|
|
std::vector<std::vector<void*>> down_zeros;
|
|
|
|
std::string path;
|
|
bool save = false;
|
|
bool load = false;
|
|
|
|
// for llamafile
|
|
int m_block = 4;
|
|
int group_min_len = 0;
|
|
int group_max_len = 0;
|
|
int gate_type;
|
|
int up_type;
|
|
int down_type;
|
|
int hidden_type;
|
|
|
|
GeneralMOEConfig() {}
|
|
|
|
GeneralMOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size)
|
|
: expert_num(expert_num),
|
|
num_experts_per_tok(routed_expert_num),
|
|
hidden_size(hidden_size),
|
|
intermediate_size(intermediate_size) {}
|
|
|
|
int max_possible_qlen() { return std::max(max_len, group_max_len); }
|
|
};
|
|
|
|
struct GeneralGateConfig {
|
|
size_t hidden_size;
|
|
size_t num_experts_per_tok;
|
|
size_t n_routed_experts;
|
|
size_t n_group;
|
|
size_t topk_group;
|
|
|
|
bool norm_topk_prob = true;
|
|
float routed_scaling_factor = 2.5f;
|
|
|
|
std::string scoring_func = "sigmoid";
|
|
std::string topk_method = "noaux_tc";
|
|
|
|
int layer_idx = 0;
|
|
WorkerPool* pool = nullptr;
|
|
|
|
void* weight = nullptr;
|
|
ggml_type weight_type;
|
|
void* e_score_correction_bias = nullptr;
|
|
ggml_type e_score_correction_bias_type;
|
|
|
|
size_t max_seqlen = 25600;
|
|
|
|
GeneralGateConfig() = default;
|
|
|
|
GeneralGateConfig(int hidden_size, int num_experts_per_tok, int n_routed_experts, int n_group, int topk_group)
|
|
: hidden_size(hidden_size),
|
|
num_experts_per_tok(num_experts_per_tok),
|
|
n_routed_experts(n_routed_experts),
|
|
n_group(n_group),
|
|
topk_group(topk_group) {}
|
|
};
|
|
|
|
// Abstract interface for MLA attention operator implementations.
class MLA_Interface {
 public:
  // Virtual destructor: implementations are deleted through base pointers, so
  // the base must have one (deleting through a base without it is UB).
  virtual ~MLA_Interface() = default;

  // Run attention over a batch of sequences.
  //   qlens       - query length per sequence
  //   page_tables - per-sequence KV-cache page index lists
  //   kv_lens     - KV length per sequence
  //   input/output - packed activations; element type is implementation-defined
  virtual void forward(std::vector<int> qlens, std::vector<std::vector<int>> page_tables, std::vector<int> kv_lens,
                       const void* input, void* output) = 0;
};
// Abstract interface for MoE expert-FFN operator implementations.
class MoE_Interface {
 public:
  // Virtual destructor: implementations are deleted through base pointers, so
  // the base must have one (deleting through a base without it is UB).
  virtual ~MoE_Interface() = default;

  // Run the expert FFN for `qlen` tokens.
  //   k           - experts per token
  //   expert_ids  - qlen*k routed expert ids
  //   weights     - qlen*k routing weights
  //   incremental - implementation-specific partial-compute flag
  virtual void forward(int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input,
                       void* output, bool incremental = false) = 0;
};
inline void init_ggml() {
|
|
static bool inited = false;
|
|
if (inited) {
|
|
return;
|
|
}
|
|
struct ggml_init_params params = {
|
|
0,
|
|
NULL,
|
|
true,
|
|
};
|
|
|
|
auto ctx_eval = ggml_init(params);
|
|
|
|
if (!ctx_eval) {
|
|
throw std::runtime_error("Failed to create ggml context");
|
|
}
|
|
inited = true;
|
|
}
|
|
|
|
// Copy `count` elements from `src` into `dst`, converting element types when
// A != B.  Supported conversions (per the branches below): ggml_bf16_t -> float,
// ggml_fp16_t -> float, and float -> ggml_bf16_t; any other mismatched pair
// throws std::runtime_error at runtime.
template <typename A, typename B>
void convert_or_copy(A* dst, const B* src, size_t count) {
  if constexpr (std::is_same_v<A, B>) {
    // Identical types: a plain byte copy suffices.
    // printf("Direct copy\n");
    memcpy(dst, src, sizeof(A) * count);
  } else {
    if constexpr (std::is_same_v<A, float>) {
      if constexpr (std::is_same_v<B, ggml_bf16_t>) {
        // printf("Converting ggml_bf16_t to float\n");
        ggml_bf16_to_fp32_row(src, dst, count);
      } else if constexpr (std::is_same_v<B, ggml_fp16_t>) {
        ggml_fp16_to_fp32_row(src, dst, count);
      } else {
        throw std::runtime_error("Unsupported conversion");
      }
    } else if constexpr (std::is_same_v<A, ggml_bf16_t>) {
      if constexpr (std::is_same_v<B, float>) {
        // printf("Converting float to ggml_bf16_t\n");
        ggml_fp32_to_bf16_row(src, dst, count);
      } else {
        throw std::runtime_error("Unsupported conversion");
      }
    }

    // Destination type is neither float nor ggml_bf16_t.
    else {
      throw std::runtime_error("Unsupported conversion");
    }
  }
}
// Type-erased front end: dispatch on the runtime ggml_type tag of `src` and
// forward to the statically-typed convert_or_copy overload above.  Throws
// std::runtime_error for tags with no handler.
template <typename A>
void convert_or_copy(A* dst, void* src, ggml_type type, size_t count) {
  switch (type) {
    case GGML_TYPE_BF16: {
      auto src_bf16 = (ggml_bf16_t*)src;
      convert_or_copy(dst, src_bf16, count);
      break;
    }
    case GGML_TYPE_F16: {
#if defined(__aarch64__) && defined(CPU_USE_KML)
      // On the KML/SVE build, F16 data is reinterpreted as native float16_t.
      // NOTE(review): relies on a convert_or_copy instantiation that accepts
      // float16_t — confirm this resolves on that toolchain.
      auto src_fp16 = (float16_t*)src;
      convert_or_copy(dst, src_fp16, count);
#else
      throw std::runtime_error("GGML_TYPE_F16 is not supported on this platform");
#endif
      break;
    }
    case GGML_TYPE_F32: {
      auto src_f32 = (float*)src;
      convert_or_copy(dst, src_f32, count);
      break;
    }
    default:
      throw std::runtime_error("Unsupported type for conversion");
  }
}
// Debug helper: scan data[0, count) for NaN/Inf and throw std::runtime_error
// on the first hit (after printing the offending index); otherwise print a
// pass confirmation.  The pointer is now const-qualified (the scan never
// writes), which remains compatible with all existing callers.
template <typename A>
void check_numerics(const A* data, size_t count) {
  for (size_t i = 0; i < count; i++) {
    if (std::isnan(data[i]) || std::isinf(data[i])) {
      // Explicit cast: %f expects a double; A may be a type (e.g. an integer)
      // that default argument promotion would not convert correctly.
      printf("Numerics check failed at index %zu: value = %f\n", i, static_cast<double>(data[i]));
      throw std::runtime_error("Numerics check failed");
    }
  }
  printf("Numerics check passed for %zu elements.\n", count);
}
// Debug helper: print the first 10 bf16 values of `x` (converted to float),
// then a newline.  Assumes x points at >= 10 elements — TODO confirm at call
// sites.
inline void debug_bf16(ggml_bf16_t* x) {
  for (int i = 0; i < 10; i++) {
    printf("%f ", ggml_bf16_to_fp32(x[i]));
  }
  printf("\n");
}
// Debug helper: print the first 10 floats of `x`, then a newline.
// Assumes x points at >= 10 elements.
inline void debug_f32(float* x) {
  int idx = 0;
  while (idx < 10) {
    printf("%f ", x[idx]);
    ++idx;
  }
  printf("\n");
}
// Debug helper: print up to `count` floats from `x` — all of them when
// count < 10, otherwise the first three, an ellipsis, and the last three.
// Fixes two output bugs in the original: the short path never printed the
// trailing newline, and the ellipsis had no separating space.
inline void debug_f32(float* x, size_t count) {
  if (count < 10) {
    for (size_t i = 0; i < count; i++) {
      printf("%f ", x[i]);
    }
  } else {
    for (size_t i = 0; i < 3; i++) {
      printf("%f ", x[i]);
    }
    printf("... ");
    for (size_t i = count - 3; i < count; i++) {
      printf("%f ", x[i]);
    }
  }
  printf("\n");
}
#endif
|