/** * @Description : * @Author : Jianwei Dong * @Date : 2024-08-26 22:47:06 * @Version : 1.0.0 * @LastEditors : Jianwei Dong * @LastEditTime : 2024-08-26 22:47:06 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. **/ #include #include "ggml-impl.h" #include "kvcache.h" std::string ggml_type_to_string(ggml_type type) { switch (type) { case GGML_TYPE_F32: return "GGML_TYPE_F32"; case GGML_TYPE_F16: return "GGML_TYPE_F16"; case GGML_TYPE_Q4_0: return "GGML_TYPE_Q4_0"; case GGML_TYPE_Q8_0: return "GGML_TYPE_Q8_0"; } return "UNDIFINED"; } std::string AnchorTypeToString(AnchorType type) { switch (type) { case AnchorType::DYNAMIC: return "DYNAMIC"; case AnchorType::BLOCK_MEAN: return "BLOCK_MEAN"; case AnchorType::BLOCK_MAX: return "BLOCK_MAX"; case AnchorType::FIXED_ANCHOR: return "FIXED_ANCHOR"; case AnchorType::QUEST: return "QUEST"; } return "UNDIFINED"; } std::string RetrievalTypeToString(RetrievalType type) { switch (type) { case RetrievalType::LAYER: return "SHARED"; case RetrievalType::KVHEAD: return "SEPARATE"; case RetrievalType::QHEAD: return "INDIVIDUAL"; } return "UNDIFINED"; } KVCacheConfig::KVCacheConfig(int layer_num, int kv_head_num, int q_head_num, int head_dim, int block_len, int anchor_num, AnchorType anchor_type, ggml_type kv_type, RetrievalType retrieval_type, int layer_step, int token_step, int layer_offset, int max_block_num, int max_batch_size, int max_thread_num) : layer_num(layer_num), kv_head_num(kv_head_num), q_head_num(q_head_num), head_dim(head_dim), block_len(block_len), anchor_num(anchor_num), anchor_type(anchor_type), kv_type(kv_type), retrieval_type(retrieval_type), layer_step(layer_step), token_step(token_step), layer_offset(layer_offset), max_block_num(max_block_num), max_batch_size(max_batch_size), max_thread_num(max_thread_num) { printf( "layer_num: %d, kv_head_num: %d, q_head_num: %d, head_dim: %d, " "block_len: %d, anchor_num: %d, anchor_type: %s, kv_type: %s, " "retrieval_type: %s, layer_step: %d, token_step: %d, layer_offset: %d," "max_block_num: %d, max_batch_size: %d, max_thread_num: %d\n", layer_num, kv_head_num, q_head_num, head_dim, block_len, anchor_num, AnchorTypeToString(anchor_type).c_str(), ggml_type_to_string(kv_type).c_str(), RetrievalTypeToString(retrieval_type).c_str(), layer_step, token_step, layer_offset, max_block_num, max_batch_size, max_thread_num); assert(q_head_num % kv_head_num == 0); } KVCache::KVCache(KVCacheConfig config) { this->config_ = config; n_gqa_ = config_.q_head_num / config_.kv_head_num; if (config_.kv_type == ggml_type::GGML_TYPE_F16) { // TODO: Elegant implement k_cache_fp16_.resize(config_.layer_num); v_cache_fp16_.resize(config_.layer_num); selected_blocks_num_history_.resize(config_.layer_num / config_.layer_step); if (config_.retrieval_type == RetrievalType::LAYER) { selected_blocks_history_.resize(config_.layer_num / config_.layer_step); } else if (config_.retrieval_type == RetrievalType::KVHEAD) { selected_blocks_history_kvhead_.resize(config_.layer_num / config_.layer_step); } else if (config_.retrieval_type == RetrievalType::QHEAD) { } } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) { k_cache_q4.resize(config.layer_num); v_cache_q4.resize(config.layer_num); } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) { k_cache_q8.resize(config.layer_num); v_cache_q8.resize(config.layer_num); } else { assert(false); } anchor_.resize(config.layer_num * config.max_block_num * config.anchor_num * config.q_head_num * config.head_dim); importance_.resize(config.layer_num); past_block_num_.resize(config.layer_num); for (int i = 0; i < config.layer_num; i++) { past_block_num_[i] = 0; } ThreadResize(config.max_thread_num); BatchResize(config.max_batch_size); BlockResize(config.max_block_num); q_fp32.resize(n_gqa_ * config.head_dim); } void KVCache::ThreadResize(int thread_num) { thread_local_output_q8_0_.resize(thread_num); thread_local_attn_score_.resize(thread_num); thread_local_output_fp32_.resize(thread_num); thread_local_attn_lse_.resize(thread_num); thread_local_cur_output_fp32_.resize(thread_num); thread_local_cur_attn_lse_.resize(thread_num); thread_local_draft_.resize(thread_num); thread_cur_head_idx_.resize(thread_num); thread_local_attn_mask_.resize(thread_num); for (int i = 0; i < thread_num; i++) { thread_local_output_q8_0_[i].resize(n_gqa_ * config_.head_dim / QK8_0); thread_local_attn_score_[i].resize(n_gqa_ * config_.block_len); thread_local_output_fp32_[i].resize(n_gqa_ * config_.head_dim); thread_local_attn_lse_[i].resize(n_gqa_); thread_local_cur_output_fp32_[i].resize(n_gqa_ * config_.head_dim); thread_local_cur_attn_lse_[i].resize(n_gqa_); thread_local_draft_[i].resize(2 * n_gqa_ * config_.block_len + 6 * n_gqa_ * config_.head_dim + 2 * config_.block_len * config_.head_dim + config_.block_len * config_.head_dim / QK4_0); thread_local_attn_mask_[i].resize(config_.block_len / 8); } } void KVCache::BatchResize(int batch_size) { mutex_.resize(batch_size); q_q8_0_.resize(batch_size); q_fp32_.resize(batch_size); output_fp32_.resize(batch_size); attn_lse_.resize(batch_size); block_lse_.resize(batch_size); attn_sparsity_.resize(batch_size); if (config_.retrieval_type == RetrievalType::LAYER) { block_table_before_retrieval_.resize(batch_size); block_table_after_retrieval_.resize(batch_size); for (int i = 0; i < config_.layer_num / config_.layer_step; i++) { selected_blocks_history_[i].resize(batch_size); } } else if (config_.retrieval_type == RetrievalType::KVHEAD) { block_table_before_retrieval_kvhead_.resize(batch_size); block_table_after_retrieval_kvhead_.resize(batch_size); for (int i = 0; i < config_.layer_num / config_.layer_step; i++) { selected_blocks_history_kvhead_[i].resize(batch_size); } } else if (config_.retrieval_type == RetrievalType::QHEAD) { block_table_before_retrieval_qhead_.resize(batch_size); block_table_after_retrieval_qhead_.resize(batch_size); } cache_seqlens_.resize(batch_size); if (config_.retrieval_type == RetrievalType::LAYER) { block_similar_.resize(batch_size); } else if (config_.retrieval_type == RetrievalType::KVHEAD) { block_similar_kv_head_.resize(batch_size); } else if (config_.retrieval_type == RetrievalType::QHEAD) { block_similar_q_head_.resize(batch_size); } for (int i = 0; i < batch_size; i++) { top_similar_block_.resize(batch_size); mutex_[i].resize(config_.kv_head_num); q_q8_0_[i].resize(config_.kv_head_num); q_fp32_[i].resize(config_.kv_head_num); output_fp32_[i].resize(config_.kv_head_num); attn_lse_[i].resize(config_.kv_head_num); for (int j = 0; j < config_.kv_head_num; j++) { if (!mutex_[i][j]) { mutex_[i][j] = std::make_unique(); } q_q8_0_[i][j].resize(n_gqa_ * config_.head_dim / QK8_0); q_fp32_[i][j].resize(n_gqa_ * config_.head_dim); output_fp32_[i][j].resize(n_gqa_ * config_.head_dim); attn_lse_[i][j].resize(n_gqa_); } } avg_q.resize(batch_size); avg_q_fp16.resize(batch_size); for (int i = 0; i < batch_size; i++) { attn_sparsity_[i].resize(config_.q_head_num); avg_q[i].resize(config_.q_head_num * config_.head_dim); avg_q_fp16[i].resize(config_.q_head_num * config_.head_dim); } } void KVCache::BlockResize(int max_block_num) { sin_.resize(max_block_num * config_.block_len); cos_.resize(max_block_num * config_.block_len); for (int i = 0; i < max_block_num * config_.block_len; i++) { sin_[i].resize(config_.head_dim); cos_[i].resize(config_.head_dim); } for (int i = 0; i < config_.layer_num / config_.layer_step; i++) { for (int j = 0; j < config_.max_batch_size; j++) { if (config_.retrieval_type == RetrievalType::LAYER) { selected_blocks_history_[i][j].resize(max_block_num); } else if (config_.retrieval_type == RetrievalType::KVHEAD) { selected_blocks_history_kvhead_[i][j].resize(max_block_num); for (int k = 0; k < config_.max_block_num; k++) { selected_blocks_history_kvhead_[i][j][k].resize(config_.kv_head_num); } } else if (config_.retrieval_type == RetrievalType::QHEAD) { } } } for (int layer_id = 0; layer_id < config_.layer_num; layer_id++) { importance_[layer_id].resize(max_block_num); if (config_.kv_type == ggml_type::GGML_TYPE_F16) { // TODO: Elegant implement k_cache_fp16_[layer_id].resize(config_.kv_head_num); v_cache_fp16_[layer_id].resize(config_.kv_head_num); for (int i = 0; i < config_.kv_head_num; i++) { k_cache_fp16_[layer_id][i].resize(max_block_num); v_cache_fp16_[layer_id][i].resize(max_block_num); for (int j = 0; j < max_block_num; j++) { k_cache_fp16_[layer_id][i][j].resize(config_.block_len * config_.head_dim); v_cache_fp16_[layer_id][i][j].resize(config_.block_len * config_.head_dim); } } } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) { k_cache_q4[layer_id].resize(config_.kv_head_num); v_cache_q4[layer_id].resize(config_.kv_head_num); for (int i = 0; i < config_.kv_head_num; i++) { k_cache_q4[layer_id][i].resize(max_block_num); v_cache_q4[layer_id][i].resize(max_block_num); for (int j = 0; j < max_block_num; j++) { k_cache_q4[layer_id][i][j].resize(config_.block_len * config_.head_dim / 32); v_cache_q4[layer_id][i][j].resize(config_.block_len * config_.head_dim / 32); } } } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) { k_cache_q8[layer_id].resize(config_.kv_head_num); v_cache_q8[layer_id].resize(config_.kv_head_num); for (int i = 0; i < config_.kv_head_num; i++) { k_cache_q8[layer_id][i].resize(max_block_num); v_cache_q8[layer_id][i].resize(max_block_num); for (int j = 0; j < max_block_num; j++) { k_cache_q8[layer_id][i][j].resize(config_.block_len * config_.head_dim / 32); v_cache_q8[layer_id][i][j].resize(config_.block_len * config_.head_dim / 32); } } } else { assert(false); } for (int i = 0; i < config_.max_batch_size; i++) { if (config_.retrieval_type == RetrievalType::LAYER) { block_similar_[i].resize(max_block_num); block_table_before_retrieval_[i].resize(max_block_num); block_table_after_retrieval_[i].resize(max_block_num); } else if (config_.retrieval_type == RetrievalType::KVHEAD) { block_similar_kv_head_[i].resize(max_block_num); block_table_before_retrieval_kvhead_[i].resize(max_block_num); block_table_after_retrieval_kvhead_[i].resize(max_block_num); for (int j = 0; j < max_block_num; j++) { block_similar_kv_head_[i][j].resize(config_.kv_head_num); block_table_before_retrieval_kvhead_[i][j].resize(config_.kv_head_num); block_table_after_retrieval_kvhead_[i][j].resize(config_.kv_head_num); } } else if (config_.retrieval_type == RetrievalType::QHEAD) { block_similar_q_head_[i].resize(max_block_num); block_table_before_retrieval_qhead_[i].resize(max_block_num); block_table_after_retrieval_qhead_[i].resize(max_block_num); for (int j = 0; j < max_block_num; j++) { block_similar_q_head_[i][j].resize(config_.q_head_num); block_table_before_retrieval_qhead_[i][j].resize(config_.q_head_num); block_table_after_retrieval_qhead_[i][j].resize(config_.q_head_num); } } block_lse_[i].resize(max_block_num); for (int j = 0; j < max_block_num; j++) { block_lse_[i][j].resize(config_.q_head_num); } } for (int i = 0; i < max_block_num; i++) { importance_[layer_id][i].resize(config_.block_len); for (int j = 0; j < config_.block_len; j++) { importance_[layer_id][i][j].resize(config_.q_head_num); } } } } void KVCache::calc_anchor_all_layers(int* block_table, int* cache_seqlens, int batch_size, int max_block_num, WorkerPool* backend) { // Timer start auto start = std::chrono::high_resolution_clock::now(); // Each task updates the importance of a certain block seq_len_ = config_.block_len; backend->do_work_stealing_job( config_.layer_num * batch_size * max_block_num, nullptr, [&](int task_id) { int layer_id = task_id / (batch_size * max_block_num); int batch_id = (task_id / max_block_num) % batch_size; int block_id = task_id % max_block_num; // If the block is out of the sequence length, skip it. In // particular, the last block of the sequence that is shorter than // the block length should be skipped. if (cache_seqlens[batch_id] / config_.block_len < block_id) { return; } int block_idx = block_table[batch_id * max_block_num + block_id]; std::vector block_fp32(32); if (config_.anchor_type == AnchorType::DYNAMIC) { // clear anchor_ for (int anchor_id = 0; anchor_id < 1; anchor_id++) { for (int head_id = 0; head_id < config_.q_head_num; head_id++) { for (int l = 0; l < config_.head_dim; l++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + anchor_id * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = 0; } } } // find top anchor_num importances and their corresponding // positions in the importance_ tensor // TODO: Move top_importances to the class member to avoid // repeated memory allocation std::priority_queue>, std::vector>>, std::greater<>> top_importances; for (int head_id = 0; head_id < config_.q_head_num; head_id++) { for (int k = 0; k < seq_len_; k++) { top_importances.push(std::make_pair(GGML_FP16_TO_FP32(importance_[layer_id][block_idx][k][head_id]), std::make_pair(block_idx, k))); // TODO: change to config_ item if (top_importances.size() > config_.anchor_num) { top_importances.pop(); } } // fill anchor_ for (int l = 0; l < config_.head_dim; l++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = 0; } for (int k = 0; k < config_.anchor_num; k++) { int top_indice = top_importances.top().second.second; int top_block_idx = top_importances.top().second.first; if (config_.kv_type == ggml_type::GGML_TYPE_F16) { for (int l = 0; l < config_.head_dim; l++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + top_block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = GGML_FP32_TO_FP16( GGML_FP16_TO_FP32( anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + top_block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l]) + GGML_FP16_TO_FP32(k_cache_fp16_[layer_id][head_id / n_gqa_][top_block_idx] [top_indice * config_.head_dim + l])); } } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) { for (int l = 0; l < config_.head_dim / 32; l++) { block_q4_0 block = k_cache_q4[layer_id][head_id / n_gqa_][top_block_idx][top_indice * config_.head_dim / 32 + l]; dequantize_row_q4_0(&block, block_fp32.data(), 32); for (int m = 0; m < 32; m++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + top_block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m] = GGML_FP32_TO_FP16( block_fp32[m] / 4 + GGML_FP16_TO_FP32( anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + top_block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m])); } } } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) { for (int l = 0; l < config_.head_dim / 32; l++) { block_q8_0 block = k_cache_q8[layer_id][head_id / n_gqa_][top_block_idx][top_indice * config_.head_dim / 32 + l]; dequantize_row_q8_0(&block, block_fp32.data(), 32); for (int m = 0; m < 32; m++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + top_block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m] = GGML_FP32_TO_FP16( block_fp32[m] / 4 + GGML_FP16_TO_FP32( anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + top_block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m])); } } } top_importances.pop(); } } } else if (config_.anchor_type == AnchorType::BLOCK_MEAN) { // clear anchor_ for (int anchor_id = 0; anchor_id < config_.anchor_num; anchor_id++) { for (int head_id = 0; head_id < config_.q_head_num; head_id++) { for (int l = 0; l < config_.head_dim; l++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + anchor_id * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = 0; } } } // fill anchor_ if (config_.kv_type == ggml_type::GGML_TYPE_F16) { for (int head_id = 0; head_id < config_.q_head_num; head_id++) { for (int k = 0; k < config_.block_len; k++) { for (int l = 0; l < config_.head_dim; l++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = GGML_FP32_TO_FP16( GGML_FP16_TO_FP32( anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l]) + GGML_FP16_TO_FP32( k_cache_fp16_[layer_id][head_id / n_gqa_][block_idx][k * config_.head_dim + l]) / config_.block_len); } } } } } else if (config_.anchor_type == AnchorType::BLOCK_MAX) { // clear anchor_ for (int anchor_id = 0; anchor_id < config_.anchor_num; anchor_id++) { for (int head_id = 0; head_id < config_.q_head_num; head_id++) { for (int l = 0; l < config_.head_dim; l++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + anchor_id * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = 0; } } } // fill anchor_ if (config_.kv_type == ggml_type::GGML_TYPE_F16) { for (int head_id = 0; head_id < config_.q_head_num; head_id++) { for (int k = 0; k < config_.block_len; k++) { for (int l = 0; l < config_.head_dim; l++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = GGML_FP32_TO_FP16(std::max( GGML_FP16_TO_FP32( anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l]), GGML_FP16_TO_FP32( k_cache_fp16_[layer_id][head_id / n_gqa_][block_idx][k * config_.head_dim + l]))); } } } } } else if (config_.anchor_type == AnchorType::FIXED_ANCHOR) { // clear anchor_ for (int anchor_id = 0; anchor_id < 1; anchor_id++) { for (int head_id = 0; head_id < config_.q_head_num; head_id++) { for (int l = 0; l < config_.head_dim; l++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + anchor_id * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = 0; } } } // fill anchor_ if (config_.kv_type == ggml_type::GGML_TYPE_F16) { int stride = config_.block_len / config_.anchor_num; for (int head_id = 0; head_id < config_.q_head_num; head_id++) { for (int k = 0, tot = 0; k < config_.block_len, tot < config_.anchor_num; k += stride, tot++) { for (int l = 0; l < config_.head_dim; l++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = GGML_FP32_TO_FP16( GGML_FP16_TO_FP32( anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l]) + GGML_FP16_TO_FP32( k_cache_fp16_[layer_id][head_id / n_gqa_][block_idx][k * config_.head_dim + l]) / config_.anchor_num); } } } } } else if (config_.anchor_type == AnchorType::QUEST) { // clear anchor_ for (int head_id = 0; head_id < config_.q_head_num; head_id++) { for (int l = 0; l < config_.head_dim; l++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = GGML_FP32_TO_FP16(std::numeric_limits::max()); anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = GGML_FP32_TO_FP16(std::numeric_limits::min()); } } // fill anchor_ if (config_.kv_type == ggml_type::GGML_TYPE_F16) { for (int indice = 0; indice < seq_len_; indice++) { for (int head_id = 0; head_id < config_.kv_head_num; head_id++) { for (int l = 0; l < config_.head_dim; l++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = GGML_FP32_TO_FP16(std::max( GGML_FP16_TO_FP32(k_cache_fp16_[layer_id][head_id][block_idx][indice * config_.head_dim + l]), GGML_FP16_TO_FP32( anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l]))); anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l] = GGML_FP32_TO_FP16(std::min( GGML_FP16_TO_FP32(k_cache_fp16_[layer_id][head_id][block_idx][indice * config_.head_dim + l]), GGML_FP16_TO_FP32( anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l]))); } } } } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) { for (int indice = 0; indice < seq_len_; indice++) { for (int head_id = 0; head_id < config_.kv_head_num; head_id++) { for (int l = 0; l < config_.head_dim / 32; l++) { block_q4_0 block = k_cache_q4[layer_id][head_id][block_idx][indice * config_.head_dim / 32 + l]; dequantize_row_q4_0(&block, block_fp32.data(), 32); for (int m = 0; m < 32; m++) { for (int gqa_idx = 0; gqa_idx < n_gqa_; gqa_idx++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m] = GGML_FP32_TO_FP16(std::max( block_fp32[m], GGML_FP16_TO_FP32( anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m]))); anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m] = GGML_FP32_TO_FP16(std::min( block_fp32[m], GGML_FP16_TO_FP32( anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m]))); } } } } } } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) { for (int indice = 0; indice < seq_len_; indice++) { for (int head_id = 0; head_id < config_.kv_head_num; head_id++) { for (int l = 0; l < config_.head_dim / 32; l++) { block_q8_0 block = k_cache_q8[layer_id][head_id][block_idx][indice * config_.head_dim / 32 + l]; dequantize_row_q8_0(&block, block_fp32.data(), 32); for (int m = 0; m < 32; m++) { for (int gqa_idx = 0; gqa_idx < n_gqa_; gqa_idx++) { anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m] = GGML_FP32_TO_FP16(std::max( block_fp32[m], GGML_FP16_TO_FP32( anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 0 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m]))); anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m] = GGML_FP32_TO_FP16(std::min( block_fp32[m], GGML_FP16_TO_FP32( anchor_[layer_id * config_.max_block_num * config_.anchor_num * config_.q_head_num * config_.head_dim + block_idx * config_.anchor_num * config_.q_head_num * config_.head_dim + 1 * config_.q_head_num * config_.head_dim + head_id * config_.head_dim + l * 32 + m]))); } } } } } } } else { assert(false); } }, nullptr); // Timer end auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; // printf("time of calc_anchor_all_layers: %f s\n", duration.count()); } void KVCache::clear_importance_all_layers(int* block_table, int* cache_seqlens, int batch_size, int max_block_num, WorkerPool* backend) { // Timer start auto start = std::chrono::high_resolution_clock::now(); // Each task updates the importance of a certain block seq_len_ = config_.block_len; backend->do_work_stealing_job( config_.layer_num * batch_size * max_block_num, nullptr, [&](int task_id) { int layer_id = task_id / (batch_size * max_block_num); int batch_id = (task_id / max_block_num) % batch_size; int block_id = task_id % max_block_num; // If the block is out of the sequence length, skip it. In // particular, the last block of the sequence that is shorter than // the block length should be skipped. if (cache_seqlens[batch_id] / config_.block_len < block_id) { return; } int block_idx = block_table[batch_id * max_block_num + block_id]; if (config_.anchor_type == AnchorType::DYNAMIC) { // clear anchor_ for (int head_id = 0; head_id < config_.q_head_num; head_id++) { for (int l = 0; l < config_.block_len; l++) { importance_[layer_id][block_idx][l][head_id] = 0; } } } }, nullptr); // Timer end auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; // printf("time of clear_importance_all_layerssssss: %f s\n", // duration.count()); } void KVCache::clear_kvcache_all_layers(int* block_table, int* cache_seqlens, int batch_size, int max_block_num, WorkerPool* backend) { // Timer start auto start = std::chrono::high_resolution_clock::now(); // Each task updates the importance of a certain block seq_len_ = config_.block_len; backend->do_work_stealing_job( config_.layer_num * batch_size * max_block_num * config_.kv_head_num, nullptr, [&](int task_id) { int layer_id = task_id / (batch_size * max_block_num * config_.kv_head_num); int batch_id = (task_id / (max_block_num * config_.kv_head_num)) % batch_size; int block_id = task_id / config_.kv_head_num % max_block_num; int head_id = task_id % config_.kv_head_num; // If the block is out of the sequence length, skip it. In // particular, the last block of the sequence that is shorter than // the block length should be skipped. if (cache_seqlens[batch_id] / config_.block_len < block_id) { return; } int block_idx = block_table[batch_id * max_block_num + block_id]; if (config_.kv_type == ggml_type::GGML_TYPE_F16) { for (int l = 0; l < config_.block_len * config_.head_dim; l++) { k_cache_fp16_[layer_id][head_id][block_idx][l] = 0; v_cache_fp16_[layer_id][head_id][block_idx][l] = 0; } } else if (config_.kv_type == ggml_type::GGML_TYPE_Q4_0) { for (int l = 0; l < config_.block_len * config_.head_dim / 32; l++) { k_cache_q4[layer_id][head_id][block_idx][l].d = 0; v_cache_q4[layer_id][head_id][block_idx][l].d = 0; } } else if (config_.kv_type == ggml_type::GGML_TYPE_Q8_0) { for (int l = 0; l < config_.block_len * config_.head_dim / 32; l++) { k_cache_q8[layer_id][head_id][block_idx][l].d = 0; v_cache_q8[layer_id][head_id][block_idx][l].d = 0; } } }, nullptr); // Timer end auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; // printf("time of clear_kvcache_all_layers: %f s\n", duration.count()); } void KVCache::get_sincos(ggml_fp16_t* sin, ggml_fp16_t* cos, int seqlen) { // Timer start auto start = std::chrono::high_resolution_clock::now(); const uint16_t* sin_data = const_cast(sin); const uint16_t* cos_data = const_cast(cos); for (int i = 0; i < seqlen; i++) { for (int j = 0; j < config_.head_dim; j++) { sin_[i][j] = sin_data[i * config_.head_dim + j]; cos_[i][j] = cos_data[i * config_.head_dim + j]; } } // Timer end auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; printf("time of get_sincos: %f s\n", duration.count()); } void ggml_vec_scale_f32(const int n, float* y, const float v) { #if defined(GGML_USE_ACCELERATE) vDSP_vsmul(y, 1, &v, y, 1, n); #elif defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); GGML_F32_VEC ay[GGML_F32_ARR]; for (int i = 0; i < np; i += GGML_F32_STEP) { for (int j = 0; j < GGML_F32_ARR; j++) { ay[j] = GGML_F32_VEC_LOAD(y + i + j * GGML_F32_EPR); ay[j] = GGML_F32_VEC_MUL(ay[j], vx); GGML_F32_VEC_STORE(y + i + j * GGML_F32_EPR, ay[j]); } } // leftovers for (int i = np; i < n; ++i) { y[i] *= v; } #else // scalar for (int i = 0; i < n; ++i) { y[i] *= v; } #endif }