diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 9ec8e518..695dc722 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -18739,7 +18739,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
                 dst->ne[2], dst->ne[1], dst->nb[1],
                 k->type, v->type, Dk, Dv, neq1, nek1,
                 q->nb[1], k->nb[1], v->nb[1], mask->nb[1],
-                q->data, k->data, v->data, mask->data, sinks->data,
+                q->data, k->data, v->data, mask->data, sinks ? sinks->data : NULL,
                 scale, softcap, (float *)dst->data,
                 params->wdata, (barrier_t)ggml_barrier, (void *)params->shared, ith, nth)) return;
diff --git a/src/llama-impl.h b/src/llama-impl.h
index 08005b21..befdc559 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -38,6 +38,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 #define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 
 //
 // helpers
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index dbae465d..d6c42de8 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -2333,7 +2333,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         } else {
             // token is control, but not marked as EOG -> print a debug log
             if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
-                LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n",
+                LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
                         __func__, t.second, t.first.c_str());
             }
         }
@@ -2568,7 +2568,7 @@ llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
 }
 
 void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
-    LLAMA_LOG_INFO("%s: initializing tokenizer for type %d\n", __func__, type);
+    LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
 
     switch (type) {
         case LLAMA_VOCAB_TYPE_SPM:
diff --git a/src/llama.cpp b/src/llama.cpp
index ee3f03df..56b0d30d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -346,6 +346,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_MASK_ID,                 "tokenizer.ggml.mask_token_id"            },
     { LLM_KV_TOKENIZER_ADD_BOS,                 "tokenizer.ggml.add_bos_token"            },
     { LLM_KV_TOKENIZER_ADD_EOS,                 "tokenizer.ggml.add_eos_token"            },
+    { LLM_KV_TOKENIZER_ADD_SEP,                 "tokenizer.ggml.add_sep_token"            },
     { LLM_KV_TOKENIZER_ADD_PREFIX,              "tokenizer.ggml.add_space_prefix"         },
     { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,         "tokenizer.ggml.remove_extra_whitespaces" },
     { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,    "tokenizer.ggml.precompiled_charsmap"     },
@@ -7326,7 +7327,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
     try {
-        model.vocab.load(ml, LLM_KV(model.arch));
+        LLM_KV kv(model.arch);
+        model.vocab.load(ml, kv);
     } catch(const std::exception & e) {
         throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
     }