diff --git a/common/common.cpp b/common/common.cpp index 08249ab7..f4074a61 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2513,7 +2513,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max }); options.push_back({ "*", "--draft-min, --draft-n-min N", "minimum number of draft tokens to use for speculative decoding" }); options.push_back({ "*", "--draft-p-min P", "minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min }); - options.push_back({ "*", "--spec-type Name", "[none | ngram - cache | ngram - simple | ngram - map - k | ngram - map - k4v | ngram - mod]", "type of speculative decoding to use when no draft model is provided (default: %s)\n", (double)params.speculative.type}); + options.push_back({ "*", "--spec-type Name [none | ngram - cache | ngram - simple | ngram - map - k | ngram - map - k4v | ngram - mod]", "type of speculative decoding to use when no draft model is provided (default: %d)\n", (int)params.speculative.type}); options.push_back({ "*", "--spec-ngram-size-n N", "ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)\n",params.speculative.ngram_size_n }); options.push_back({ "*", "--spec-ngram-size-m N", "ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)\n", params.speculative.ngram_size_m }); diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 08f8bf0c..0d4077bf 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -36,15 +36,15 @@ foreach(asset ${PUBLIC_ASSETS}) OUTPUT "${output}" COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake" ) - message("TARGET_SRCS contains: ${input}") - set_source_files_properties(${output} PROPERTIES GENERATED TRUE) - + message("TARGET_SRCS contains: ${input}") + set_source_files_properties(${output} PROPERTIES GENERATED TRUE) + endforeach() # include new llamacpp webui set(ALT_PUBLIC_ASSETS index_llamacpp.html.gz - loading.html + loading.html ) foreach(asset ${ALT_PUBLIC_ASSETS}) @@ -56,9 +56,9 @@ foreach(asset ${ALT_PUBLIC_ASSETS}) OUTPUT "${output}" COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake" ) - message("TARGET_SRCS contains: ${input}") - set_source_files_properties(${output} PROPERTIES GENERATED TRUE) - + message("TARGET_SRCS contains: ${input}") + set_source_files_properties(${output} PROPERTIES GENERATED TRUE) + endforeach() @@ -68,14 +68,14 @@ target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) if (MSVC) - target_link_options(${TARGET} PRIVATE - $<$:/STACK:20971520,1048576 > - $<$:/STACK:20971520,1048576> - ) + target_link_options(${TARGET} PRIVATE + $<$:/STACK:20971520,1048576 > + $<$:/STACK:20971520,1048576> + ) endif() # target_link_libraries(${TARGET} PRIVATE "/STACK:104857600") target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) -target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) +#target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../mtmd) target_link_libraries(${TARGET} PRIVATE common mtmd cpp-httplib ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp index da0f4965..ea99b068 100644 --- a/examples/server/server-context.cpp +++ b/examples/server/server-context.cpp @@ -1534,7 +1534,7 @@ void server_context::populate_token_probs(const server_slot& slot, completion_to for (size_t i = 0; i < std::min(max_probs, n_probs); i++) { result.probs.push_back({ cur_p->data[i].id, - common_token_to_piece(ctx, {cur_p->data[i].id}, special), + common_token_to_piece(ctx, cur_p->data[i].id, special), cur_p->data[i].p }); } @@ -1550,7 +1550,7 @@ void server_context::populate_token_probs(const server_slot& slot, completion_to for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) { result.probs.push_back({ cur[i].id, - common_token_to_piece(ctx, {cur[i].id}, special), + common_token_to_piece(ctx, cur[i].id, special), cur[i].p }); } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 781deb1c..3c103e6a 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -773,7 +773,7 @@ extern "C" { struct ggml_tensor { enum ggml_type type; - GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor"); + enum ggml_backend_type __backend; struct ggml_backend_buffer * buffer; diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 104ad664..cfba2e5f 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -4158,7 +4158,7 @@ GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t return ctx->all_data; } -GGML_CALL void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { memset((char *)tensor->data + offset, value, size); GGML_UNUSED(buffer); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 059b589c..c4896d44 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -24763,7 +24763,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_ADD_ID: { GGML_ABORT("fatal error"); // TODO: implement - } break; + } case GGML_OP_ADD1: { if (src0->grad) { diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 66cfed83..19ddcea4 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -8261,13 +8261,14 @@ float QuantizerIQKT::find_best } template -void QuantizerIQKT::find_best_match(float d, const float * xb, const float * weight, int * best_idx) const { +void QuantizerIQKT::find_best_match(float d, + [[maybe_unused]] const float * xb, [[maybe_unused]] const float * weight, int * best_idx) const { if (!d) { std::memset(best_idx, 0, kNg*sizeof(int)); return; } - int ncluster = m_clusters.size()/kGroupSize; - float id = 1/d; + [[maybe_unused]] int ncluster = m_clusters.size()/kGroupSize; + [[maybe_unused]] float id = 1/d; #ifdef __AVX2__ if constexpr (kGroupSize == 8) { __m256 sqx[8]; @@ -9802,8 +9803,8 @@ bool check_tensor_for_blocks_256_fp16(const ggml_tensor * tensor) { nbad += check_row_for_blocks_256_fp16(nblock, x); } if (nbad > 0) { - fprintf(stderr, "%s: found %d NaN block scales out of %ld blocks in tensor %s\n", __func__, - nbad, ggml_nrows(tensor)*nblock, tensor->name); + fprintf(stderr, "%s: found %d NaN block scales out of %g blocks in tensor %s\n", __func__, + nbad, 1.*ggml_nrows(tensor)*nblock, tensor->name); if (tensor->ne[2] > 1) { int nb = tensor->ne[0]/QK_K; for (int64_t i02 = 0; i02 < tensor->ne[2]; ++i02) { @@ -9813,7 +9814,7 @@ bool check_tensor_for_blocks_256_fp16(const ggml_tensor * tensor) { auto xr = (const Block *)(xex + i01*tensor->nb[1]); nbad_expert += check_row_for_blocks_256_fp16(nb, xr); } - if (nbad_expert > 0) fprintf(stderr," there are %d NaN block scales for expert %ld\n", nbad_expert, i02); + if (nbad_expert > 0) fprintf(stderr," there are %d NaN block scales for expert %g\n", nbad_expert, 1.*i02); } } return false; @@ -9839,8 +9840,8 @@ bool check_tensor_for_blocks_256_fp16_repacked(const ggml_tensor * tensor) { nbad += check_row_for_blocks_256_fp16(nblock, x, nr); } if (nbad > 0) { - fprintf(stderr, "%s: found %d NaN block scales out of %ld blocks in tensor %s\n", __func__, - nbad, ggml_nrows(tensor)*nblock, tensor->name); + fprintf(stderr, "%s: found %d NaN block scales out of %g blocks in tensor %s\n", __func__, + nbad, 1.*ggml_nrows(tensor)*nblock, tensor->name); if (tensor->ne[2] > 1) { int nb = tensor->ne[0]/QK_K; for (int64_t i02 = 0; i02 < tensor->ne[2]; ++i02) { @@ -9850,7 +9851,7 @@ bool check_tensor_for_blocks_256_fp16_repacked(const ggml_tensor * tensor) { auto xr = (const Block *)(xex + i01*tensor->nb[1]); nbad_expert += check_row_for_blocks_256_fp16(nb, xr, nr); } - if (nbad_expert > 0) fprintf(stderr," there are %d NaN block scales for expert %ld\n", nbad_expert, i02); + if (nbad_expert > 0) fprintf(stderr," there are %d NaN block scales for expert %g\n", nbad_expert, 1.*i02); } } return false; diff --git a/include/llama.h b/include/llama.h index cd8da575..62e0dcdb 100644 --- a/include/llama.h +++ b/include/llama.h @@ -575,6 +575,7 @@ extern "C" { LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); + struct llama_vocab; LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab); LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); @@ -586,7 +587,7 @@ extern "C" { LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd_inp(const struct llama_model* model); - + LLAMA_API int32_t llama_n_layer (const struct llama_model * model); // Compat diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 4a65c9cb..2fa0db9a 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -275,7 +275,7 @@ struct llama_mmap::impl { #ifdef _POSIX_MAPPED_FILES std::vector> mapped_fragments; - impl(struct llama_file * file, size_t prefetch, bool numa, bool use_thp) { + impl(struct llama_file * file, size_t prefetch, bool numa, [[maybe_unused]] bool use_thp) { size = file->size(); int fd = file->file_id(); int flags = MAP_SHARED; diff --git a/src/llama.cpp b/src/llama.cpp index a477f42c..22ecc4a4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1708,8 +1708,8 @@ static void llm_prepare_mla(llama_model & model, int mla) { l.wk_b = l.computed_wk_b.get(); model.tensors_by_name.push_back(std::make_pair(name, l.wk_b)); - printf("Computed %s as %ld x %ld x %ld and stored in buffer %s\n", name.c_str(), wk_b->ne[0], wk_b->ne[1], wk_b->ne[2], - ggml_backend_buffer_name(l.computed_wk_b->buffer)); + printf("Computed %s as %d x %d x %d of type %s and stored in buffer %s\n", name.c_str(), (int)wk_b->ne[0], (int)wk_b->ne[1], (int)wk_b->ne[2], + ggml_type_name(wk_b->type), ggml_backend_buffer_name(l.computed_wk_b->buffer)); ggml_graph_clear(graph); auto wv_b = ggml_cont(ctx, ggml_view_3d(ctx, &wkv_b, kv_lora_rank, n_embd_head_v, n_head, @@ -1740,8 +1740,8 @@ static void llm_prepare_mla(llama_model & model, int mla) { l.wv_b = l.computed_wv_b.get(); model.tensors_by_name.push_back(std::make_pair(name, l.wv_b)); - printf("Computed %s as %ld x %ld x %ld and stored in buffer %s\n", name.c_str(), wv_b->ne[0], wv_b->ne[1], wv_b->ne[2], - ggml_backend_buffer_name(l.computed_wv_b->buffer)); + printf("Computed %s as %d x %d x %d of type %s and stored in buffer %s\n", name.c_str(), (int)wv_b->ne[0], (int)wv_b->ne[1], (int)wv_b->ne[2], + ggml_type_name(wv_b->type), ggml_backend_buffer_name(l.computed_wv_b->buffer)); ggml_graph_clear(graph); } @@ -1875,8 +1875,8 @@ static void llm_prepare_mla(llama_model & model, int mla) { l.wkv_b = l.computed_wkv_b.get(); model.tensors_by_name.push_back(std::make_pair(name, l.wkv_b)); - printf("Computed %s as %ld x %ld and stored in buffer %s\n", name.c_str(), wkv_b->ne[0], wkv_b->ne[1], - ggml_backend_buffer_name(l.computed_wkv_b->buffer)); + printf("Computed %s as %d x %d of type %s and stored in buffer %s\n", name.c_str(), (int)wkv_b->ne[0], (int)wkv_b->ne[1], + ggml_type_name(wkv_b->type), ggml_backend_buffer_name(l.computed_wkv_b->buffer)); ggml_graph_clear(graph); } @@ -6349,7 +6349,7 @@ struct llama_data_read { } continue; } - const uint64_t k_size_row = (ctx->cparams.mla_attn == 0) ? ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa) : ggml_row_size(kv_self.k_l[il]->type, kv_lora_rank + n_embd_head_qk_rope); + const size_t k_size_row = (ctx->cparams.mla_attn == 0) ? ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa) : ggml_row_size(kv_self.k_l[il]->type, kv_lora_rank + n_embd_head_qk_rope); if (k_size_row != k_size_row_ref) { LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); return false;