Files
ik_llama.cpp/src/llama-model-loader.h
Kawrakow fc06bc9d27 Enable CUDA graphs for MoE models + GPT-OSS support (#689)
* gmp-oss: common

* gpt-oss: attnetion sinks, swiglu_oai

* gpt-oss: WIP llama

Model loads and runs (CPU only), but PPL is much to high
(~1500 for 1st batch vs ~200 in mainline).
Is it because of SWA, because of vocab, or did I introduce a bug somewhere?

* gpt-oss: CPU seems to be working

It was the SWA thta was missing in the previous commit.

There are issues with EOG tokens, so this still needs to be added.

* CUDA: ADD_ID

Just a copy from mainline

* gpt-oss: Seems to be working on CUDA

* gpt-oss: add sinks to the attn-vec kernels

* CUDA: add head size of 64 to new mma

Haven't turned it on yet, but observe slightly better PP and slightly
worse TG performance with that.

* gpt-oss: add ability to use -fmoe (only CUDA for now)

* Move row sums to the write place

* Add sinks to iqk flash attention

* gpt_oss: Implement -fmoe on the CPU

* Simdify swiglu_oai

Turning it off for now as performance becomes more variable,
so perhaps I'm running into thermal trottling imore often
because of making the CPU work too hard.

* llama: factor out model loader

* Builds successfully

* It runs, but mmap does not work

* Fix llama_mmap so mmap works

* Minor

* Fix CUDA after latest changes

* Attempt to use CUDA graphs with MoE models - not working

* CUDA graphs WIP - still not working

* CUDA graphs - seems to be working

Likely not all MLA variants are working.
I no longer remember why I added the q8_0 cpy that
transposes the tensor, but if really needed, this is now
missing. Also missing is q6_0.

* Make q8_0 cache work for DeepSeek models with CUDA graphs

* cuda: cpy for q6_0

* Fix llama_mmap on non-Linux platforms

* Adding forgotten file

* Iterating on Windows build failures

* cuda: re-add q8_0 -> q8_0 transpose

so mla = 2 can be used with CUDA graphs and q8_0 cache.

* Disable graphs without -fmoe

* Minor

* Turn graphs on by default

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2025-08-15 09:18:07 +03:00

170 lines
5.9 KiB
C++

#pragma once
#include "llama.h"
#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-arch.h"
#include <cstdint>
#include <cstddef>
#include <stdexcept>
#include <unordered_map>
#include <vector>
enum llama_fver {
GGUF_FILE_VERSION_V1 = 1,
GGUF_FILE_VERSION_V2 = 2,
GGUF_FILE_VERSION_V3 = 3,
};
static const char * llama_file_version_name(llama_fver version) {
switch (version) {
case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
case GGUF_FILE_VERSION_V2: return "GGUF V2";
case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
}
return "unknown";
}
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
struct llama_model_loader {
int n_kv = 0;
int n_tensors = 0;
int n_created = 0;
int64_t n_elements = 0;
size_t n_bytes = 0;
bool use_mmap = false;
bool check_tensors;
bool repack_tensors = false;
bool use_thp = false;
llama_files files;
llama_ftype ftype;
llama_fver fver;
llama_mmaps mappings;
// Holds information on a model weight
struct llama_tensor_weight {
uint16_t idx; // source file index
size_t offs; // tensor data offset in the original file
ggml_tensor * tensor;
llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
}
}
};
std::vector<llama_tensor_weight> weights;
std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
const llama_model_tensor_buft_override * tensor_buft_overrides;
gguf_context * meta = NULL;
std::vector<ggml_context *> contexts;
std::string arch_name;
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp,
const llama_model_kv_override * param_overrides_p,
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
~llama_model_loader();
template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
get_arr_n(const std::string & key, T & result, const bool required = true);
template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
get_arr_n(const enum llm_kv kid, T & result, const bool required = true);
template<typename T>
bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true);
template<typename T, size_t N_MAX>
bool get_arr(const std::string & key, std::array<T, N_MAX> & result, const bool required = true);
template<typename T>
bool get_arr(const enum llm_kv kid, T & result, const bool required = true);
template<typename T>
bool get_key(const std::string & key, T & result, const bool required = true);
template<typename T>
bool get_key(const enum llm_kv kid, T & result, const bool required = true);
// get array of n <= N_MAX elements, or a single element repeated n times
template<typename T, size_t N_MAX>
bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, const bool required = true);
template<typename T>
bool get_key_or_arr(const enum llm_kv kid, T & result, uint32_t n, const bool required = true);
const std::string& get_arch_name() const { return arch_name; }
enum llm_arch get_arch() const { return llm_kv.arch; }
const char * get_tensor_name(int i) const;
const llama_tensor_weight * get_weight(const char * name) const;
const llama_tensor_weight * get_weight(int i) const {
return get_weight(get_tensor_name(i));
}
const llama_tensor_weight & require_weight(const char * name) const;
struct ggml_tensor * get_tensor_meta(const char * name) const;
struct ggml_tensor * require_tensor_meta(const char * name) const;
struct ggml_tensor * get_tensor_meta(int i) const {
return get_tensor_meta(get_tensor_name(i));
}
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated);
const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
static const int TENSOR_NOT_REQUIRED = 1 << 0;
static const int TENSOR_DUPLICATED = 1 << 1;
static const int TENSOR_SKIP = 1 << 2;
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0);
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base,
const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true);
void done_getting_tensors() const;
void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr, bool use_thp = false);
void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;
// for backwards compatibility, does not support ggml-backend
void load_data_for(struct ggml_tensor * cur) const;
size_t size_done = 0;
size_t size_data = 0;
std::vector<std::pair<size_t, size_t>> mmaps_used;
// Returns false if cancelled by progress_callback
bool load_all_data(
struct ggml_context * ctx,
llama_buf_map & bufs_mmap,
llama_mlocks * lmlocks,
llama_progress_callback progress_callback,
void * progress_callback_user_data);
};