mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-24 07:04:11 +00:00
* gpt-oss: common * gpt-oss: attention sinks, swiglu_oai * gpt-oss: WIP llama Model loads and runs (CPU only), but PPL is much too high (~1500 for 1st batch vs ~200 in mainline). Is it because of SWA, because of vocab, or did I introduce a bug somewhere? * gpt-oss: CPU seems to be working It was the SWA that was missing in the previous commit. There are issues with EOG tokens, so this still needs to be added. * CUDA: ADD_ID Just a copy from mainline * gpt-oss: Seems to be working on CUDA * gpt-oss: add sinks to the attn-vec kernels * CUDA: add head size of 64 to new mma Haven't turned it on yet, but observe slightly better PP and slightly worse TG performance with that. * gpt-oss: add ability to use -fmoe (only CUDA for now) * Move row sums to the right place * Add sinks to iqk flash attention * gpt_oss: Implement -fmoe on the CPU * Simdify swiglu_oai Turning it off for now as performance becomes more variable, so perhaps I'm running into thermal throttling more often because of making the CPU work too hard. * llama: factor out model loader * Builds successfully * It runs, but mmap does not work * Fix llama_mmap so mmap works * Minor * Fix CUDA after latest changes * Attempt to use CUDA graphs with MoE models - not working * CUDA graphs WIP - still not working * CUDA graphs - seems to be working Likely not all MLA variants are working. I no longer remember why I added the q8_0 cpy that transposes the tensor, but if really needed, this is now missing. Also missing is q6_0. * Make q8_0 cache work for DeepSeek models with CUDA graphs * cuda: cpy for q6_0 * Fix llama_mmap on non-Linux platforms * Adding forgotten file * Iterating on Windows build failures * cuda: re-add q8_0 -> q8_0 transpose so mla = 2 can be used with CUDA graphs and q8_0 cache. * Disable graphs without -fmoe * Minor * Turn graphs on by default --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
69 lines
1.4 KiB
C++
69 lines
1.4 KiB
C++
#pragma once

#include <cstdint>
#include <memory>
#include <vector>

// Forward declarations only: the concrete layouts live behind a pimpl in the
// implementation file, so this header never needs the complete types.
struct llama_file;
struct llama_mmap;
struct llama_mlock;

// Owning collections of the above. Elements are held through unique_ptr so
// the incomplete types above are sufficient at the point of declaration.
using llama_files  = std::vector<std::unique_ptr<llama_file>>;
using llama_mmaps  = std::vector<std::unique_ptr<llama_mmap>>;
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
// File handle with basic positioned read/write helpers. The constructor opens
// the file and the destructor releases it (RAII); all platform-specific state
// is hidden behind the pimpl so this header stays OS-agnostic.
struct llama_file {
    // fname: path to open; mode: presumably an fopen()-style mode string
    // (e.g. "rb") — confirm against the implementation in the .cpp.
    llama_file(const char * fname, const char * mode);
    ~llama_file();

    size_t tell() const;  // current offset within the file
    size_t size() const;  // total size of the file in bytes

    int file_id() const; // fileno overload

    // Reposition the file offset; whence follows the SEEK_SET/SEEK_CUR/SEEK_END
    // convention of fseek/lseek. const because it does not mutate visible state.
    void seek(size_t offset, int whence) const;

    // Raw reads/writes of `len` bytes at the current offset, plus fixed-width
    // 32-bit helpers used by the GGUF loader.
    void read_raw(void * ptr, size_t len) const;
    uint32_t read_u32() const;

    void write_raw(const void * ptr, size_t len) const;
    void write_u32(uint32_t val) const;

private:
    struct impl;                 // defined in the implementation file
    std::unique_ptr<impl> pimpl; // owning handle to the hidden state
};
// Memory mapping of a llama_file's contents. The mapping is released by the
// destructor (RAII). Non-copyable: it represents a unique OS resource.
struct llama_mmap {
    llama_mmap(const llama_mmap &) = delete;
    // Copy assignment was already implicitly deleted (unique_ptr member +
    // deleted copy ctor path); spelled out for Rule-of-Five clarity.
    llama_mmap & operator=(const llama_mmap &) = delete;
    // NOTE(review): parameter semantics inferred from names — prefetch is
    // presumably a byte count to read ahead ((size_t) -1 = everything), numa
    // toggles NUMA-aware mapping, use_thp transparent huge pages. Confirm in
    // the implementation file.
    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false, bool use_thp = false);
    ~llama_mmap();

    size_t size() const;  // length of the mapping in bytes
    void * addr() const;  // base address of the mapping

    // Release part of the mapping back to the OS (presumably the byte range
    // [first, last) — verify against the implementation).
    void unmap_fragment(size_t first, size_t last);

    // Whether memory mapping is available on this platform.
    static const bool SUPPORTED;

private:
    struct impl;                 // defined in the implementation file
    std::unique_ptr<impl> pimpl; // owning handle to the hidden state
};
// Pins a growable memory region into RAM (mlock-style) so model weights are
// not paged out; the lock is released by the destructor. The two-phase
// init()/grow_to() API lets callers lock an existing allocation incrementally.
struct llama_mlock {
    llama_mlock();
    ~llama_mlock();

    // Set the base address of the region to lock. Must precede grow_to().
    void init(void * ptr);
    // Extend the locked region to cover target_size bytes from the base.
    void grow_to(size_t target_size);

    // Whether memory locking is available on this platform.
    static const bool SUPPORTED;

private:
    struct impl;                 // defined in the implementation file
    std::unique_ptr<impl> pimpl; // owning handle to the hidden state
};
// Maximum filesystem path length on the current platform (PATH_MAX-like).
// NOTE(review): exact semantics live in the implementation file — confirm.
size_t llama_path_max();
|