mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-20 13:14:09 +00:00
* gpt-oss: common * gpt-oss: attention sinks, swiglu_oai * gpt-oss: WIP llama Model loads and runs (CPU only), but PPL is much too high (~1500 for 1st batch vs ~200 in mainline). Is it because of SWA, because of vocab, or did I introduce a bug somewhere? * gpt-oss: CPU seems to be working It was the SWA that was missing in the previous commit. There are issues with EOG tokens, so this still needs to be added. * CUDA: ADD_ID Just a copy from mainline * gpt-oss: Seems to be working on CUDA * gpt-oss: add sinks to the attn-vec kernels * CUDA: add head size of 64 to new mma Haven't turned it on yet, but observe slightly better PP and slightly worse TG performance with that. * gpt-oss: add ability to use -fmoe (only CUDA for now) * Move row sums to the right place * Add sinks to iqk flash attention * gpt_oss: Implement -fmoe on the CPU * Simdify swiglu_oai Turning it off for now as performance becomes more variable, so perhaps I'm running into thermal throttling more often because of making the CPU work too hard. * llama: factor out model loader * Builds successfully * It runs, but mmap does not work * Fix llama_mmap so mmap works * Minor * Fix CUDA after latest changes * Attempt to use CUDA graphs with MoE models - not working * CUDA graphs WIP - still not working * CUDA graphs - seems to be working Likely not all MLA variants are working. I no longer remember why I added the q8_0 cpy that transposes the tensor, but if really needed, this is now missing. Also missing is q6_0. * Make q8_0 cache work for DeepSeek models with CUDA graphs * cuda: cpy for q6_0 * Fix llama_mmap on non-Linux platforms * Adding forgotten file * Iterating on Windows build failures * cuda: re-add q8_0 -> q8_0 transpose so mla = 2 can be used with CUDA graphs and q8_0 cache. * Disable graphs without -fmoe * Minor * Turn graphs on by default --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
221 lines
5.5 KiB
C++
221 lines
5.5 KiB
C++
//
|
|
// Copyright (C) 2023-2025 The llama.cpp authors
|
|
// Copyright (C) 2024-2025 Iwan Kawrakow
|
|
// MIT license
|
|
// SPDX-License-Identifier: MIT
|
|
//
|
|
|
|
#pragma once
|
|
|
|
#define LLAMA_API_INTERNAL
|
|
#include "llama.h"
|
|
#include <stdexcept>
|
|
#include <climits>
|
|
#include <cstdarg>
|
|
#include <vector>
|
|
#include <cinttypes>
|
|
#include <cstring>
|
|
|
|
// LLAMA_ATTRIBUTE_FORMAT(fmt_index, first_arg_index) marks a variadic function so that
// compilers defining __GNUC__ check the call arguments against its format string.
// MinGW builds select the gnu_printf archetype, other GNU-compatible builds select
// printf, and every remaining compiler gets an empty expansion.
#if defined(__GNUC__) && defined(__MINGW32__)
#    define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#elif defined(__GNUC__)
#    define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#else
#    define LLAMA_ATTRIBUTE_FORMAT(...)
#endif
|
|
|
|
|
|
//
|
|
// logging
|
|
//
|
|
|
|
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
|
void llama_log_internal (ggml_log_level level, const char * format, ...);
|
|
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
|
|
|
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
|
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
|
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
|
#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
|
|
|
//
|
|
// helpers
|
|
//
|
|
|
|
// Replace every non-overlapping occurrence of `search` inside `s` with `replace`,
// scanning from left to right. The output is assembled in a scratch string and then
// moved into `s`, so text inserted from `replace` is never rescanned.
// An empty `search` leaves `s` untouched.
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }

    std::string result;
    result.reserve(s.length());

    // `cursor` marks the start of the not-yet-copied tail of `s`
    size_t cursor = 0;
    for (size_t hit = s.find(search, cursor); hit != std::string::npos; hit = s.find(search, cursor)) {
        result.append(s, cursor, hit - cursor); // untouched text in front of the match
        result.append(replace);
        cursor = hit + search.length();
    }

    // copy whatever follows the last match (the whole string if nothing matched)
    result.append(s, cursor, std::string::npos);
    s = std::move(result);
}
|
|
|
|
|
|
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
|
// Fixed-capacity FIFO that works similarly to std::deque: once `capacity` elements
// are stored, every further push_back() overwrites the oldest element.
//
// Invariant kept by all members: (first + sz) % capacity == pos, where `first` is
// the slot of the oldest element and `pos` is the slot the next push_back() writes.
template<typename T>
struct ring_buffer {
    // Allocates `cap` default-constructed slots; the buffer starts out empty.
    ring_buffer(size_t cap) : capacity(cap), data(cap) {}

    // Oldest stored element. Throws std::runtime_error if the buffer is empty.
    T & front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    const T & front() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    // Newest stored element. Throws std::runtime_error if the buffer is empty.
    T & back() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        // `pos` is the *next* write slot, not the last written one; the newest
        // element lives one slot before it, i.e. at the same index as rat(0)
        return data[(first + sz - 1) % capacity];
    }

    const T & back() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[(first + sz - 1) % capacity];
    }

    // Appends `value`; when the buffer is already full the oldest element is dropped.
    // Throws std::runtime_error if the buffer was created with capacity zero.
    void push_back(const T & value) {
        if (capacity == 0) {
            throw std::runtime_error("ring buffer: capacity is zero");
        }

        if (sz == capacity) {
            // advance the start when buffer is full
            first = (first + 1) % capacity;
        } else {
            sz++;
        }
        data[pos] = value;
        pos = (pos + 1) % capacity;
    }

    // Removes the oldest element and returns a copy of it.
    // Throws std::runtime_error if the buffer is empty.
    T pop_front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        T value = data[first];
        first = (first + 1) % capacity;
        sz--;
        return value;
    }

    //T & operator[](size_t i) {
    //    if (i >= sz) {
    //        throw std::runtime_error("ring buffer: index out of bounds");
    //    }
    //    return data[(first + i) % capacity];
    //}

    //const T & at(size_t i) const {
    //    if (i >= sz) {
    //        throw std::runtime_error("ring buffer: index out of bounds");
    //    }
    //    return data[(first + i) % capacity];
    //}

    // Reverse access: rat(0) is the newest element, rat(size() - 1) the oldest.
    // Throws std::runtime_error if `i` is not smaller than size().
    const T & rat(size_t i) const {
        if (i >= sz) {
            throw std::runtime_error("ring buffer: index out of bounds");
        }
        return data[(first + sz - i - 1) % capacity];
    }

    // Copies the stored elements, oldest first, into a new vector.
    std::vector<T> to_vector() const {
        std::vector<T> result;
        result.reserve(sz);
        for (size_t i = 0; i < sz; i++) {
            result.push_back(data[(first + i) % capacity]);
        }
        return result;
    }

    // Makes the buffer empty again.
    void clear() {
        // here only reset the status of the buffer; the slots keep their old values
        sz = 0;
        first = 0;
        pos = 0;
    }

    bool empty() const {
        return sz == 0;
    }

    size_t size() const {
        return sz;
    }

    size_t capacity = 0; // number of slots in `data`
    size_t sz = 0;       // number of elements currently stored
    size_t first = 0;    // slot of the oldest element
    size_t pos = 0;      // slot written by the next push_back()
    std::vector<T> data;
};
|
|
|
|
// printf-style formatting into a std::string.
// The arguments are traversed twice: a first vsnprintf() call with no buffer
// measures the output, a second one renders it into storage of exactly that size.
// Both passes are guarded by GGML_ASSERT (encoding error, or a length mismatch
// between the two passes).
LLAMA_ATTRIBUTE_FORMAT(1, 2)
static std::string format(const char * fmt, ...) {
    va_list args_measure;
    va_list args_render;
    va_start(args_measure, fmt);
    va_copy(args_render, args_measure); // a va_list cannot be reused after vsnprintf consumed it

    const int needed = vsnprintf(nullptr, 0, fmt, args_measure);
    GGML_ASSERT(needed >= 0 && needed < INT_MAX); // NOLINT

    std::vector<char> storage(needed + 1); // +1 for the terminating NUL
    const int written = vsnprintf(storage.data(), needed + 1, fmt, args_render);
    GGML_ASSERT(written == needed);

    va_end(args_render);
    va_end(args_measure);

    return std::string(storage.data(), needed);
}
|
|
|
|
// Render a list of dimensions as text: every value is right-aligned in a field of
// at least 5 characters and values are joined with ", ".
// `ne` must hold at least one entry (ne.at(0) throws std::out_of_range otherwise).
// Output is built in a 256-byte buffer, so longer renderings are cut off.
static std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
    char text[256];
    snprintf(text, sizeof(text), "%5" PRId64, ne.at(0));

    size_t dim = 1;
    while (dim < ne.size()) {
        // append behind what has been written so far, never past the end of the buffer
        const size_t used = strlen(text);
        snprintf(text + used, sizeof(text) - used, ", %5" PRId64, ne.at(dim));
        ++dim;
    }

    return std::string(text);
}
|
|
|
|
// Render the first GGML_MAX_DIMS entries of t->ne as text: every value is
// right-aligned in a field of at least 5 characters and values are joined with ", ".
// Output is built in a 256-byte buffer.
static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
    char text[256];
    snprintf(text, sizeof(text), "%5" PRId64, t->ne[0]);

    int dim = 1;
    while (dim < GGML_MAX_DIMS) {
        // append behind what has been written so far, never past the end of the buffer
        const size_t used = strlen(text);
        snprintf(text + used, sizeof(text) - used, ", %5" PRId64, t->ne[dim]);
        ++dim;
    }

    return std::string(text);
}
|
|
|
|
// Thin wrapper around a single `value` whose default constructor does nothing.
// The constructor is user-provided with an empty body on purpose (rather than
// `= default`): with a user-provided constructor, value-initialization such as
// no_init<T>() does not zero the object first, so a `value` of trivial type is
// left uninitialized.
template <typename T>
struct no_init {
    no_init() {}

    T value;
};
|
|
|
|
|
|
struct gguf_context;
|
|
std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i);
|
|
|
|
ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer);
|