mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-21 13:44:10 +00:00
* Remove most of split mode row
* WIP
* WIP: also allocate the KV cache using tensor split
* WIP: it runs with wrong result
But it also looks like the backend scheduler is not going to help:
* It copies mask and input positions to GPU 0
* => RoPE ops must run on GPU 0
* => To proceed attn evaluation, GPU 1 must wait for GPU 0 to finish its
entire attn calculation
* Same with FFN. The rms_norm gets scheduled on GPU 0. Hence, GPU 1 must
wait for GPU 0 to finish its entire FFN calculation before it can
start (as it needs to copy the result of rms_norm from GPU 0)
* => Seems useless without writing a bespoke TP scheduling
* WIP
* This works, but it is slow
* This is slightly better
the graph is still not being computed in parallel.
Why? Because the scheduler creates graph splits where the
result of the computation on one GPU becomes an input for the
other split. Hence, to trigger the computation on the second GPU
one needs to wait for the computation on the first GPU to finish,
even though the two can be done in parallel up to the synchronization
point. So, all that is left to do is to trick the scheduler to create
two splits that can be done in parallel, and then have a graph split
where the results get combined.
* Playing games with the scheduler
This change tricks it into doing the right thing^TM.
Still quite a bit slower than split mode layer for the 8B LlaMA model.
But for the 70B LlaMA it now beats split mode layer for TG:
28 t/s vs 24.4 t/s. PP is 627 t/s vs 744 t/s.
In comparison, split mode "row" in mainline gets
484 t/s PP and 19.3 t/s TG.
* Fix attn split
Granularity for Wq, Wo is not just head size, but
head size * gqa_ratio.
Else the Wk, Wv tensors end up not being a multiple of the
head size when we divide the split determined by Wo with
the gqa_ratio.
* Show memory used per device
* Make it work with partial offload
but no tensor overrides yet, just ngl < num_layers.
* Allow for f16 source in fused_rms_norm
* This results in faster PP.
Now PP is faster than split mode layer for L3-70B.
* Rename split mode "row" to split mode "graph"
* Leave FFN partial results as f16
* WIP GLM4.5 - runs with wrong results
* WIP GLM4.5 - this works
PP is already better than split mode layer, but TG for zero context
is kind of low - 60 vs 92 t/s. TG becomes better than split mode layer
at around 20k tokens. PP at 26k tokens is 1.55X of sm layer.
* Work around compiler bug
It issues a warning that there is an extra semicolon outside of a function,
but there isn't. If I remove the anonymous namespace and turn the
functions inside into static, the warning disappears, so clearly
a compiler bug.
* Make graph reuse work with split mode graph
* Remove more split mode row remnants
* WIP tensor overrides
Runs with wrong results, don't see where the issue could be.
* This works but is slow
Still does not work for row-interleaved quants
* Slightly better
* Slightly better
* Row-interleaved quants work
* Better
* Minor
* Guard against using split mode "graph" for unsupported models
* Guards against using merge_qkv with split mode "graph"
* WIP split mode attn
Works for LlaMA models, but not for GLM-4.5.
Doesn't seem to improve performance, so I guess no point in trying to
fix it.
* Split mode graph for qwen3moe
* Try to better distribute the splits
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
232 lines
5.8 KiB
C++
232 lines
5.8 KiB
C++
//
|
|
// Copyright (C) 2023-2025 The llama.cpp authors
|
|
// Copyright (C) 2024-2025 Iwan Kawrakow
|
|
// MIT license
|
|
// SPDX-License-Identifier: MIT
|
|
//
|
|
|
|
#pragma once
|
|
|
|
#define LLAMA_API_INTERNAL
|
|
#include "llama.h"
|
|
#include <stdexcept>
|
|
#include <climits>
|
|
#include <cstdarg>
|
|
#include <vector>
|
|
#include <cinttypes>
|
|
#include <cstring>
|
|
#include <string>
|
|
|
|
// LLAMA_ATTRIBUTE_FORMAT(fmt_index, first_arg_index) marks a function as taking a
// printf-style format string so that GNU-compatible compilers can type-check the
// variadic arguments against it. The macro arguments are forwarded unchanged into
// __attribute__((format(...))).
#ifdef __GNUC__
|
|
// MinGW builds select the gnu_printf archetype instead of plain printf.
#ifdef __MINGW32__
|
|
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
|
#else
|
|
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
|
#endif
|
|
#else
|
|
// Compilers that do not define __GNUC__: the macro expands to nothing.
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
|
#endif
|
|
|
|
|
|
//
|
|
// logging
|
|
//
|
|
|
|
// Internal printf-style logging entry point. The attribute tells the compiler that
// argument 2 (`format`) is the format string and that the variadic arguments to be
// checked against it start at argument 3.
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
|
void llama_log_internal (ggml_log_level level, const char * format, ...);
|
|
// Log callback taking an already formatted `text` plus an opaque `user_data` pointer.
// NOTE(review): presumably the callback installed when the user supplies none - confirm
// against the definition, which is not part of this header.
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
|
|
|
// Convenience wrappers: each forwards its arguments to llama_log_internal() with a fixed level.
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
|
#ifdef NDEBUG
|
|
// Release mode - make LLAMA_LOG_DEBUG a no-op.
// The macro arguments are discarded entirely, so they are not evaluated in NDEBUG builds.
|
|
#define LLAMA_LOG_DEBUG(...) ((void)0)
|
|
#else
|
|
#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
|
#endif
|
|
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
|
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
|
|
|
//
|
|
// helpers
|
|
//
|
|
|
|
// Replaces, in place, every non-overlapping occurrence of `search` in `s` with
// `replace`, scanning from left to right. Replaced text is never rescanned.
// An empty `search` leaves `s` untouched.
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }

    // Assemble the result in a separate string and only assign it to `s` at the
    // very end, so `s` stays intact while it is being searched.
    std::string result;
    result.reserve(s.length());

    size_t cursor = 0; // start of the part of `s` that has not been copied yet
    for (size_t hit = s.find(search, cursor); hit != std::string::npos; hit = s.find(search, cursor)) {
        result.append(s, cursor, hit - cursor); // text between the previous match and this one
        result.append(replace);
        cursor = hit + search.length();
    }

    // Tail after the last match (the whole string when nothing matched).
    result.append(s, cursor, std::string::npos);
    s = std::move(result);
}
|
|
|
|
|
|
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
|
// Fixed-capacity FIFO ring buffer. It works similarly to std::deque, but once
// `capacity` elements are stored, push_back() overwrites the oldest element
// instead of growing.
//
// State:
//   first - index of the oldest stored element
//   pos   - index of the slot the next push_back() writes to
//   sz    - number of stored elements (<= capacity)
template<typename T>
struct ring_buffer {
    // Creates an empty buffer with room for `cap` elements; the storage holds
    // `cap` value-initialized T objects from the start.
    ring_buffer(size_t cap) : capacity(cap), data(cap) {}

    // Oldest element. Throws std::runtime_error if the buffer is empty.
    T & front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    const T & front() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    // Most recently pushed element. Throws std::runtime_error if the buffer is empty.
    T & back() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        // `pos` is the *next* write slot, so the newest element sits one slot
        // before it. Indexing from `first` avoids an unsigned underflow when
        // pos == 0 (sz >= 1 and capacity >= 1 are guaranteed here).
        return data[(first + sz - 1) % capacity];
    }

    const T & back() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[(first + sz - 1) % capacity];
    }

    // Appends `value`. When the buffer is already full the oldest element is
    // overwritten. Throws std::runtime_error if the capacity is zero.
    void push_back(const T & value) {
        if (capacity == 0) {
            throw std::runtime_error("ring buffer: capacity is zero");
        }

        if (sz == capacity) {
            // advance the start when buffer is full
            first = (first + 1) % capacity;
        } else {
            sz++;
        }
        data[pos] = value;
        pos = (pos + 1) % capacity;
    }

    // Removes the oldest element and returns a copy of it.
    // Throws std::runtime_error if the buffer is empty.
    T pop_front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        T value = data[first];
        first = (first + 1) % capacity;
        sz--;
        return value;
    }

    //T & operator[](size_t i) {
    //    if (i >= sz) {
    //        throw std::runtime_error("ring buffer: index out of bounds");
    //    }
    //    return data[(first + i) % capacity];
    //}

    //const T & at(size_t i) const {
    //    if (i >= sz) {
    //        throw std::runtime_error("ring buffer: index out of bounds");
    //    }
    //    return data[(first + i) % capacity];
    //}

    // Reverse access: rat(0) is the newest element, rat(size() - 1) the oldest.
    // Throws std::runtime_error if `i` is not smaller than size().
    const T & rat(size_t i) const {
        if (i >= sz) {
            throw std::runtime_error("ring buffer: index out of bounds");
        }
        return data[(first + sz - i - 1) % capacity];
    }

    // Copies the stored elements, oldest first, into a new vector.
    std::vector<T> to_vector() const {
        std::vector<T> result;
        result.reserve(sz);
        for (size_t i = 0; i < sz; i++) {
            result.push_back(data[(first + i) % capacity]);
        }
        return result;
    }

    // Empties the buffer.
    void clear() {
        // here only reset the status of the buffer; the slots in `data` are left as they are
        sz = 0;
        first = 0;
        pos = 0;
    }

    bool empty() const {
        return sz == 0;
    }

    size_t size() const {
        return sz;
    }

    size_t capacity = 0; // maximum number of elements
    size_t sz = 0;       // current number of elements
    size_t first = 0;    // index of the oldest element
    size_t pos = 0;      // index of the next slot to write
    std::vector<T> data; // backing storage, always `capacity` slots
};
|
|
|
|
// printf-style formatting into a std::string.
// The output is produced in two passes: the first vsnprintf() call only measures
// the required length, the second one writes into a buffer of exactly that size.
// Each pass consumes its own va_list, hence the va_copy.
LLAMA_ATTRIBUTE_FORMAT(1, 2)
static std::string format(const char * fmt, ...) {
    va_list args_measure;
    va_list args_write;
    va_start(args_measure, fmt);
    va_copy(args_write, args_measure);

    // Pass 1: measure only; nothing is written.
    // NOTE: the asserted expressions are kept verbatim in case GGML_ASSERT reports them textually.
    const int size = vsnprintf(nullptr, 0, fmt, args_measure);
    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT

    // Pass 2: write the text, with one extra byte for the terminating NUL.
    std::vector<char> storage(size + 1);
    const int size2 = vsnprintf(storage.data(), size + 1, fmt, args_write);
    GGML_ASSERT(size2 == size);

    va_end(args_write);
    va_end(args_measure);

    // The terminating NUL is not part of the returned string.
    return std::string(storage.data(), size);
}
|
|
|
|
// Renders a list of dimension sizes as a comma separated string in which every
// value is right-aligned to a minimum width of 5, e.g. {1, 2, 3} -> "    1,     2,     3".
// The text is built in a 256-byte buffer; anything beyond that is truncated.
// Throws std::out_of_range (from vector::at) when `ne` is empty.
static std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
    char text[256];

    // The first dimension carries no separator.
    snprintf(text, sizeof(text), "%5" PRId64, ne.at(0));

    for (size_t dim = 1; dim < ne.size(); ++dim) {
        // Continue writing at the current end of the text. strlen() is always
        // smaller than sizeof(text), so the remaining size is at least 1.
        const size_t used = strlen(text);
        snprintf(text + used, sizeof(text) - used, ", %5" PRId64, ne.at(dim));
    }

    return text;
}
|
|
|
|
// Renders the first GGML_MAX_DIMS entries of `t->ne` as a comma separated string in
// which every value is right-aligned to a minimum width of 5.
// `t` is dereferenced unconditionally and must not be null.
static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
    char text[256];

    // The first dimension carries no separator.
    snprintf(text, sizeof(text), "%5" PRId64, t->ne[0]);

    for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
        // Continue writing at the current end of the text.
        const size_t used = strlen(text);
        snprintf(text + used, sizeof(text) - used, ", %5" PRId64, t->ne[dim]);
    }

    return text;
}
|
|
|
|
// Thin wrapper around a single T whose constructor deliberately does nothing.
// Because the constructor is user-provided (not defaulted) and has an empty
// initializer list, `value` is only default-initialized, even when the wrapper
// itself is value-initialized as in `no_init<T>()`. For scalar T that means the
// value is left uninitialized rather than being zeroed.
template <typename T>
struct no_init {
    // Intentionally empty - do not replace with `= default`, that would
    // zero `value` on value-initialization.
    no_init() {}

    T value;
};
|
|
|
|
|
|
struct gguf_context;
|
|
std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i);
|
|
|
|
ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer);
|
|
|
|
// Bundles a list of ggml tensor pointers with a ggml_split_tensor_t descriptor.
// NOTE(review): presumably describes one model tensor that has been split across
// several devices for split mode "graph" - confirm against the code that fills it.
struct llama_split_tensor {
|
|
// The individual tensors making up the split.
// NOTE(review): presumably one entry per device, possibly null where a device holds no part - TODO confirm.
std::vector<ggml_tensor *> tensor_splits;
|
|
// ggml-side descriptor of the split; its layout is defined by ggml, not in this header.
ggml_split_tensor_t ggml;
|
|
};
|