mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-21 23:19:22 +00:00
Fix llama_mmap so mmap works
This commit is contained in:
1065
src/llama-mmap.cpp
1065
src/llama-mmap.cpp
File diff suppressed because it is too large
Load Diff
187
src/llama-mmap.h
187
src/llama-mmap.h
@@ -1,175 +1,68 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#ifndef PATH_MAX
|
||||
#define PATH_MAX MAX_PATH
|
||||
#endif
|
||||
#include <io.h>
|
||||
#endif
|
||||
struct llama_file;
|
||||
struct llama_mmap;
|
||||
struct llama_mlock;
|
||||
|
||||
using llama_files = std::vector<std::unique_ptr<llama_file>>;
|
||||
using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
|
||||
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
||||
|
||||
struct llama_file {
|
||||
llama_file(const char * fname, const char * mode);
|
||||
~llama_file();
|
||||
|
||||
#if defined(_WIN32)
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
HANDLE fp_win32;
|
||||
size_t size;
|
||||
size_t tell() const;
|
||||
size_t size() const;
|
||||
|
||||
int file_id() const; // fileno overload
|
||||
|
||||
void seek(size_t offset, int whence) const;
|
||||
|
||||
void read_raw(void * ptr, size_t len) const;
|
||||
uint32_t read_u32() const;
|
||||
|
||||
void write_raw(const void * ptr, size_t len) const;
|
||||
void write_u32(uint32_t val) const;
|
||||
|
||||
private:
|
||||
std::string GetErrorMessageWin32(DWORD error_code) const;
|
||||
|
||||
public:
|
||||
|
||||
llama_file(const char * fname, const char * mode);
|
||||
|
||||
size_t tell() const;
|
||||
|
||||
void seek(size_t offset, int whence) const;
|
||||
|
||||
void read_raw(void * ptr, size_t len) const;
|
||||
|
||||
uint32_t read_u32() const {
|
||||
uint32_t val;
|
||||
read_raw(&val, sizeof(val));
|
||||
return val;
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t len) const;
|
||||
|
||||
void write_u32(std::uint32_t val) const {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
~llama_file();
|
||||
#else
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
|
||||
llama_file(const char * fname, const char * mode);
|
||||
|
||||
size_t tell() const;
|
||||
|
||||
void seek(size_t offset, int whence) const;
|
||||
|
||||
void read_raw(void * ptr, size_t len) const;
|
||||
|
||||
uint32_t read_u32() const {
|
||||
uint32_t ret;
|
||||
read_raw(&ret, sizeof(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t len) const;
|
||||
|
||||
void write_u32(std::uint32_t val) const {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
~llama_file();
|
||||
#endif
|
||||
struct impl;
|
||||
std::unique_ptr<impl> pimpl;
|
||||
};
|
||||
using llama_files = std::vector<std::unique_ptr<llama_file>>;
|
||||
|
||||
struct llama_mmap {
|
||||
void * addr;
|
||||
size_t size;
|
||||
size_t mapped_page_size = 0;
|
||||
|
||||
llama_mmap(const llama_mmap &) = delete;
|
||||
|
||||
#ifdef _POSIX_MAPPED_FILES
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
// list of mapped fragments (first_offset, last_offset)
|
||||
std::vector<std::pair<size_t, size_t>> mapped_fragments;
|
||||
|
||||
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false, bool use_thp = false);
|
||||
|
||||
static void align_range(size_t * first, size_t * last, size_t page_size) {
|
||||
// align first to the next page
|
||||
size_t offset_in_page = *first & (page_size - 1);
|
||||
size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
|
||||
*first += offset_to_page;
|
||||
|
||||
// align last to the previous page
|
||||
*last = *last & ~(page_size - 1);
|
||||
|
||||
if (*last <= *first) {
|
||||
*last = *first;
|
||||
}
|
||||
}
|
||||
|
||||
// partially unmap the file in the range [first, last)
|
||||
void unmap_fragment(size_t first, size_t last);
|
||||
|
||||
#ifdef __linux__
|
||||
static int get_default_huge_page_size();
|
||||
#endif
|
||||
|
||||
~llama_mmap();
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false, bool use_thp = false);
|
||||
|
||||
void unmap_fragment(size_t first, size_t last);
|
||||
|
||||
~llama_mmap();
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false, bool use_thp = false);
|
||||
size_t size() const;
|
||||
void * addr() const;
|
||||
|
||||
void unmap_fragment(size_t first, size_t last);
|
||||
#endif
|
||||
|
||||
static const bool SUPPORTED;
|
||||
|
||||
private:
|
||||
struct impl;
|
||||
std::unique_ptr<impl> pimpl;
|
||||
};
|
||||
using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
|
||||
|
||||
// Represents some region of memory being locked using mlock or VirtualLock;
|
||||
// will automatically unlock on destruction.
|
||||
struct llama_mlock {
|
||||
void * addr = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
bool failed_already = false;
|
||||
|
||||
llama_mlock() {}
|
||||
llama_mlock(const llama_mlock &) = delete;
|
||||
|
||||
~llama_mlock() {
|
||||
if (size) {
|
||||
raw_unlock(addr, size);
|
||||
}
|
||||
}
|
||||
llama_mlock();
|
||||
~llama_mlock();
|
||||
|
||||
void init(void * ptr);
|
||||
|
||||
void grow_to(size_t target_size);
|
||||
|
||||
static size_t lock_granularity();
|
||||
static const bool SUPPORTED;
|
||||
|
||||
bool raw_lock(void * ptr, size_t len) const;
|
||||
|
||||
static void raw_unlock(void * ptr, size_t len);
|
||||
|
||||
#ifdef _POSIX_MEMLOCK_RANGE
|
||||
static constexpr bool SUPPORTED = true;
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
#endif
|
||||
private:
|
||||
struct impl;
|
||||
std::unique_ptr<impl> pimpl;
|
||||
};
|
||||
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
||||
|
||||
size_t llama_path_max();
|
||||
|
||||
@@ -781,10 +781,10 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
|
||||
mmaps_used.reserve(files.size());
|
||||
for (const auto & file : files) {
|
||||
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa(), use_thp));
|
||||
mmaps_used.emplace_back(mapping->size, 0);
|
||||
mmaps_used.emplace_back(mapping->size(), 0);
|
||||
if (mlock_mmaps) {
|
||||
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
|
||||
mlock_mmap->init(mapping->addr);
|
||||
mlock_mmap->init(mapping->addr());
|
||||
mlock_mmaps->emplace_back(std::move(mlock_mmap));
|
||||
}
|
||||
mappings.emplace_back(std::move(mapping));
|
||||
@@ -801,9 +801,9 @@ void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void *
|
||||
GGML_ASSERT(!mappings.empty());
|
||||
const auto & mapping = mappings.at(idx);
|
||||
|
||||
*first = mapping->size;
|
||||
*first = mapping->size();
|
||||
*last = 0;
|
||||
*addr = mapping->addr;
|
||||
*addr = mapping->addr();
|
||||
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
|
||||
try {
|
||||
const auto * weight = get_weight(ggml_get_name(tensor));
|
||||
@@ -828,9 +828,9 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
|
||||
if (use_mmap) {
|
||||
const auto & mapping = mappings.at(w.idx);
|
||||
if (cur->data == nullptr) {
|
||||
cur->data = (uint8_t *)mapping->addr + w.offs;
|
||||
cur->data = (uint8_t *)mapping->addr() + w.offs;
|
||||
} else {
|
||||
memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
|
||||
memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur));
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(cur->data != nullptr);
|
||||
@@ -916,7 +916,7 @@ bool llama_model_loader::load_all_data(
|
||||
if (bufs_mmap.count(weight->idx)) {
|
||||
buf_mmap = bufs_mmap.at(weight->idx);
|
||||
}
|
||||
uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
|
||||
uint8_t * data = (uint8_t *) mapping->addr() + weight->offs;
|
||||
|
||||
if (check_tensors) {
|
||||
validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
|
||||
@@ -1021,7 +1021,7 @@ bool llama_model_loader::load_all_data(
|
||||
auto & mapping = mappings.at(idx);
|
||||
mapping->unmap_fragment(0, mmap_used.first);
|
||||
if (mmap_used.second != 0) {
|
||||
mapping->unmap_fragment(mmap_used.second, mapping->size);
|
||||
mapping->unmap_fragment(mmap_used.second, mapping->size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,7 +59,7 @@ struct llama_model_loader {
|
||||
const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
|
||||
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
|
||||
|
||||
if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
|
||||
if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
|
||||
throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20823,7 +20823,7 @@ static bool llama_state_load_file_internal(struct llama_context * ctx, const cha
|
||||
|
||||
// restore the context state
|
||||
{
|
||||
const size_t n_state_size_cur = file.size - file.tell();
|
||||
const size_t n_state_size_cur = file.size() - file.tell();
|
||||
|
||||
llama_data_read_file data_ctx(&file);
|
||||
const size_t n_read = llama_state_set_data_internal(ctx, data_ctx);
|
||||
@@ -20960,7 +20960,7 @@ static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, con
|
||||
|
||||
// restore the context state
|
||||
{
|
||||
const size_t state_size = file.size - file.tell();
|
||||
const size_t state_size = file.size() - file.tell();
|
||||
llama_data_read_file data_ctx(&file);
|
||||
const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
|
||||
if (!nread) {
|
||||
|
||||
Reference in New Issue
Block a user