// mirror of https://github.com/kvcache-ai/ktransformers.git
// synced 2026-04-20 06:18:59 +00:00
/**
 * @Description  : CPU inference backend: wraps a worker pool and a task
 *                 queue, and initializes the global ggml fp16 lookup table.
 * @Author       : chenht2022
 * @Date         : 2024-07-16 10:43:18
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-08-07 09:47:43
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_CPUINFER_H
#define CPUINFER_CPUINFER_H
#include <atomic>
#include <cinttypes>
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <utility>
#include <vector>

#ifdef KTRANSFORMERS_USE_CUDA
#include "vendors/cuda.h"
#elif defined(KTRANSFORMERS_USE_MUSA)
#include "vendors/musa.h"
#elif defined(KTRANSFORMERS_USE_ROCM)
#define __HIP_PLATFORM_AMD__
#include "vendors/hip.h"
#endif

#include "./vendors/vendor.h"
#include "llama.cpp/ggml-impl.h"
#include "task_queue.h"
#include "worker_pool.h"
class CPUInfer {
|
|
public:
|
|
CPUInfer(int thread_num) {
|
|
printf("CPUInfer[0x%lx]: Hello\n", (intptr_t)this);
|
|
backend_ = new WorkerPool(thread_num);
|
|
task_queue_ = new TaskQueue();
|
|
for (int i = 0; i < (1 << 16); ++i) {
|
|
ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i);
|
|
}
|
|
}
|
|
CPUInfer(int thread_num, int numa_id) {
|
|
printf("CPUInfer[0x%lx]: Hello\n", (intptr_t)this);
|
|
backend_ = new WorkerPool(thread_num, numa_id);
|
|
task_queue_ = new TaskQueue();
|
|
for (int i = 0; i < (1 << 16); ++i) {
|
|
ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i);
|
|
}
|
|
}
|
|
|
|
CPUInfer(WorkerPoolConfig config) {
|
|
printf("CPUInfer[0x%lx]: Hello\n", (intptr_t)this);
|
|
backend_ = new WorkerPool(config);
|
|
task_queue_ = new TaskQueue();
|
|
for (int i = 0; i < (1 << 16); ++i) {
|
|
ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i);
|
|
}
|
|
}
|
|
|
|
~CPUInfer() {
|
|
printf("CPUInfer[0x%lx]: Goodbye\n", (intptr_t)this);
|
|
delete backend_;
|
|
delete task_queue_;
|
|
}
|
|
|
|
CPUInfer(const CPUInfer&) = delete;
|
|
CPUInfer& operator=(const CPUInfer&) = delete;
|
|
CPUInfer(CPUInfer&&) = delete;
|
|
CPUInfer& operator=(CPUInfer&&) = delete;
|
|
|
|
template <typename Func, typename Obj, typename... Args>
|
|
void enqueue(Func f, Obj* obj, Args... args) {
|
|
task_queue_->enqueue([=]() { std::invoke(f, *obj, args...); });
|
|
}
|
|
|
|
void submit(std::pair<intptr_t, intptr_t> params) {
|
|
void (*func)(void*) = (void (*)(void*))params.first;
|
|
void* args = (void*)params.second;
|
|
*((CPUInfer**)args) = this;
|
|
func(args);
|
|
}
|
|
#ifndef KTRANSFORMERS_CPU_ONLY
|
|
void submit_with_cuda_stream(intptr_t user_cuda_stream, std::pair<intptr_t, intptr_t> params) {
|
|
#if defined(KTRANSFORMERS_USE_CUDA)
|
|
void (*func)(void*) = (void (*)(void*))params.first;
|
|
void* args = (void*)params.second;
|
|
*((CPUInfer**)args) = this;
|
|
cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)func, args);
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
struct SyncArgs {
|
|
CPUInfer* cpuinfer;
|
|
size_t allow_n_pending;
|
|
};
|
|
|
|
static void sync_(void* sync_args) {
|
|
SyncArgs* args = (SyncArgs*)sync_args;
|
|
args->cpuinfer->task_queue_->sync(args->allow_n_pending);
|
|
}
|
|
|
|
void sync(size_t allow_n_pending = 0) {
|
|
SyncArgs* args = new SyncArgs{this, allow_n_pending};
|
|
sync_(args);
|
|
}
|
|
#ifndef KTRANSFORMERS_CPU_ONLY
|
|
void sync_with_cuda_stream(intptr_t user_cuda_stream, size_t allow_n_pending = 0) {
|
|
#if defined(KTRANSFORMERS_USE_CUDA)
|
|
SyncArgs* args = new SyncArgs{this, allow_n_pending};
|
|
cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)&sync_, (void*)args);
|
|
#endif
|
|
}
|
|
#endif
|
|
public:
|
|
WorkerPool* backend_;
|
|
TaskQueue* task_queue_;
|
|
};
#endif  // CPUINFER_CPUINFER_H