mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-05-01 03:41:53 +00:00
cuBLAS: refactor and optimize f16 mat mul performance (#1259)
* cuBLAS: refactor, convert fp16 to fp32 on device * cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16 * fix build * cuBLAS: update block_q5_1
This commit is contained in:
11
ggml.h
11
ggml.h
@@ -197,6 +197,14 @@
|
||||
#define GGML_MAX_OPT 4
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
|
||||
#define GGML_ASSERT(x) \
|
||||
do { \
|
||||
if (!(x)) { \
|
||||
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@@ -212,6 +220,9 @@ extern "C" {
|
||||
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
||||
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
||||
|
||||
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
|
||||
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
|
||||
|
||||
struct ggml_object;
|
||||
struct ggml_context;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user