mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-01-26 17:20:01 +00:00
With this ggml_mul_mat_ext change, we get PP-512 = 209 t/s (iq1_bn) and PP-512 = 246 t/s (iq2_bn) on the M2 Max CPU. On the Ryzen-7950X we are at PP-512 = 447 t/s (iq1_bn, 32 threads) and PP-512 = 530 t/s (iq2_bn, 16 threads).
30 lines
1.1 KiB
C
30 lines
1.1 KiB
C
#pragma once
|
|
#include <stdint.h>
|
|
#include <stdbool.h>
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
/*
 * Extended matrix-multiplication entry point. Only the declaration lives in
 * this header; the implementation is provided elsewhere.
 *
 * NOTE(review): the parameter meanings below are inferred from the names
 * only -- confirm against the implementation before relying on them.
 *   task_type        - opaque task selector (semantics not visible here)
 *   Nx, Ny, ne00     - presumably the problem dimensions
 *   typeA, A, strideA - presumably type tag, data and stride of the first operand
 *   typeB, B, strideB - presumably type tag, data and stride of the second operand
 *   C, stride_C      - float output buffer and, presumably, its stride
 *   alpha, beta      - float factors; iqk_mul_mat() passes 1.f and 0.f
 *   ith, nth         - presumably the calling thread's index and the thread count
 *
 * Returns a bool; presumably false means the request was not handled
 * (TODO confirm).
 */
bool iqk_mul_mat_ext(int task_type, long Nx, long Ny, long ne00,
        int typeA, const void * A, long strideA,
        int typeB, const void * B, long strideB,
        float * C, long stride_C, float alpha, float beta, int ith, int nth);
|
|
|
|
/*
 * Convenience wrapper around iqk_mul_mat_ext(): forwards every argument
 * unchanged and supplies alpha = 1.f and beta = 0.f, then returns whatever
 * iqk_mul_mat_ext() returned.
 *
 * Declared `static inline` because the definition lives in a header: each
 * translation unit gets its own private copy, and units that include the
 * header without calling the wrapper do not get an unused-function warning.
 *
 * NOTE(review): parameter meanings are not visible in this header; see the
 * implementation of iqk_mul_mat_ext() for the actual contract.
 */
static inline bool iqk_mul_mat(int task_type, long Nx, long Ny, long ne00,
        int typeA, const void * A, long strideA,
        int typeB, const void * B, long strideB,
        float * C, long stride_C, int ith, int nth) {
    return iqk_mul_mat_ext(task_type, Nx, Ny, ne00,
                           typeA, A, strideA,
                           typeB, B, strideB,
                           C, stride_C, 1.f, 0.f, ith, nth);
}
|
|
|
|
/*
 * Matrix-multiplication entry point taking an extra row-mapping argument.
 * Only the declaration lives in this header; the implementation is provided
 * elsewhere.
 *
 * NOTE(review): the parameter meanings below are inferred from the names
 * only -- confirm against the implementation before relying on them.
 *   Nx, Ny, ne00, ne11 - presumably the problem dimensions
 *   typeA, A, strideA  - presumably type tag, data and stride of the first operand
 *   typeB, B, strideB  - presumably type tag, data and stride of the second operand
 *   C, nb1, nb2        - float output buffer and, presumably, its strides
 *   vrow_mapping       - opaque pointer; layout is not visible here
 *   ith, nth           - presumably the calling thread's index and the thread count
 *
 * Returns a bool; presumably false means the request was not handled
 * (TODO confirm).
 */
bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11,
        int typeA, const void * A, long strideA,
        int typeB, const void * B, long strideB,
        float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth);
|
|
|
|
/*
 * Soft-max entry point. Only the declaration lives in this header; the
 * implementation is provided elsewhere.
 *
 * NOTE(review): the parameter meanings below are inferred from the names
 * only -- confirm against the implementation before relying on them.
 *   nc           - presumably the number of elements to process
 *   sp, dp, wp   - read-only float input and two writable float buffers
 *   bias         - untyped (char) bias data; bias_is_f16 presumably selects
 *                  whether it is interpreted as f16
 *   scale, slope - float factors whose use is not visible here
 *
 * Returns a bool; the meaning of false is not visible here (TODO confirm).
 */
bool iqk_soft_max(int nc,
                  const float *sp,
                  float *dp,
                  float *wp,
                  const char *bias,
                  float scale,
                  float slope,
                  bool bias_is_f16);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|