mirror of
https://github.com/amd/blis.git
synced 2026-04-30 12:31:11 +00:00
- Added a testsuite for gathering performance (in GFLOPs) and measuring correctness for the POWER10 GEMM reduced-precision/integer kernels. - Reworked the GENERIC_GEMM template to hardcode the cache parameters. - Removed the kernel wrapper that only allowed matrices that weren't transposed or conjugated. However, the kernels still assume the matrices are not transposed. This wrapper was removed for performance reasons. - Renamed and restructured files and functions for clarity. - Edited the POWER10 document to reflect the new changes.
177 lines
5.1 KiB
C
177 lines
5.1 KiB
C
// Templates for generating functions that check the correctness of GEMM
// kernels using the BLIS GEMM correctness method.

// Inner paste step: glues the argument token directly onto
// "correctness_kernel". The argument is NOT macro-expanded here, because
// operands of ## are pasted as written.
#define COR_KERNEL_NAME_(prefix) prefix ## correctness_kernel

// Public name builder: yields <ch>correctness_kernel. Going through the extra
// level lets an argument that is itself a macro be expanded first, before it
// reaches the ## operator in COR_KERNEL_NAME_.
#define COR_KERNEL_NAME(ch) COR_KERNEL_NAME_(ch)
// Correctness template for float types.
//
// GEN_FP_COR_KERNEL(ch, kernel, input_t, DOWN_CAST, UP_CAST) defines
//
//     void <ch>correctness_kernel(int m, int n, int k)
//
// which:
//   1. fills float matrices A (m*k), B (k*n) and C (m*n) via bli_srandv and
//      normalize_vec,
//   2. converts A and B to input_t with DOWN_CAST and back with UP_CAST, so
//      the float copies handed to the checker hold the values the kernel
//      actually receives,
//   3. runs `kernel` with alpha = beta = 1 on a copy of C,
//   4. passes the inputs, the original C and the kernel result to
//      correctness_checker.
//
// Parameters:
//   ch         - prefix pasted onto "correctness_kernel" to name the function
//   kernel     - GEMM routine under test; called as
//                kernel(transa, transb, m, n, k, &alpha, a, rsa, csa,
//                       b, rsb, csb, &beta, c, rsc, csc)
//   input_t    - element type of the A and B buffers given to the kernel
//   DOWN_CAST  - callable (float *src, input_t *dst, count)
//   UP_CAST    - callable (input_t *src, float *dst, count)
//
// If a non-empty buffer cannot be allocated the generated function calls
// abort() instead of letting the helpers write through a null pointer.
#define GEN_FP_COR_KERNEL(ch, kernel, input_t, DOWN_CAST, UP_CAST) \
void COR_KERNEL_NAME(ch) (int m, int n, int k) \
{ \
    /* presumably row/column strides (BLIS rs/cs naming); the values */ \
    /* describe contiguous row-major storage for A, B and C */ \
    int rsa = k, csa = 1, \
        rsb = n, csb = 1, \
        rsc = n, csc = 1; \
    \
    /* element counts in size_t so byte sizes are not computed in int */ \
    const size_t a_len = (size_t) m * (size_t) k; \
    const size_t b_len = (size_t) k * (size_t) n; \
    const size_t c_len = (size_t) m * (size_t) n; \
    \
    float alpha, beta; \
    \
    /* buffers that will be passed into the kernel */ \
    input_t *a = malloc (a_len * sizeof *a); \
    input_t *b = malloc (b_len * sizeof *b); \
    \
    /* std format buffers that will be used by the correctness checker */ \
    float *a_float      = malloc (a_len * sizeof *a_float); \
    float *b_float      = malloc (b_len * sizeof *b_float); \
    float *c_ans_float  = malloc (c_len * sizeof *c_ans_float); \
    float *c_orig_float = malloc (c_len * sizeof *c_orig_float); \
    \
    /* malloc(0) may legitimately return NULL, so only a NULL for a */ \
    /* non-empty request counts as an allocation failure */ \
    if ( ( a_len != 0 && ( a == NULL || a_float == NULL ) ) || \
         ( b_len != 0 && ( b == NULL || b_float == NULL ) ) || \
         ( c_len != 0 && ( c_ans_float == NULL || c_orig_float == NULL ) ) ) \
    { \
        abort(); \
    } \
    \
    /* randomize matrices with float vals */ \
    bli_srandv(m*k, a_float, 1); \
    bli_srandv(k*n, b_float, 1); \
    bli_srandv(m*n, c_orig_float, 1); \
    \
    /* normalize the matrices */ \
    normalize_vec(a_float, m*k); \
    normalize_vec(b_float, k*n); \
    normalize_vec(c_orig_float, m*n); \
    \
    /* cast the float buffers into the buffers for the kernel */ \
    DOWN_CAST (a_float, a, m*k); \
    DOWN_CAST (b_float, b, k*n); \
    \
    /* cast the kernel buffers back into the float buffers so that the */ \
    /* reference inputs match the values the kernel sees */ \
    UP_CAST (a, a_float, m*k); \
    UP_CAST (b, b_float, k*n); \
    \
    /* init alpha and beta */ \
    alpha = 1; \
    beta  = 1; \
    \
    /* run kernel on a copy of C so the original stays available */ \
    if ( c_len != 0 ) \
    { \
        memcpy(c_ans_float, c_orig_float, c_len * sizeof *c_ans_float); \
    } \
    kernel( \
        BLIS_NO_TRANSPOSE, \
        BLIS_NO_TRANSPOSE, \
        m, \
        n, \
        k, \
        &alpha, \
        a, rsa, csa, \
        b, rsb, csb, \
        &beta, \
        c_ans_float, rsc, csc \
    ); \
    \
    /* hand inputs, original C and kernel result to the checker */ \
    correctness_checker( \
        m, n, k, \
        a_float, rsa, csa, \
        b_float, rsb, csb, \
        c_orig_float, rsc, csc, \
        c_ans_float, \
        alpha, beta \
    ); \
    \
    free(a); \
    free(b); \
    free(a_float); \
    free(b_float); \
    free(c_ans_float); \
    free(c_orig_float); \
}
// Correctness template for int types.
//
// GEN_I_COR_KERNEL(ch, kernel, input_t, DOWN_CAST, UP_CAST) defines
//
//     void <ch>correctness_kernel(int m, int n, int k)
//
// which:
//   1. fills float matrices A (m*k), B (k*n) and C (m*n) via bli_srandv and
//      normalize_vec,
//   2. converts A and B to input_t with DOWN_CAST and back with UP_CAST, and
//      round-trips C through int32_t, so the float copies handed to the
//      checker hold the values the kernel actually receives,
//   3. runs `kernel` with int32_t alpha = beta = 1 on an int32_t copy of C,
//   4. converts the int32_t result to float and passes the inputs, the
//      original C and the result to correctness_checker.
//
// Parameters:
//   ch         - prefix pasted onto "correctness_kernel" to name the function
//   kernel     - GEMM routine under test; called as
//                kernel(transa, transb, m, n, k, &alpha, a, rsa, csa,
//                       b, rsb, csb, &beta, c, rsc, csc) with int32_t
//                alpha, beta and C
//   input_t    - element type of the A and B buffers given to the kernel
//   DOWN_CAST  - callable (float *src, input_t *dst, count)
//   UP_CAST    - callable (input_t *src, float *dst, count)
//
// If a non-empty buffer cannot be allocated the generated function calls
// abort() instead of letting the helpers write through a null pointer.
#define GEN_I_COR_KERNEL(ch, kernel, input_t, DOWN_CAST, UP_CAST) \
void COR_KERNEL_NAME(ch) (int m, int n, int k) \
{ \
    /* presumably row/column strides (BLIS rs/cs naming); the values */ \
    /* describe contiguous row-major storage for A, B and C */ \
    int rsa = k, csa = 1, \
        rsb = n, csb = 1, \
        rsc = n, csc = 1; \
    \
    /* element counts in size_t so byte sizes are not computed in int */ \
    const size_t a_len = (size_t) m * (size_t) k; \
    const size_t b_len = (size_t) k * (size_t) n; \
    const size_t c_len = (size_t) m * (size_t) n; \
    \
    int32_t alpha, beta; \
    \
    /* buffers that will be passed into the kernel */ \
    input_t *a      = malloc (a_len * sizeof *a); \
    input_t *b      = malloc (b_len * sizeof *b); \
    int32_t *c_ans  = malloc (c_len * sizeof *c_ans); \
    int32_t *c_orig = malloc (c_len * sizeof *c_orig); \
    \
    /* std format buffers that will be used by the correctness checker */ \
    float *a_float      = malloc (a_len * sizeof *a_float); \
    float *b_float      = malloc (b_len * sizeof *b_float); \
    float *c_ans_float  = malloc (c_len * sizeof *c_ans_float); \
    float *c_orig_float = malloc (c_len * sizeof *c_orig_float); \
    \
    /* malloc(0) may legitimately return NULL, so only a NULL for a */ \
    /* non-empty request counts as an allocation failure */ \
    if ( ( a_len != 0 && ( a == NULL || a_float == NULL ) ) || \
         ( b_len != 0 && ( b == NULL || b_float == NULL ) ) || \
         ( c_len != 0 && ( c_ans == NULL || c_orig == NULL || \
                           c_ans_float == NULL || c_orig_float == NULL ) ) ) \
    { \
        abort(); \
    } \
    \
    /* randomize matrices with float vals */ \
    bli_srandv(m*k, a_float, 1); \
    bli_srandv(k*n, b_float, 1); \
    bli_srandv(m*n, c_orig_float, 1); \
    \
    /* normalize the matrices */ \
    normalize_vec(a_float, m*k); \
    normalize_vec(b_float, k*n); \
    normalize_vec(c_orig_float, m*n); \
    \
    /* cast the float buffers into the buffers for the kernel */ \
    DOWN_CAST (a_float, a, m*k); \
    DOWN_CAST (b_float, b, k*n); \
    \
    /* round-trip C through int32_t so the float reference copy holds */ \
    /* the same integer values the kernel starts from */ \
    cast_f32_to_i32m(c_orig_float, c_orig, m*n); \
    cast_i32_to_f32m(c_orig, c_orig_float, m*n); \
    \
    /* cast the kernel buffers back into the float buffers so that the */ \
    /* reference inputs match the values the kernel sees */ \
    UP_CAST (a, a_float, m*k); \
    UP_CAST (b, b_float, k*n); \
    \
    /* init alpha and beta */ \
    alpha = 1; \
    beta  = 1; \
    \
    /* run kernel to get result in c_ans; size the copy by the buffer's */ \
    /* own element type rather than plain int */ \
    if ( c_len != 0 ) \
    { \
        memcpy(c_ans, c_orig, c_len * sizeof *c_ans); \
    } \
    kernel( \
        BLIS_NO_TRANSPOSE, \
        BLIS_NO_TRANSPOSE, \
        m, \
        n, \
        k, \
        &alpha, \
        a, rsa, csa, \
        b, rsb, csb, \
        &beta, \
        c_ans, rsc, csc \
    ); \
    \
    /* cast integer result into float buffer since float is our std */ \
    /* format for correctness checking */ \
    cast_i32_to_f32m(c_ans, c_ans_float, m*n); \
    \
    /* using the BLIS GEMM correctness check method, get the resid */ \
    correctness_checker( \
        m, n, k, \
        a_float, rsa, csa, \
        b_float, rsb, csb, \
        c_orig_float, rsc, csc, \
        c_ans_float, \
        (float) alpha, (float) beta \
    ); \
    \
    free(a); \
    free(b); \
    free(c_ans); \
    free(c_orig); \
    free(a_float); \
    free(b_float); \
    free(c_ans_float); \
    free(c_orig_float); \
}
|