mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Align strides of ct in macrokernels to that of c.
Details: - Previously, rs_ct and cs_ct, the strides of the temporary microtile used primarily in the macrokernels' edge case handling, were unconditionally set to 1 and MR, respectively. However, Devin Matthews noted that this ought to be changed so that the strides of ct were in agreement with the strides of C. (That is, if C was row-stored, then ct should be accessed as by rows as well.) The implicit assumption is that the strides of C have already been adjusted, via induced transposition, if the storage preference of the microkernel is at odds with the storage of C. So, if the microkernel prefers row storage, the macrokernel's interior cases would present row-stored (ideal) microkernel subproblems to the microkernel, but for edge cases, it would still see column-stored subproblems (not ideal). This commit fixes this issue. Thanks to Devin for his suggestion.
This commit is contained in:
@@ -165,12 +165,15 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = MR; \
|
||||
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
|
||||
@@ -170,12 +170,15 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = MR; \
|
||||
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
|
||||
@@ -170,12 +170,15 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = MR; \
|
||||
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
|
||||
@@ -162,12 +162,15 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = MR; \
|
||||
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
|
||||
@@ -162,12 +162,15 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = MR; \
|
||||
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
|
||||
@@ -162,12 +162,15 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = MR; \
|
||||
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
|
||||
@@ -162,12 +162,15 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = MR; \
|
||||
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
|
||||
@@ -166,12 +166,15 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = MR; \
|
||||
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
|
||||
@@ -166,12 +166,15 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = MR; \
|
||||
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
|
||||
@@ -171,12 +171,15 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = MR; \
|
||||
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
|
||||
@@ -171,12 +171,15 @@ void PASTEMAC(ch,varname) \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = MR; \
|
||||
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
|
||||
Reference in New Issue
Block a user