Align strides of ct in macrokernels with those of C.

Details:
- Previously, rs_ct and cs_ct, the strides of the temporary microtile used
  primarily in the macrokernels' edge case handling, were unconditionally
  set to 1 and MR, respectively. However, Devin Matthews noted that this
  ought to be changed so that the strides of ct were in agreement with the
  strides of C. (That is, if C was row-stored, then ct should be accessed
  by rows as well.) The implicit assumption is that the strides of C
  have already been adjusted, via induced transposition, if the storage
  preference of the microkernel is at odds with the storage of C. So, if
  the microkernel prefers row storage, the macrokernel's interior cases
  would present row-stored (ideal) microkernel subproblems to the
  microkernel, but for edge cases, it would still see column-stored
  subproblems (not ideal). This commit fixes this issue. Thanks to Devin
  for his suggestion.
This commit is contained in:
Field G. Van Zee
2016-10-31 14:40:51 -05:00
parent 6303910023
commit 618f4331eb
11 changed files with 66 additions and 33 deletions

View File

@@ -165,12 +165,15 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. */ \
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = MR; \
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \

View File

@@ -170,12 +170,15 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. */ \
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = MR; \
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \

View File

@@ -170,12 +170,15 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. */ \
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = MR; \
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \

View File

@@ -162,12 +162,15 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. */ \
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = MR; \
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \

View File

@@ -162,12 +162,15 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. */ \
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = MR; \
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \

View File

@@ -162,12 +162,15 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. */ \
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = MR; \
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \

View File

@@ -162,12 +162,15 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. */ \
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = MR; \
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \

View File

@@ -166,12 +166,15 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. */ \
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = MR; \
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \

View File

@@ -166,12 +166,15 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. */ \
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = MR; \
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \

View File

@@ -171,12 +171,15 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. */ \
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = MR; \
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \

View File

@@ -171,12 +171,15 @@ void PASTEMAC(ch,varname) \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. */ \
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = MR; \
const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \