Retired portions of bli_kernel_3m/4m_macro_defs.h.

Details:
- Removed sections of bli_kernel_[4m|3m]_macro_defs.h that defined
  4m/3m-specific blocksizes after realizing that this can be done in
  bli_gemm[4m|3m]_cntl.c, since that is (mostly) the only place they
  are used.
- The maximum cache values for 4m/3m are still needed when computing mem
  pool dimensions in bli_mem_pool_macro_defs.h. As a workaround, "local"
  definitions in terms of the regular cache blocksizes are now in place.
- Similarly, the register blocksizes for 4m/3m are still needed in
  bli_kernel_post_macro_defs.h. As a workaround, "local" definitions in
  terms of the regular register blocksizes are now in place.
This commit is contained in:
Field G. Van Zee
2014-09-01 16:23:17 -05:00
parent af521ee6f2
commit 189def3667
6 changed files with 74 additions and 317 deletions

View File

@@ -59,42 +59,51 @@ gemm_t* gemm3m_cntl;
void bli_gemm3m_cntl_init()
{
// Create blocksize objects for each dimension.
// NOTE: the complex blocksizes for 3m are generally equal to their
// corresponding real domain counterparts. However, we want to promote
// similar cache footprints for the micro-panels of A and B (when
// compared to executing in the real domain), and since the complex
// micro-panels are three times as "fat" (due to storing real, imaginary
// and real+imaginary parts), we reduce KC by a factor of 2 to
// compensate. Ideally, we would reduce by a factor of 3, but that
// could get messy vis-a-vis keeping KC a multiple of the register
// blocksizes.
gemm3m_mc
=
bli_blksz_obj_create( 0, 0,
0, 0,
BLIS_DEFAULT_3M_MC_C, BLIS_MAXIMUM_3M_MC_C,
BLIS_DEFAULT_3M_MC_Z, BLIS_MAXIMUM_3M_MC_Z );
BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S,
BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D );
gemm3m_nc
=
bli_blksz_obj_create( 0, 0,
0, 0,
BLIS_DEFAULT_3M_NC_C, BLIS_MAXIMUM_3M_NC_C,
BLIS_DEFAULT_3M_NC_Z, BLIS_MAXIMUM_3M_NC_Z );
BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S,
BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D );
gemm3m_kc
=
bli_blksz_obj_create( 0, 0,
0, 0,
BLIS_DEFAULT_3M_KC_C, BLIS_MAXIMUM_3M_KC_C,
BLIS_DEFAULT_3M_KC_Z, BLIS_MAXIMUM_3M_KC_Z );
BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2,
BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 );
gemm3m_mr
=
bli_blksz_obj_create( 0, 0,
0, 0,
BLIS_DEFAULT_3M_MR_C, BLIS_PACKDIM_3M_MR_C,
BLIS_DEFAULT_3M_MR_Z, BLIS_PACKDIM_3M_MR_Z );
BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S,
BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D );
gemm3m_nr
=
bli_blksz_obj_create( 0, 0,
0, 0,
BLIS_DEFAULT_3M_NR_C, BLIS_PACKDIM_3M_NR_C,
BLIS_DEFAULT_3M_NR_Z, BLIS_PACKDIM_3M_NR_Z );
BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S,
BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D );
gemm3m_kr
=
bli_blksz_obj_create( 0, 0,
0, 0,
BLIS_DEFAULT_3M_KR_C, BLIS_PACKDIM_3M_KR_C,
BLIS_DEFAULT_3M_KR_Z, BLIS_PACKDIM_3M_KR_Z );
BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S,
BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D );
// Attach the register blksz_t objects as sub-blocksizes to the cache

View File

@@ -59,42 +59,48 @@ gemm_t* gemm4m_cntl;
void bli_gemm4m_cntl_init()
{
// Create blocksize objects for each dimension.
// NOTE: the complex blocksizes for 4m are generally equal to their
// corresponding real domain counterparts. However, we want to promote
// similar cache footprints for the micro-panels of A and B (when
// compared to executing in the real domain), and since the complex
// micro-panels are twice as "fat" (due to storing real and imaginary
// parts), we reduce KC by a factor of 2 to compensate.
gemm4m_mc
=
bli_blksz_obj_create( 0, 0,
0, 0,
BLIS_DEFAULT_4M_MC_C, BLIS_MAXIMUM_4M_MC_C,
BLIS_DEFAULT_4M_MC_Z, BLIS_MAXIMUM_4M_MC_Z );
BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S,
BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D );
gemm4m_nc
=
bli_blksz_obj_create( 0, 0,
0, 0,
BLIS_DEFAULT_4M_NC_C, BLIS_MAXIMUM_4M_NC_C,
BLIS_DEFAULT_4M_NC_Z, BLIS_MAXIMUM_4M_NC_Z );
BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S,
BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D );
gemm4m_kc
=
bli_blksz_obj_create( 0, 0,
0, 0,
BLIS_DEFAULT_4M_KC_C, BLIS_MAXIMUM_4M_KC_C,
BLIS_DEFAULT_4M_KC_Z, BLIS_MAXIMUM_4M_KC_Z );
BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2,
BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 );
gemm4m_mr
=
bli_blksz_obj_create( 0, 0,
0, 0,
BLIS_DEFAULT_4M_MR_C, BLIS_PACKDIM_4M_MR_C,
BLIS_DEFAULT_4M_MR_Z, BLIS_PACKDIM_4M_MR_Z );
BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S,
BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D );
gemm4m_nr
=
bli_blksz_obj_create( 0, 0,
0, 0,
BLIS_DEFAULT_4M_NR_C, BLIS_PACKDIM_4M_NR_C,
BLIS_DEFAULT_4M_NR_Z, BLIS_PACKDIM_4M_NR_Z );
BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S,
BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D );
gemm4m_kr
=
bli_blksz_obj_create( 0, 0,
0, 0,
BLIS_DEFAULT_4M_KR_C, BLIS_PACKDIM_4M_KR_C,
BLIS_DEFAULT_4M_KR_Z, BLIS_PACKDIM_4M_KR_Z );
BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S,
BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D );
// Attach the register blksz_t objects as sub-blocksizes to the cache

View File

@@ -188,145 +188,4 @@
// -- Define default 3m-specific blocksize macros ------------------------------
// Define complex 3m register blocksizes in terms of blocksizes used for
// real kernels.
// 3m register blocksizes
#define BLIS_DEFAULT_3M_MR_C BLIS_DEFAULT_MR_S
#define BLIS_DEFAULT_3M_KR_C BLIS_DEFAULT_KR_S
#define BLIS_DEFAULT_3M_NR_C BLIS_DEFAULT_NR_S
#define BLIS_DEFAULT_3M_MR_Z BLIS_DEFAULT_MR_D
#define BLIS_DEFAULT_3M_KR_Z BLIS_DEFAULT_KR_D
#define BLIS_DEFAULT_3M_NR_Z BLIS_DEFAULT_NR_D
// 3m packing register blocksize
#define BLIS_PACKDIM_3M_MR_C BLIS_PACKDIM_MR_S
#define BLIS_PACKDIM_3M_KR_C BLIS_PACKDIM_KR_S
#define BLIS_PACKDIM_3M_NR_C BLIS_PACKDIM_NR_S
#define BLIS_PACKDIM_3M_MR_Z BLIS_PACKDIM_MR_D
#define BLIS_PACKDIM_3M_KR_Z BLIS_PACKDIM_KR_D
#define BLIS_PACKDIM_3M_NR_Z BLIS_PACKDIM_NR_D
// Define complex 3m cache blocksizes in terms of blocksizes used for
// real operations (if they have not yet already been defined).
// 3m cache blocksizes
#ifndef BLIS_DEFAULT_3M_MC_C
#define BLIS_DEFAULT_3M_MC_C BLIS_DEFAULT_MC_S
#endif
#ifndef BLIS_DEFAULT_3M_KC_C
#define BLIS_DEFAULT_3M_KC_C ((BLIS_DEFAULT_KC_S)/2)
#endif
#ifndef BLIS_DEFAULT_3M_NC_C
#define BLIS_DEFAULT_3M_NC_C BLIS_DEFAULT_NC_S
#endif
#ifndef BLIS_DEFAULT_3M_MC_Z
#define BLIS_DEFAULT_3M_MC_Z BLIS_DEFAULT_MC_D
#endif
#ifndef BLIS_DEFAULT_3M_KC_Z
#define BLIS_DEFAULT_3M_KC_Z ((BLIS_DEFAULT_KC_D)/2)
#endif
#ifndef BLIS_DEFAULT_3M_NC_Z
#define BLIS_DEFAULT_3M_NC_Z BLIS_DEFAULT_NC_D
#endif
// 3m maximum cache blocksize
#ifndef BLIS_MAXIMUM_3M_MC_C
#define BLIS_MAXIMUM_3M_MC_C BLIS_MAXIMUM_MC_S
#endif
#ifndef BLIS_MAXIMUM_3M_KC_C
#define BLIS_MAXIMUM_3M_KC_C ((BLIS_MAXIMUM_KC_S)/2)
#endif
#ifndef BLIS_MAXIMUM_3M_NC_C
#define BLIS_MAXIMUM_3M_NC_C BLIS_MAXIMUM_NC_S
#endif
#ifndef BLIS_MAXIMUM_3M_MC_Z
#define BLIS_MAXIMUM_3M_MC_Z BLIS_MAXIMUM_MC_D
#endif
#ifndef BLIS_MAXIMUM_3M_KC_Z
#define BLIS_MAXIMUM_3M_KC_Z ((BLIS_MAXIMUM_KC_D)/2)
#endif
#ifndef BLIS_MAXIMUM_3M_NC_Z
#define BLIS_MAXIMUM_3M_NC_Z BLIS_MAXIMUM_NC_D
#endif
// -- Kernel blocksize checks --------------------------------------------------
// Verify that cache blocksizes are whole multiples of register blocksizes.
// Specifically, verify that:
// - MC is a whole multiple of MR *AND* NR.
// - NC is a whole multiple of NR *AND* MR.
// - KC is a whole multiple of KR *AND* both MR, NR.
// These constraints are enforced because it makes it easier to handle diagonals
// in the macro-kernel implementations.
//
// MC must be a whole multiple of MR and NR.
//
#if ( \
( BLIS_DEFAULT_3M_MC_C % BLIS_DEFAULT_3M_MR_C != 0 ) || \
( BLIS_DEFAULT_3M_MC_Z % BLIS_DEFAULT_3M_MR_Z != 0 ) \
)
#error "MC (3m) must be multiple of MR for all datatypes."
#endif
#if ( \
( BLIS_DEFAULT_3M_MC_C % BLIS_DEFAULT_3M_NR_C != 0 ) || \
( BLIS_DEFAULT_3M_MC_Z % BLIS_DEFAULT_3M_NR_Z != 0 ) \
)
#error "MC (3m) must be multiple of NR for all datatypes."
#endif
//
// NC must be a whole multiple of NR and MR.
//
#if ( \
( BLIS_DEFAULT_3M_NC_C % BLIS_DEFAULT_3M_NR_C != 0 ) || \
( BLIS_DEFAULT_3M_NC_Z % BLIS_DEFAULT_3M_NR_Z != 0 ) \
)
#error "NC (3m) must be multiple of NR for all datatypes."
#endif
#if ( \
( BLIS_DEFAULT_3M_NC_C % BLIS_DEFAULT_3M_MR_C != 0 ) || \
( BLIS_DEFAULT_3M_NC_Z % BLIS_DEFAULT_3M_MR_Z != 0 ) \
)
#error "NC (3m) must be multiple of MR for all datatypes."
#endif
//
// KC must be a whole multiple of KR, MR, and NR.
//
#if ( \
( BLIS_DEFAULT_3M_KC_C % BLIS_DEFAULT_3M_KR_C != 0 ) || \
( BLIS_DEFAULT_3M_KC_Z % BLIS_DEFAULT_3M_KR_Z != 0 ) \
)
#error "KC (3m) must be multiple of KR for all datatypes."
#endif
#if ( \
( BLIS_DEFAULT_3M_KC_C % BLIS_DEFAULT_3M_MR_C != 0 ) || \
( BLIS_DEFAULT_3M_KC_Z % BLIS_DEFAULT_3M_MR_Z != 0 ) \
)
#error "KC (3m) must be multiple of MR for all datatypes."
#endif
#if ( \
( BLIS_DEFAULT_3M_KC_C % BLIS_DEFAULT_3M_NR_C != 0 ) || \
( BLIS_DEFAULT_3M_KC_Z % BLIS_DEFAULT_3M_NR_Z != 0 ) \
)
#error "KC (3m) must be multiple of NR for all datatypes."
#endif
#endif

View File

@@ -188,147 +188,4 @@
// -- Define default 4m-specific blocksize macros ------------------------------
// Define complex 4m register blocksizes in terms of blocksizes used for
// real kernels.
// 4m register blocksizes
#define BLIS_DEFAULT_4M_MR_C BLIS_DEFAULT_MR_S
#define BLIS_DEFAULT_4M_KR_C BLIS_DEFAULT_KR_S
#define BLIS_DEFAULT_4M_NR_C BLIS_DEFAULT_NR_S
#define BLIS_DEFAULT_4M_MR_Z BLIS_DEFAULT_MR_D
#define BLIS_DEFAULT_4M_KR_Z BLIS_DEFAULT_KR_D
#define BLIS_DEFAULT_4M_NR_Z BLIS_DEFAULT_NR_D
// 4m packing register blocksizes
#define BLIS_PACKDIM_4M_MR_C BLIS_PACKDIM_MR_S
#define BLIS_PACKDIM_4M_KR_C BLIS_PACKDIM_KR_S
#define BLIS_PACKDIM_4M_NR_C BLIS_PACKDIM_NR_S
#define BLIS_PACKDIM_4M_MR_Z BLIS_PACKDIM_MR_D
#define BLIS_PACKDIM_4M_KR_Z BLIS_PACKDIM_KR_D
#define BLIS_PACKDIM_4M_NR_Z BLIS_PACKDIM_NR_D
// Define complex 4m cache blocksizes in terms of blocksizes used for
// real operations (if they have not yet already been defined).
// 4m cache blocksizes
#ifndef BLIS_DEFAULT_4M_MC_C
#define BLIS_DEFAULT_4M_MC_C BLIS_DEFAULT_MC_S
#endif
#ifndef BLIS_DEFAULT_4M_KC_C
#define BLIS_DEFAULT_4M_KC_C ((BLIS_DEFAULT_KC_S)/2)
#endif
#ifndef BLIS_DEFAULT_4M_NC_C
#define BLIS_DEFAULT_4M_NC_C BLIS_DEFAULT_NC_S
#endif
#ifndef BLIS_DEFAULT_4M_MC_Z
#define BLIS_DEFAULT_4M_MC_Z BLIS_DEFAULT_MC_D
#endif
#ifndef BLIS_DEFAULT_4M_KC_Z
#define BLIS_DEFAULT_4M_KC_Z ((BLIS_DEFAULT_KC_D)/2)
#endif
#ifndef BLIS_DEFAULT_4M_NC_Z
#define BLIS_DEFAULT_4M_NC_Z BLIS_DEFAULT_NC_D
#endif
// 4m maximum cache blocksizes
#ifndef BLIS_MAXIMUM_4M_MC_C
#define BLIS_MAXIMUM_4M_MC_C BLIS_MAXIMUM_MC_S
#endif
#ifndef BLIS_MAXIMUM_4M_KC_C
#define BLIS_MAXIMUM_4M_KC_C ((BLIS_MAXIMUM_KC_S)/2)
#endif
#ifndef BLIS_MAXIMUM_4M_NC_C
#define BLIS_MAXIMUM_4M_NC_C BLIS_MAXIMUM_NC_S
#endif
#ifndef BLIS_MAXIMUM_4M_MC_Z
#define BLIS_MAXIMUM_4M_MC_Z BLIS_MAXIMUM_MC_D
#endif
#ifndef BLIS_MAXIMUM_4M_KC_Z
#define BLIS_MAXIMUM_4M_KC_Z ((BLIS_MAXIMUM_KC_D)/2)
#endif
#ifndef BLIS_MAXIMUM_4M_NC_Z
#define BLIS_MAXIMUM_4M_NC_Z BLIS_MAXIMUM_NC_D
#endif
// -- Kernel blocksize checks --------------------------------------------------
// Verify that cache blocksizes are whole multiples of register blocksizes.
// Specifically, verify that:
// - MC is a whole multiple of MR *AND* NR.
// - NC is a whole multiple of NR *AND* MR.
// - KC is a whole multiple of KR *AND* both MR, NR.
// These constraints are enforced because it makes it easier to handle diagonals
// in the macro-kernel implementations.
//
// MC must be a whole multiple of MR and NR.
//
#if ( \
( BLIS_DEFAULT_4M_MC_C % BLIS_DEFAULT_4M_MR_C != 0 ) || \
( BLIS_DEFAULT_4M_MC_Z % BLIS_DEFAULT_4M_MR_Z != 0 ) \
)
#error "MC (4m) must be multiple of MR for all datatypes."
#endif
#if ( \
( BLIS_DEFAULT_4M_MC_C % BLIS_DEFAULT_4M_NR_C != 0 ) || \
( BLIS_DEFAULT_4M_MC_Z % BLIS_DEFAULT_4M_NR_Z != 0 ) \
)
#error "MC (4m) must be multiple of NR for all datatypes."
#endif
//
// NC must be a whole multiple of NR and MR.
//
#if ( \
( BLIS_DEFAULT_4M_NC_C % BLIS_DEFAULT_4M_NR_C != 0 ) || \
( BLIS_DEFAULT_4M_NC_Z % BLIS_DEFAULT_4M_NR_Z != 0 ) \
)
#error "NC (4m) must be multiple of NR for all datatypes."
#endif
#if ( \
( BLIS_DEFAULT_4M_NC_C % BLIS_DEFAULT_4M_MR_C != 0 ) || \
( BLIS_DEFAULT_4M_NC_Z % BLIS_DEFAULT_4M_MR_Z != 0 ) \
)
#error "NC (4m) must be multiple of MR for all datatypes."
#endif
//
// KC must be a whole multiple of KR, MR, and NR.
//
#if ( \
( BLIS_DEFAULT_4M_KC_C % BLIS_DEFAULT_4M_KR_C != 0 ) || \
( BLIS_DEFAULT_4M_KC_Z % BLIS_DEFAULT_4M_KR_Z != 0 ) \
)
#error "KC (4m) must be multiple of KR for all datatypes."
#endif
#if ( \
( BLIS_DEFAULT_4M_KC_C % BLIS_DEFAULT_4M_MR_C != 0 ) || \
( BLIS_DEFAULT_4M_KC_Z % BLIS_DEFAULT_4M_MR_Z != 0 ) \
)
#error "KC (4m) must be multiple of MR for all datatypes."
#endif
#if ( \
( BLIS_DEFAULT_4M_KC_C % BLIS_DEFAULT_4M_NR_C != 0 ) || \
( BLIS_DEFAULT_4M_KC_Z % BLIS_DEFAULT_4M_NR_Z != 0 ) \
)
#error "KC (4m) must be multiple of NR for all datatypes."
#endif
#endif

View File

@@ -284,6 +284,11 @@
// prefer not to assume this, therefore, we always take the larger of the
// two values.
#define BLIS_DEFAULT_4M_MR_C BLIS_DEFAULT_MR_S
#define BLIS_DEFAULT_4M_NR_C BLIS_DEFAULT_NR_S
#define BLIS_DEFAULT_4M_MR_Z BLIS_DEFAULT_MR_D
#define BLIS_DEFAULT_4M_NR_Z BLIS_DEFAULT_NR_D
//
// Find the largest register blocksize MR.
//

View File

@@ -46,7 +46,7 @@
// the cache blocksize value of the datatype used to size the pool (e.g.
// double) was not updated accordingly.
// First we compute possibly scaling factors for each datatype. These
// First we compute possible scaling factors for each datatype. These
// scaling factors actually take the form of numerator and denominator
// since we want to stay in integer arithmetic. The purpose of the scaling
// factors is to increase the amount of space we reserve for the memory
@@ -65,6 +65,32 @@
// macro-kernel to reuse the existing left-side fused gemmtrsm micro-kernels.
// We cross-multiply so that the comparison can stay in integer arithmetic.
//
// Create "local" definitions for the 4m and 3m maximum cache blocksizes
// so that we can more easily show the computation of the pool dimensions
// below.
//
// 4m maximum cache blocksizes
#define BLIS_MAXIMUM_4M_MC_C BLIS_MAXIMUM_MC_S
#define BLIS_MAXIMUM_4M_KC_C ((BLIS_MAXIMUM_KC_S)/2)
#define BLIS_MAXIMUM_4M_NC_C BLIS_MAXIMUM_NC_S
#define BLIS_MAXIMUM_4M_MC_Z BLIS_MAXIMUM_MC_D
#define BLIS_MAXIMUM_4M_KC_Z ((BLIS_MAXIMUM_KC_D)/2)
#define BLIS_MAXIMUM_4M_NC_Z BLIS_MAXIMUM_NC_D
// 3m maximum cache blocksizes
#define BLIS_MAXIMUM_3M_MC_C BLIS_MAXIMUM_MC_S
#define BLIS_MAXIMUM_3M_KC_C ((BLIS_MAXIMUM_KC_S)/2)
#define BLIS_MAXIMUM_3M_NC_C BLIS_MAXIMUM_NC_S
#define BLIS_MAXIMUM_3M_MC_Z BLIS_MAXIMUM_MC_D
#define BLIS_MAXIMUM_3M_KC_Z ((BLIS_MAXIMUM_KC_D)/2)
#define BLIS_MAXIMUM_3M_NC_Z BLIS_MAXIMUM_NC_D
//
// Compute scaling factors for single real.
//
@@ -200,12 +226,6 @@
// Now, we compute the size of each block/panel of A, B, and C for each
// datatype.
// NOTE: In defining each BLIS_*_BLOCK_SIZE_? macro below, we assume the
// "worst case" of the register blocking being unit, in which case every row
// of A and column of B would need padding to allow for alignment of every
// packed micro-panel. (This is the worst case since for MR,NR > 1, padding
// is only needed for every few rows of A and columns of B.)
//
// Compute memory pool block sizes for single real.
//
@@ -361,8 +381,7 @@
// -- Maximum block size search ------------------------------------------------
// In this section, we find the largest of each block size and save the result
// in a new macro for later use in bli_mem.c.
// In this section, we find the largest of each block size.
//
// Find the largest block size for blocks of A.
@@ -468,6 +487,8 @@
// Define each pool's total size using the block sizes determined above.
// These values are used in bli_mem.c to size the static memory pool
// arrays.
//
// Pool for MC x KC blocks of A.