mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Retired portions of bli_kernel_3m/4m_macro_defs.h.
Details: - Removed sections of bli_kernel_[4m|3m]_macro_defs.h that defined 4m/3m-specific blocksizes after realizing that this can be done in bli_gemm[4m|3m]_cntl.c, since that is (mostly) the only place they are used. - The maximum cache values for 4m/3m are still needed when computing mem pool dimensions in bli_mem_pool_macro_defs.h. As a workaround, "local" definitions in terms of the regular cache blocksizes are now in place. - Similarly, the register blocksizes for 4m/3m are still needed in bli_kernel_post_macro_defs.h. As a workaround, "local" definitions in terms of the regular register blocksizes are now in place.
This commit is contained in:
@@ -59,42 +59,51 @@ gemm_t* gemm3m_cntl;
|
||||
void bli_gemm3m_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for each dimension.
|
||||
// NOTE: the complex blocksizes for 3m are generally equal to their
|
||||
// corresponding real domain counterparts. However, we want to promote
|
||||
// similar cache footprints for the micro-panels of A and B (when
|
||||
// compared to executing in the real domain), and since the complex
|
||||
// micro-panels are three times as "fat" (due to storing real, imaginary
|
||||
// and real+imaginary parts), we reduce KC by a factor of 2 to
|
||||
// compensate. Ideally, we would reduce by a factor of 3, but that
|
||||
// could get messy vis-a-vis keeping KC a multiple of the register
|
||||
// blocksizes.
|
||||
gemm3m_mc
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0,
|
||||
0, 0,
|
||||
BLIS_DEFAULT_3M_MC_C, BLIS_MAXIMUM_3M_MC_C,
|
||||
BLIS_DEFAULT_3M_MC_Z, BLIS_MAXIMUM_3M_MC_Z );
|
||||
BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S,
|
||||
BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D );
|
||||
gemm3m_nc
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0,
|
||||
0, 0,
|
||||
BLIS_DEFAULT_3M_NC_C, BLIS_MAXIMUM_3M_NC_C,
|
||||
BLIS_DEFAULT_3M_NC_Z, BLIS_MAXIMUM_3M_NC_Z );
|
||||
BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S,
|
||||
BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D );
|
||||
gemm3m_kc
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0,
|
||||
0, 0,
|
||||
BLIS_DEFAULT_3M_KC_C, BLIS_MAXIMUM_3M_KC_C,
|
||||
BLIS_DEFAULT_3M_KC_Z, BLIS_MAXIMUM_3M_KC_Z );
|
||||
BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2,
|
||||
BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 );
|
||||
gemm3m_mr
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0,
|
||||
0, 0,
|
||||
BLIS_DEFAULT_3M_MR_C, BLIS_PACKDIM_3M_MR_C,
|
||||
BLIS_DEFAULT_3M_MR_Z, BLIS_PACKDIM_3M_MR_Z );
|
||||
BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S,
|
||||
BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D );
|
||||
gemm3m_nr
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0,
|
||||
0, 0,
|
||||
BLIS_DEFAULT_3M_NR_C, BLIS_PACKDIM_3M_NR_C,
|
||||
BLIS_DEFAULT_3M_NR_Z, BLIS_PACKDIM_3M_NR_Z );
|
||||
BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S,
|
||||
BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D );
|
||||
gemm3m_kr
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0,
|
||||
0, 0,
|
||||
BLIS_DEFAULT_3M_KR_C, BLIS_PACKDIM_3M_KR_C,
|
||||
BLIS_DEFAULT_3M_KR_Z, BLIS_PACKDIM_3M_KR_Z );
|
||||
BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S,
|
||||
BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D );
|
||||
|
||||
|
||||
// Attach the register blksz_t objects as sub-blocksizes to the cache
|
||||
|
||||
@@ -59,42 +59,48 @@ gemm_t* gemm4m_cntl;
|
||||
void bli_gemm4m_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for each dimension.
|
||||
// NOTE: the complex blocksizes for 4m are generally equal to their
|
||||
// corresponding real domain counterparts. However, we want to promote
|
||||
// similar cache footprints for the micro-panels of A and B (when
|
||||
// compared to executing in the real domain), and since the complex
|
||||
// micro-panels are twice as "fat" (due to storing real and imaginary
|
||||
// parts), we reduce KC by a factor of 2 to compensate.
|
||||
gemm4m_mc
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0,
|
||||
0, 0,
|
||||
BLIS_DEFAULT_4M_MC_C, BLIS_MAXIMUM_4M_MC_C,
|
||||
BLIS_DEFAULT_4M_MC_Z, BLIS_MAXIMUM_4M_MC_Z );
|
||||
BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S,
|
||||
BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D );
|
||||
gemm4m_nc
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0,
|
||||
0, 0,
|
||||
BLIS_DEFAULT_4M_NC_C, BLIS_MAXIMUM_4M_NC_C,
|
||||
BLIS_DEFAULT_4M_NC_Z, BLIS_MAXIMUM_4M_NC_Z );
|
||||
BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S,
|
||||
BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D );
|
||||
gemm4m_kc
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0,
|
||||
0, 0,
|
||||
BLIS_DEFAULT_4M_KC_C, BLIS_MAXIMUM_4M_KC_C,
|
||||
BLIS_DEFAULT_4M_KC_Z, BLIS_MAXIMUM_4M_KC_Z );
|
||||
BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2,
|
||||
BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 );
|
||||
gemm4m_mr
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0,
|
||||
0, 0,
|
||||
BLIS_DEFAULT_4M_MR_C, BLIS_PACKDIM_4M_MR_C,
|
||||
BLIS_DEFAULT_4M_MR_Z, BLIS_PACKDIM_4M_MR_Z );
|
||||
BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S,
|
||||
BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D );
|
||||
gemm4m_nr
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0,
|
||||
0, 0,
|
||||
BLIS_DEFAULT_4M_NR_C, BLIS_PACKDIM_4M_NR_C,
|
||||
BLIS_DEFAULT_4M_NR_Z, BLIS_PACKDIM_4M_NR_Z );
|
||||
BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S,
|
||||
BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D );
|
||||
gemm4m_kr
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0,
|
||||
0, 0,
|
||||
BLIS_DEFAULT_4M_KR_C, BLIS_PACKDIM_4M_KR_C,
|
||||
BLIS_DEFAULT_4M_KR_Z, BLIS_PACKDIM_4M_KR_Z );
|
||||
BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S,
|
||||
BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D );
|
||||
|
||||
|
||||
// Attach the register blksz_t objects as sub-blocksizes to the cache
|
||||
|
||||
@@ -188,145 +188,4 @@
|
||||
|
||||
|
||||
|
||||
// -- Define default 3m-specific blocksize macros ------------------------------
|
||||
|
||||
// Define complex 3m register blocksizes in terms of blocksizes used for
|
||||
// real kernels.
|
||||
|
||||
// 3m register blocksizes
|
||||
#define BLIS_DEFAULT_3M_MR_C BLIS_DEFAULT_MR_S
|
||||
#define BLIS_DEFAULT_3M_KR_C BLIS_DEFAULT_KR_S
|
||||
#define BLIS_DEFAULT_3M_NR_C BLIS_DEFAULT_NR_S
|
||||
|
||||
#define BLIS_DEFAULT_3M_MR_Z BLIS_DEFAULT_MR_D
|
||||
#define BLIS_DEFAULT_3M_KR_Z BLIS_DEFAULT_KR_D
|
||||
#define BLIS_DEFAULT_3M_NR_Z BLIS_DEFAULT_NR_D
|
||||
|
||||
// 3m packing register blocksize
|
||||
#define BLIS_PACKDIM_3M_MR_C BLIS_PACKDIM_MR_S
|
||||
#define BLIS_PACKDIM_3M_KR_C BLIS_PACKDIM_KR_S
|
||||
#define BLIS_PACKDIM_3M_NR_C BLIS_PACKDIM_NR_S
|
||||
|
||||
#define BLIS_PACKDIM_3M_MR_Z BLIS_PACKDIM_MR_D
|
||||
#define BLIS_PACKDIM_3M_KR_Z BLIS_PACKDIM_KR_D
|
||||
#define BLIS_PACKDIM_3M_NR_Z BLIS_PACKDIM_NR_D
|
||||
|
||||
// Define complex 3m cache blocksizes in terms of blocksizes used for
|
||||
// real operations (if they have not yet already been defined).
|
||||
|
||||
// 3m cache blocksizes
|
||||
#ifndef BLIS_DEFAULT_3M_MC_C
|
||||
#define BLIS_DEFAULT_3M_MC_C BLIS_DEFAULT_MC_S
|
||||
#endif
|
||||
#ifndef BLIS_DEFAULT_3M_KC_C
|
||||
#define BLIS_DEFAULT_3M_KC_C ((BLIS_DEFAULT_KC_S)/2)
|
||||
#endif
|
||||
#ifndef BLIS_DEFAULT_3M_NC_C
|
||||
#define BLIS_DEFAULT_3M_NC_C BLIS_DEFAULT_NC_S
|
||||
#endif
|
||||
|
||||
#ifndef BLIS_DEFAULT_3M_MC_Z
|
||||
#define BLIS_DEFAULT_3M_MC_Z BLIS_DEFAULT_MC_D
|
||||
#endif
|
||||
#ifndef BLIS_DEFAULT_3M_KC_Z
|
||||
#define BLIS_DEFAULT_3M_KC_Z ((BLIS_DEFAULT_KC_D)/2)
|
||||
#endif
|
||||
#ifndef BLIS_DEFAULT_3M_NC_Z
|
||||
#define BLIS_DEFAULT_3M_NC_Z BLIS_DEFAULT_NC_D
|
||||
#endif
|
||||
|
||||
// 3m maximum cache blocksize
|
||||
#ifndef BLIS_MAXIMUM_3M_MC_C
|
||||
#define BLIS_MAXIMUM_3M_MC_C BLIS_MAXIMUM_MC_S
|
||||
#endif
|
||||
#ifndef BLIS_MAXIMUM_3M_KC_C
|
||||
#define BLIS_MAXIMUM_3M_KC_C ((BLIS_MAXIMUM_KC_S)/2)
|
||||
#endif
|
||||
#ifndef BLIS_MAXIMUM_3M_NC_C
|
||||
#define BLIS_MAXIMUM_3M_NC_C BLIS_MAXIMUM_NC_S
|
||||
#endif
|
||||
|
||||
#ifndef BLIS_MAXIMUM_3M_MC_Z
|
||||
#define BLIS_MAXIMUM_3M_MC_Z BLIS_MAXIMUM_MC_D
|
||||
#endif
|
||||
#ifndef BLIS_MAXIMUM_3M_KC_Z
|
||||
#define BLIS_MAXIMUM_3M_KC_Z ((BLIS_MAXIMUM_KC_D)/2)
|
||||
#endif
|
||||
#ifndef BLIS_MAXIMUM_3M_NC_Z
|
||||
#define BLIS_MAXIMUM_3M_NC_Z BLIS_MAXIMUM_NC_D
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// -- Kernel blocksize checks --------------------------------------------------
|
||||
|
||||
// Verify that cache blocksizes are whole multiples of register blocksizes.
|
||||
// Specifically, verify that:
|
||||
// - MC is a whole multiple of MR *AND* NR.
|
||||
// - NC is a whole multiple of NR *AND* MR.
|
||||
// - KC is a whole multiple of KR *AND* both MR, NR.
|
||||
// These constraints are enforced because it makes it easier to handle diagonals
|
||||
// in the macro-kernel implementations.
|
||||
|
||||
//
|
||||
// MC must be a whole multiple of MR and NR.
|
||||
//
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_3M_MC_C % BLIS_DEFAULT_3M_MR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_3M_MC_Z % BLIS_DEFAULT_3M_MR_Z != 0 ) \
|
||||
)
|
||||
#error "MC (3m) must be multiple of MR for all datatypes."
|
||||
#endif
|
||||
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_3M_MC_C % BLIS_DEFAULT_3M_NR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_3M_MC_Z % BLIS_DEFAULT_3M_NR_Z != 0 ) \
|
||||
)
|
||||
#error "MC (3m) must be multiple of NR for all datatypes."
|
||||
#endif
|
||||
|
||||
//
|
||||
// NC must be a whole multiple of NR and MR.
|
||||
//
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_3M_NC_C % BLIS_DEFAULT_3M_NR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_3M_NC_Z % BLIS_DEFAULT_3M_NR_Z != 0 ) \
|
||||
)
|
||||
#error "NC (3m) must be multiple of NR for all datatypes."
|
||||
#endif
|
||||
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_3M_NC_C % BLIS_DEFAULT_3M_MR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_3M_NC_Z % BLIS_DEFAULT_3M_MR_Z != 0 ) \
|
||||
)
|
||||
#error "NC (3m) must be multiple of MR for all datatypes."
|
||||
#endif
|
||||
|
||||
//
|
||||
// KC must be a whole multiple of KR, MR, and NR.
|
||||
//
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_3M_KC_C % BLIS_DEFAULT_3M_KR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_3M_KC_Z % BLIS_DEFAULT_3M_KR_Z != 0 ) \
|
||||
)
|
||||
#error "KC (3m) must be multiple of KR for all datatypes."
|
||||
#endif
|
||||
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_3M_KC_C % BLIS_DEFAULT_3M_MR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_3M_KC_Z % BLIS_DEFAULT_3M_MR_Z != 0 ) \
|
||||
)
|
||||
#error "KC (3m) must be multiple of MR for all datatypes."
|
||||
#endif
|
||||
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_3M_KC_C % BLIS_DEFAULT_3M_NR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_3M_KC_Z % BLIS_DEFAULT_3M_NR_Z != 0 ) \
|
||||
)
|
||||
#error "KC (3m) must be multiple of NR for all datatypes."
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -188,147 +188,4 @@
|
||||
|
||||
|
||||
|
||||
// -- Define default 4m-specific blocksize macros ------------------------------
|
||||
|
||||
// Define complex 4m register blocksizes in terms of blocksizes used for
|
||||
// real kernels.
|
||||
|
||||
// 4m register blocksizes
|
||||
#define BLIS_DEFAULT_4M_MR_C BLIS_DEFAULT_MR_S
|
||||
#define BLIS_DEFAULT_4M_KR_C BLIS_DEFAULT_KR_S
|
||||
#define BLIS_DEFAULT_4M_NR_C BLIS_DEFAULT_NR_S
|
||||
|
||||
#define BLIS_DEFAULT_4M_MR_Z BLIS_DEFAULT_MR_D
|
||||
#define BLIS_DEFAULT_4M_KR_Z BLIS_DEFAULT_KR_D
|
||||
#define BLIS_DEFAULT_4M_NR_Z BLIS_DEFAULT_NR_D
|
||||
|
||||
// 4m packing register blocksizes
|
||||
#define BLIS_PACKDIM_4M_MR_C BLIS_PACKDIM_MR_S
|
||||
#define BLIS_PACKDIM_4M_KR_C BLIS_PACKDIM_KR_S
|
||||
#define BLIS_PACKDIM_4M_NR_C BLIS_PACKDIM_NR_S
|
||||
|
||||
#define BLIS_PACKDIM_4M_MR_Z BLIS_PACKDIM_MR_D
|
||||
#define BLIS_PACKDIM_4M_KR_Z BLIS_PACKDIM_KR_D
|
||||
#define BLIS_PACKDIM_4M_NR_Z BLIS_PACKDIM_NR_D
|
||||
|
||||
// Define complex 4m cache blocksizes in terms of blocksizes used for
|
||||
// real operations (if they have not yet already been defined).
|
||||
|
||||
// 4m cache blocksizes
|
||||
#ifndef BLIS_DEFAULT_4M_MC_C
|
||||
#define BLIS_DEFAULT_4M_MC_C BLIS_DEFAULT_MC_S
|
||||
#endif
|
||||
#ifndef BLIS_DEFAULT_4M_KC_C
|
||||
#define BLIS_DEFAULT_4M_KC_C ((BLIS_DEFAULT_KC_S)/2)
|
||||
#endif
|
||||
#ifndef BLIS_DEFAULT_4M_NC_C
|
||||
#define BLIS_DEFAULT_4M_NC_C BLIS_DEFAULT_NC_S
|
||||
#endif
|
||||
|
||||
#ifndef BLIS_DEFAULT_4M_MC_Z
|
||||
#define BLIS_DEFAULT_4M_MC_Z BLIS_DEFAULT_MC_D
|
||||
#endif
|
||||
#ifndef BLIS_DEFAULT_4M_KC_Z
|
||||
#define BLIS_DEFAULT_4M_KC_Z ((BLIS_DEFAULT_KC_D)/2)
|
||||
#endif
|
||||
#ifndef BLIS_DEFAULT_4M_NC_Z
|
||||
#define BLIS_DEFAULT_4M_NC_Z BLIS_DEFAULT_NC_D
|
||||
#endif
|
||||
|
||||
// 4m maximum cache blocksizes
|
||||
#ifndef BLIS_MAXIMUM_4M_MC_C
|
||||
#define BLIS_MAXIMUM_4M_MC_C BLIS_MAXIMUM_MC_S
|
||||
#endif
|
||||
#ifndef BLIS_MAXIMUM_4M_KC_C
|
||||
#define BLIS_MAXIMUM_4M_KC_C ((BLIS_MAXIMUM_KC_S)/2)
|
||||
#endif
|
||||
#ifndef BLIS_MAXIMUM_4M_NC_C
|
||||
#define BLIS_MAXIMUM_4M_NC_C BLIS_MAXIMUM_NC_S
|
||||
#endif
|
||||
|
||||
#ifndef BLIS_MAXIMUM_4M_MC_Z
|
||||
#define BLIS_MAXIMUM_4M_MC_Z BLIS_MAXIMUM_MC_D
|
||||
#endif
|
||||
#ifndef BLIS_MAXIMUM_4M_KC_Z
|
||||
#define BLIS_MAXIMUM_4M_KC_Z ((BLIS_MAXIMUM_KC_D)/2)
|
||||
#endif
|
||||
#ifndef BLIS_MAXIMUM_4M_NC_Z
|
||||
#define BLIS_MAXIMUM_4M_NC_Z BLIS_MAXIMUM_NC_D
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// -- Kernel blocksize checks --------------------------------------------------
|
||||
|
||||
// Verify that cache blocksizes are whole multiples of register blocksizes.
|
||||
// Specifically, verify that:
|
||||
// - MC is a whole multiple of MR *AND* NR.
|
||||
// - NC is a whole multiple of NR *AND* MR.
|
||||
// - KC is a whole multiple of KR *AND* both MR, NR.
|
||||
// These constraints are enforced because it makes it easier to handle diagonals
|
||||
// in the macro-kernel implementations.
|
||||
|
||||
//
|
||||
// MC must be a whole multiple of MR and NR.
|
||||
//
|
||||
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_4M_MC_C % BLIS_DEFAULT_4M_MR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_4M_MC_Z % BLIS_DEFAULT_4M_MR_Z != 0 ) \
|
||||
)
|
||||
#error "MC (4m) must be multiple of MR for all datatypes."
|
||||
#endif
|
||||
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_4M_MC_C % BLIS_DEFAULT_4M_NR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_4M_MC_Z % BLIS_DEFAULT_4M_NR_Z != 0 ) \
|
||||
)
|
||||
#error "MC (4m) must be multiple of NR for all datatypes."
|
||||
#endif
|
||||
|
||||
//
|
||||
// NC must be a whole multiple of NR and MR.
|
||||
//
|
||||
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_4M_NC_C % BLIS_DEFAULT_4M_NR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_4M_NC_Z % BLIS_DEFAULT_4M_NR_Z != 0 ) \
|
||||
)
|
||||
#error "NC (4m) must be multiple of NR for all datatypes."
|
||||
#endif
|
||||
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_4M_NC_C % BLIS_DEFAULT_4M_MR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_4M_NC_Z % BLIS_DEFAULT_4M_MR_Z != 0 ) \
|
||||
)
|
||||
#error "NC (4m) must be multiple of MR for all datatypes."
|
||||
#endif
|
||||
|
||||
//
|
||||
// KC must be a whole multiple of KR, MR, and NR.
|
||||
//
|
||||
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_4M_KC_C % BLIS_DEFAULT_4M_KR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_4M_KC_Z % BLIS_DEFAULT_4M_KR_Z != 0 ) \
|
||||
)
|
||||
#error "KC (4m) must be multiple of KR for all datatypes."
|
||||
#endif
|
||||
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_4M_KC_C % BLIS_DEFAULT_4M_MR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_4M_KC_Z % BLIS_DEFAULT_4M_MR_Z != 0 ) \
|
||||
)
|
||||
#error "KC (4m) must be multiple of MR for all datatypes."
|
||||
#endif
|
||||
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_4M_KC_C % BLIS_DEFAULT_4M_NR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_4M_KC_Z % BLIS_DEFAULT_4M_NR_Z != 0 ) \
|
||||
)
|
||||
#error "KC (4m) must be multiple of NR for all datatypes."
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -284,6 +284,11 @@
|
||||
// prefer not to assume this, therefore, we always take the larger of the
|
||||
// two values.
|
||||
|
||||
#define BLIS_DEFAULT_4M_MR_C BLIS_DEFAULT_MR_S
|
||||
#define BLIS_DEFAULT_4M_NR_C BLIS_DEFAULT_NR_S
|
||||
#define BLIS_DEFAULT_4M_MR_Z BLIS_DEFAULT_MR_D
|
||||
#define BLIS_DEFAULT_4M_NR_Z BLIS_DEFAULT_NR_D
|
||||
|
||||
//
|
||||
// Find the largest register blocksize MR.
|
||||
//
|
||||
|
||||
@@ -46,7 +46,7 @@
|
||||
// the cache blocksize value of the datatype used to size the pool (e.g.
|
||||
// double) was not updated accordingly.
|
||||
|
||||
// First we compute possibly scaling factors for each datatype. These
|
||||
// First we compute possible scaling factors for each datatype. These
|
||||
// scaling factors actually take the form of numerator and denominator
|
||||
// since we want to stay in integer arithmetic. The purpose of the scaling
|
||||
// factors is to increase the amount of space we reserve for the memory
|
||||
@@ -65,6 +65,32 @@
|
||||
// macro-kernel to reuse the existing left-side fused gemmtrsm micro-kernels.
|
||||
// We cross-multiply so that the comparison can stay in integer arithmetic.
|
||||
|
||||
|
||||
//
|
||||
// Create "local" definitions for the 4m and 3m maximum cache blocksizes
|
||||
// so that we can more easily show the computation of the pool dimensions
|
||||
// below.
|
||||
//
|
||||
|
||||
// 4m maximum cache blocksizes
|
||||
#define BLIS_MAXIMUM_4M_MC_C BLIS_MAXIMUM_MC_S
|
||||
#define BLIS_MAXIMUM_4M_KC_C ((BLIS_MAXIMUM_KC_S)/2)
|
||||
#define BLIS_MAXIMUM_4M_NC_C BLIS_MAXIMUM_NC_S
|
||||
|
||||
#define BLIS_MAXIMUM_4M_MC_Z BLIS_MAXIMUM_MC_D
|
||||
#define BLIS_MAXIMUM_4M_KC_Z ((BLIS_MAXIMUM_KC_D)/2)
|
||||
#define BLIS_MAXIMUM_4M_NC_Z BLIS_MAXIMUM_NC_D
|
||||
|
||||
// 3m maximum cache blocksizes
|
||||
#define BLIS_MAXIMUM_3M_MC_C BLIS_MAXIMUM_MC_S
|
||||
#define BLIS_MAXIMUM_3M_KC_C ((BLIS_MAXIMUM_KC_S)/2)
|
||||
#define BLIS_MAXIMUM_3M_NC_C BLIS_MAXIMUM_NC_S
|
||||
|
||||
#define BLIS_MAXIMUM_3M_MC_Z BLIS_MAXIMUM_MC_D
|
||||
#define BLIS_MAXIMUM_3M_KC_Z ((BLIS_MAXIMUM_KC_D)/2)
|
||||
#define BLIS_MAXIMUM_3M_NC_Z BLIS_MAXIMUM_NC_D
|
||||
|
||||
|
||||
//
|
||||
// Compute scaling factors for single real.
|
||||
//
|
||||
@@ -200,12 +226,6 @@
|
||||
// Now, we compute the size of each block/panel of A, B, and C for each
|
||||
// datatype.
|
||||
|
||||
// NOTE: In defining each BLIS_*_BLOCK_SIZE_? macro below, we assume the
|
||||
// "worst case" of the register blocking being unit, in which case every row
|
||||
// of A and column of B would need padding to allow for alignment of every
|
||||
// packed micro-panel. (This is the worst case since for MR,NR > 1, padding
|
||||
// is only needed for every few rows of A and columns of B.)
|
||||
|
||||
//
|
||||
// Compute memory pool block sizes for single real.
|
||||
//
|
||||
@@ -361,8 +381,7 @@
|
||||
|
||||
// -- Maximum block size search ------------------------------------------------
|
||||
|
||||
// In this section, we find the largest of each block size and save the result
|
||||
// in a new macro for later use in bli_mem.c.
|
||||
// In this section, we find the largest of each block size.
|
||||
|
||||
//
|
||||
// Find the largest block size for blocks of A.
|
||||
@@ -468,6 +487,8 @@
|
||||
|
||||
|
||||
// Define each pool's total size using the block sizes determined above.
|
||||
// These values are used in bli_mem.c to size the static memory pool
|
||||
// arrays.
|
||||
|
||||
//
|
||||
// Pool for MC x KC blocks of A.
|
||||
|
||||
Reference in New Issue
Block a user