From 189def3667d9218adbeec45e2801fd074341a679 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 1 Sep 2014 16:23:17 -0500 Subject: [PATCH] Retired portions of bli_kernel_3m/4m_macro_defs.h. Details: - Removed sections of bli_kernel_[4m|3m]_macro_defs.h that defined 4m/3m-specific blocksizes after realizing that this can be done in bli_gemm[4m|3m]_cntl.c, since that is (mostly) the only place they are used. - The maximum cache values for 4m/3m are still needed when computing mem pool dimensions in bli_mem_pool_macro_defs.h. As a workaround, "local" definitions in terms of the regular cache blocksizes are now in place. - Similarly, the register blocksizes for 4m/3m are still needed in bli_kernel_post_macro_defs.h. As a workaround, "local" definitions in terms of the regular register blocksizes are now in place. --- frame/3/gemm/3m/bli_gemm3m_cntl.c | 33 +++-- frame/3/gemm/4m/bli_gemm4m_cntl.c | 30 +++-- frame/include/bli_kernel_3m_macro_defs.h | 141 -------------------- frame/include/bli_kernel_4m_macro_defs.h | 143 --------------------- frame/include/bli_kernel_post_macro_defs.h | 5 + frame/include/bli_mem_pool_macro_defs.h | 39 ++++-- 6 files changed, 74 insertions(+), 317 deletions(-) diff --git a/frame/3/gemm/3m/bli_gemm3m_cntl.c b/frame/3/gemm/3m/bli_gemm3m_cntl.c index 2e6405a54..ab750630a 100644 --- a/frame/3/gemm/3m/bli_gemm3m_cntl.c +++ b/frame/3/gemm/3m/bli_gemm3m_cntl.c @@ -59,42 +59,51 @@ gemm_t* gemm3m_cntl; void bli_gemm3m_cntl_init() { // Create blocksize objects for each dimension. + // NOTE: the complex blocksizes for 3m are generally equal to their + // corresponding real domain counterparts. However, we want to promote + // similar cache footprints for the micro-panels of A and B (when + // compared to executing in the real domain), and since the complex + // micro-panels are three times as "fat" (due to storing real, imaginary + // and real+imaginary parts), we reduce KC by a factor of 2 to + // compensate. 
Ideally, we would reduce by a factor of 3, but that + // could get messy vis-a-vis keeping KC a multiple of the register + // blocksizes. gemm3m_mc = bli_blksz_obj_create( 0, 0, 0, 0, - BLIS_DEFAULT_3M_MC_C, BLIS_MAXIMUM_3M_MC_C, - BLIS_DEFAULT_3M_MC_Z, BLIS_MAXIMUM_3M_MC_Z ); + BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, + BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm3m_nc = bli_blksz_obj_create( 0, 0, 0, 0, - BLIS_DEFAULT_3M_NC_C, BLIS_MAXIMUM_3M_NC_C, - BLIS_DEFAULT_3M_NC_Z, BLIS_MAXIMUM_3M_NC_Z ); + BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, + BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); gemm3m_kc = bli_blksz_obj_create( 0, 0, 0, 0, - BLIS_DEFAULT_3M_KC_C, BLIS_MAXIMUM_3M_KC_C, - BLIS_DEFAULT_3M_KC_Z, BLIS_MAXIMUM_3M_KC_Z ); + BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2, + BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 ); gemm3m_mr = bli_blksz_obj_create( 0, 0, 0, 0, - BLIS_DEFAULT_3M_MR_C, BLIS_PACKDIM_3M_MR_C, - BLIS_DEFAULT_3M_MR_Z, BLIS_PACKDIM_3M_MR_Z ); + BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, + BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm3m_nr = bli_blksz_obj_create( 0, 0, 0, 0, - BLIS_DEFAULT_3M_NR_C, BLIS_PACKDIM_3M_NR_C, - BLIS_DEFAULT_3M_NR_Z, BLIS_PACKDIM_3M_NR_Z ); + BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, + BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm3m_kr = bli_blksz_obj_create( 0, 0, 0, 0, - BLIS_DEFAULT_3M_KR_C, BLIS_PACKDIM_3M_KR_C, - BLIS_DEFAULT_3M_KR_Z, BLIS_PACKDIM_3M_KR_Z ); + BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, + BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as sub-blocksizes to the cache diff --git a/frame/3/gemm/4m/bli_gemm4m_cntl.c b/frame/3/gemm/4m/bli_gemm4m_cntl.c index 67706665f..54f769564 100644 --- a/frame/3/gemm/4m/bli_gemm4m_cntl.c +++ b/frame/3/gemm/4m/bli_gemm4m_cntl.c @@ -59,42 +59,48 @@ gemm_t* gemm4m_cntl; void bli_gemm4m_cntl_init() { // Create blocksize objects for each dimension. + // NOTE: the complex blocksizes for 4m are generally equal to their + // corresponding real domain counterparts. 
However, we want to promote + // similar cache footprints for the micro-panels of A and B (when + // compared to executing in the real domain), and since the complex + // micro-panels are twice as "fat" (due to storing real and imaginary + // parts), we reduce KC by a factor of 2 to compensate. gemm4m_mc = bli_blksz_obj_create( 0, 0, 0, 0, - BLIS_DEFAULT_4M_MC_C, BLIS_MAXIMUM_4M_MC_C, - BLIS_DEFAULT_4M_MC_Z, BLIS_MAXIMUM_4M_MC_Z ); + BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, + BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm4m_nc = bli_blksz_obj_create( 0, 0, 0, 0, - BLIS_DEFAULT_4M_NC_C, BLIS_MAXIMUM_4M_NC_C, - BLIS_DEFAULT_4M_NC_Z, BLIS_MAXIMUM_4M_NC_Z ); + BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, + BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); gemm4m_kc = bli_blksz_obj_create( 0, 0, 0, 0, - BLIS_DEFAULT_4M_KC_C, BLIS_MAXIMUM_4M_KC_C, - BLIS_DEFAULT_4M_KC_Z, BLIS_MAXIMUM_4M_KC_Z ); + BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2, + BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 ); gemm4m_mr = bli_blksz_obj_create( 0, 0, 0, 0, - BLIS_DEFAULT_4M_MR_C, BLIS_PACKDIM_4M_MR_C, - BLIS_DEFAULT_4M_MR_Z, BLIS_PACKDIM_4M_MR_Z ); + BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, + BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm4m_nr = bli_blksz_obj_create( 0, 0, 0, 0, - BLIS_DEFAULT_4M_NR_C, BLIS_PACKDIM_4M_NR_C, - BLIS_DEFAULT_4M_NR_Z, BLIS_PACKDIM_4M_NR_Z ); + BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, + BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm4m_kr = bli_blksz_obj_create( 0, 0, 0, 0, - BLIS_DEFAULT_4M_KR_C, BLIS_PACKDIM_4M_KR_C, - BLIS_DEFAULT_4M_KR_Z, BLIS_PACKDIM_4M_KR_Z ); + BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, + BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); // Attach the register blksz_t objects as sub-blocksizes to the cache diff --git a/frame/include/bli_kernel_3m_macro_defs.h b/frame/include/bli_kernel_3m_macro_defs.h index a9908f5a6..2b01e49e4 100644 --- a/frame/include/bli_kernel_3m_macro_defs.h +++ b/frame/include/bli_kernel_3m_macro_defs.h @@ -188,145 +188,4 @@ -// -- Define default 3m-specific blocksize 
macros ------------------------------ - -// Define complex 3m register blocksizes in terms of blocksizes used for -// real kernels. - -// 3m register blocksizes -#define BLIS_DEFAULT_3M_MR_C BLIS_DEFAULT_MR_S -#define BLIS_DEFAULT_3M_KR_C BLIS_DEFAULT_KR_S -#define BLIS_DEFAULT_3M_NR_C BLIS_DEFAULT_NR_S - -#define BLIS_DEFAULT_3M_MR_Z BLIS_DEFAULT_MR_D -#define BLIS_DEFAULT_3M_KR_Z BLIS_DEFAULT_KR_D -#define BLIS_DEFAULT_3M_NR_Z BLIS_DEFAULT_NR_D - -// 3m packing register blocksize -#define BLIS_PACKDIM_3M_MR_C BLIS_PACKDIM_MR_S -#define BLIS_PACKDIM_3M_KR_C BLIS_PACKDIM_KR_S -#define BLIS_PACKDIM_3M_NR_C BLIS_PACKDIM_NR_S - -#define BLIS_PACKDIM_3M_MR_Z BLIS_PACKDIM_MR_D -#define BLIS_PACKDIM_3M_KR_Z BLIS_PACKDIM_KR_D -#define BLIS_PACKDIM_3M_NR_Z BLIS_PACKDIM_NR_D - -// Define complex 3m cache blocksizes in terms of blocksizes used for -// real operations (if they have not yet already been defined). - -// 3m cache blocksizes -#ifndef BLIS_DEFAULT_3M_MC_C -#define BLIS_DEFAULT_3M_MC_C BLIS_DEFAULT_MC_S -#endif -#ifndef BLIS_DEFAULT_3M_KC_C -#define BLIS_DEFAULT_3M_KC_C ((BLIS_DEFAULT_KC_S)/2) -#endif -#ifndef BLIS_DEFAULT_3M_NC_C -#define BLIS_DEFAULT_3M_NC_C BLIS_DEFAULT_NC_S -#endif - -#ifndef BLIS_DEFAULT_3M_MC_Z -#define BLIS_DEFAULT_3M_MC_Z BLIS_DEFAULT_MC_D -#endif -#ifndef BLIS_DEFAULT_3M_KC_Z -#define BLIS_DEFAULT_3M_KC_Z ((BLIS_DEFAULT_KC_D)/2) -#endif -#ifndef BLIS_DEFAULT_3M_NC_Z -#define BLIS_DEFAULT_3M_NC_Z BLIS_DEFAULT_NC_D -#endif - -// 3m maximum cache blocksize -#ifndef BLIS_MAXIMUM_3M_MC_C -#define BLIS_MAXIMUM_3M_MC_C BLIS_MAXIMUM_MC_S -#endif -#ifndef BLIS_MAXIMUM_3M_KC_C -#define BLIS_MAXIMUM_3M_KC_C ((BLIS_MAXIMUM_KC_S)/2) -#endif -#ifndef BLIS_MAXIMUM_3M_NC_C -#define BLIS_MAXIMUM_3M_NC_C BLIS_MAXIMUM_NC_S -#endif - -#ifndef BLIS_MAXIMUM_3M_MC_Z -#define BLIS_MAXIMUM_3M_MC_Z BLIS_MAXIMUM_MC_D -#endif -#ifndef BLIS_MAXIMUM_3M_KC_Z -#define BLIS_MAXIMUM_3M_KC_Z ((BLIS_MAXIMUM_KC_D)/2) -#endif -#ifndef BLIS_MAXIMUM_3M_NC_Z -#define 
BLIS_MAXIMUM_3M_NC_Z BLIS_MAXIMUM_NC_D -#endif - - - -// -- Kernel blocksize checks -------------------------------------------------- - -// Verify that cache blocksizes are whole multiples of register blocksizes. -// Specifically, verify that: -// - MC is a whole multiple of MR *AND* NR. -// - NC is a whole multiple of NR *AND* MR. -// - KC is a whole multiple of KR *AND* both MR, NR. -// These constraints are enforced because it makes it easier to handle diagonals -// in the macro-kernel implementations. - -// -// MC must be a whole multiple of MR and NR. -// -#if ( \ - ( BLIS_DEFAULT_3M_MC_C % BLIS_DEFAULT_3M_MR_C != 0 ) || \ - ( BLIS_DEFAULT_3M_MC_Z % BLIS_DEFAULT_3M_MR_Z != 0 ) \ - ) - #error "MC (3m) must be multiple of MR for all datatypes." -#endif - -#if ( \ - ( BLIS_DEFAULT_3M_MC_C % BLIS_DEFAULT_3M_NR_C != 0 ) || \ - ( BLIS_DEFAULT_3M_MC_Z % BLIS_DEFAULT_3M_NR_Z != 0 ) \ - ) - #error "MC (3m) must be multiple of NR for all datatypes." -#endif - -// -// NC must be a whole multiple of NR and MR. -// -#if ( \ - ( BLIS_DEFAULT_3M_NC_C % BLIS_DEFAULT_3M_NR_C != 0 ) || \ - ( BLIS_DEFAULT_3M_NC_Z % BLIS_DEFAULT_3M_NR_Z != 0 ) \ - ) - #error "NC (3m) must be multiple of NR for all datatypes." -#endif - -#if ( \ - ( BLIS_DEFAULT_3M_NC_C % BLIS_DEFAULT_3M_MR_C != 0 ) || \ - ( BLIS_DEFAULT_3M_NC_Z % BLIS_DEFAULT_3M_MR_Z != 0 ) \ - ) - #error "NC (3m) must be multiple of MR for all datatypes." -#endif - -// -// KC must be a whole multiple of KR, MR, and NR. -// -#if ( \ - ( BLIS_DEFAULT_3M_KC_C % BLIS_DEFAULT_3M_KR_C != 0 ) || \ - ( BLIS_DEFAULT_3M_KC_Z % BLIS_DEFAULT_3M_KR_Z != 0 ) \ - ) - #error "KC (3m) must be multiple of KR for all datatypes." -#endif - -#if ( \ - ( BLIS_DEFAULT_3M_KC_C % BLIS_DEFAULT_3M_MR_C != 0 ) || \ - ( BLIS_DEFAULT_3M_KC_Z % BLIS_DEFAULT_3M_MR_Z != 0 ) \ - ) - #error "KC (3m) must be multiple of MR for all datatypes." 
-#endif - -#if ( \ - ( BLIS_DEFAULT_3M_KC_C % BLIS_DEFAULT_3M_NR_C != 0 ) || \ - ( BLIS_DEFAULT_3M_KC_Z % BLIS_DEFAULT_3M_NR_Z != 0 ) \ - ) - #error "KC (3m) must be multiple of NR for all datatypes." -#endif - - - - #endif diff --git a/frame/include/bli_kernel_4m_macro_defs.h b/frame/include/bli_kernel_4m_macro_defs.h index c0cb44740..8090c3b67 100644 --- a/frame/include/bli_kernel_4m_macro_defs.h +++ b/frame/include/bli_kernel_4m_macro_defs.h @@ -188,147 +188,4 @@ -// -- Define default 4m-specific blocksize macros ------------------------------ - -// Define complex 4m register blocksizes in terms of blocksizes used for -// real kernels. - -// 4m register blocksizes -#define BLIS_DEFAULT_4M_MR_C BLIS_DEFAULT_MR_S -#define BLIS_DEFAULT_4M_KR_C BLIS_DEFAULT_KR_S -#define BLIS_DEFAULT_4M_NR_C BLIS_DEFAULT_NR_S - -#define BLIS_DEFAULT_4M_MR_Z BLIS_DEFAULT_MR_D -#define BLIS_DEFAULT_4M_KR_Z BLIS_DEFAULT_KR_D -#define BLIS_DEFAULT_4M_NR_Z BLIS_DEFAULT_NR_D - -// 4m packing register blocksizes -#define BLIS_PACKDIM_4M_MR_C BLIS_PACKDIM_MR_S -#define BLIS_PACKDIM_4M_KR_C BLIS_PACKDIM_KR_S -#define BLIS_PACKDIM_4M_NR_C BLIS_PACKDIM_NR_S - -#define BLIS_PACKDIM_4M_MR_Z BLIS_PACKDIM_MR_D -#define BLIS_PACKDIM_4M_KR_Z BLIS_PACKDIM_KR_D -#define BLIS_PACKDIM_4M_NR_Z BLIS_PACKDIM_NR_D - -// Define complex 4m cache blocksizes in terms of blocksizes used for -// real operations (if they have not yet already been defined). 
- -// 4m cache blocksizes -#ifndef BLIS_DEFAULT_4M_MC_C -#define BLIS_DEFAULT_4M_MC_C BLIS_DEFAULT_MC_S -#endif -#ifndef BLIS_DEFAULT_4M_KC_C -#define BLIS_DEFAULT_4M_KC_C ((BLIS_DEFAULT_KC_S)/2) -#endif -#ifndef BLIS_DEFAULT_4M_NC_C -#define BLIS_DEFAULT_4M_NC_C BLIS_DEFAULT_NC_S -#endif - -#ifndef BLIS_DEFAULT_4M_MC_Z -#define BLIS_DEFAULT_4M_MC_Z BLIS_DEFAULT_MC_D -#endif -#ifndef BLIS_DEFAULT_4M_KC_Z -#define BLIS_DEFAULT_4M_KC_Z ((BLIS_DEFAULT_KC_D)/2) -#endif -#ifndef BLIS_DEFAULT_4M_NC_Z -#define BLIS_DEFAULT_4M_NC_Z BLIS_DEFAULT_NC_D -#endif - -// 4m maximum cache blocksizes -#ifndef BLIS_MAXIMUM_4M_MC_C -#define BLIS_MAXIMUM_4M_MC_C BLIS_MAXIMUM_MC_S -#endif -#ifndef BLIS_MAXIMUM_4M_KC_C -#define BLIS_MAXIMUM_4M_KC_C ((BLIS_MAXIMUM_KC_S)/2) -#endif -#ifndef BLIS_MAXIMUM_4M_NC_C -#define BLIS_MAXIMUM_4M_NC_C BLIS_MAXIMUM_NC_S -#endif - -#ifndef BLIS_MAXIMUM_4M_MC_Z -#define BLIS_MAXIMUM_4M_MC_Z BLIS_MAXIMUM_MC_D -#endif -#ifndef BLIS_MAXIMUM_4M_KC_Z -#define BLIS_MAXIMUM_4M_KC_Z ((BLIS_MAXIMUM_KC_D)/2) -#endif -#ifndef BLIS_MAXIMUM_4M_NC_Z -#define BLIS_MAXIMUM_4M_NC_Z BLIS_MAXIMUM_NC_D -#endif - - - -// -- Kernel blocksize checks -------------------------------------------------- - -// Verify that cache blocksizes are whole multiples of register blocksizes. -// Specifically, verify that: -// - MC is a whole multiple of MR *AND* NR. -// - NC is a whole multiple of NR *AND* MR. -// - KC is a whole multiple of KR *AND* both MR, NR. -// These constraints are enforced because it makes it easier to handle diagonals -// in the macro-kernel implementations. - -// -// MC must be a whole multiple of MR and NR. -// - -#if ( \ - ( BLIS_DEFAULT_4M_MC_C % BLIS_DEFAULT_4M_MR_C != 0 ) || \ - ( BLIS_DEFAULT_4M_MC_Z % BLIS_DEFAULT_4M_MR_Z != 0 ) \ - ) - #error "MC (4m) must be multiple of MR for all datatypes." 
-#endif - -#if ( \ - ( BLIS_DEFAULT_4M_MC_C % BLIS_DEFAULT_4M_NR_C != 0 ) || \ - ( BLIS_DEFAULT_4M_MC_Z % BLIS_DEFAULT_4M_NR_Z != 0 ) \ - ) - #error "MC (4m) must be multiple of NR for all datatypes." -#endif - -// -// NC must be a whole multiple of NR and MR. -// - -#if ( \ - ( BLIS_DEFAULT_4M_NC_C % BLIS_DEFAULT_4M_NR_C != 0 ) || \ - ( BLIS_DEFAULT_4M_NC_Z % BLIS_DEFAULT_4M_NR_Z != 0 ) \ - ) - #error "NC (4m) must be multiple of NR for all datatypes." -#endif - -#if ( \ - ( BLIS_DEFAULT_4M_NC_C % BLIS_DEFAULT_4M_MR_C != 0 ) || \ - ( BLIS_DEFAULT_4M_NC_Z % BLIS_DEFAULT_4M_MR_Z != 0 ) \ - ) - #error "NC (4m) must be multiple of MR for all datatypes." -#endif - -// -// KC must be a whole multiple of KR, MR, and NR. -// - -#if ( \ - ( BLIS_DEFAULT_4M_KC_C % BLIS_DEFAULT_4M_KR_C != 0 ) || \ - ( BLIS_DEFAULT_4M_KC_Z % BLIS_DEFAULT_4M_KR_Z != 0 ) \ - ) - #error "KC (4m) must be multiple of KR for all datatypes." -#endif - -#if ( \ - ( BLIS_DEFAULT_4M_KC_C % BLIS_DEFAULT_4M_MR_C != 0 ) || \ - ( BLIS_DEFAULT_4M_KC_Z % BLIS_DEFAULT_4M_MR_Z != 0 ) \ - ) - #error "KC (4m) must be multiple of MR for all datatypes." -#endif - -#if ( \ - ( BLIS_DEFAULT_4M_KC_C % BLIS_DEFAULT_4M_NR_C != 0 ) || \ - ( BLIS_DEFAULT_4M_KC_Z % BLIS_DEFAULT_4M_NR_Z != 0 ) \ - ) - #error "KC (4m) must be multiple of NR for all datatypes." -#endif - - - #endif diff --git a/frame/include/bli_kernel_post_macro_defs.h b/frame/include/bli_kernel_post_macro_defs.h index c23c47a38..f92144b6f 100644 --- a/frame/include/bli_kernel_post_macro_defs.h +++ b/frame/include/bli_kernel_post_macro_defs.h @@ -284,6 +284,11 @@ // prefer not to assume this, therefore, we always take the larger of the // two values. +#define BLIS_DEFAULT_4M_MR_C BLIS_DEFAULT_MR_S +#define BLIS_DEFAULT_4M_NR_C BLIS_DEFAULT_NR_S +#define BLIS_DEFAULT_4M_MR_Z BLIS_DEFAULT_MR_D +#define BLIS_DEFAULT_4M_NR_Z BLIS_DEFAULT_NR_D + // // Find the largest register blocksize MR. 
// diff --git a/frame/include/bli_mem_pool_macro_defs.h b/frame/include/bli_mem_pool_macro_defs.h index a520371c1..c30221c85 100644 --- a/frame/include/bli_mem_pool_macro_defs.h +++ b/frame/include/bli_mem_pool_macro_defs.h @@ -46,7 +46,7 @@ // the cache blocksize value of the datatype used to size the pool (e.g. // double) was not updated accordingly. -// First we compute possibly scaling factors for each datatype. These +// First we compute possible scaling factors for each datatype. These // scaling factors actually take the form of numerator and denominator // since we want stay in integer arithmetic. The purpose of the scaling // factors is to increase the amount of space we reserve for the memory @@ -65,6 +65,32 @@ // macro-kernel to reuse the existing left-side fused gemmtrsm micro-kernels. // We cross-multiply so that the comparison can stay in integer arithmetic. + +// +// Create "local" definitions for the 4m and 3m maximum cache blocksizes +// so that we can more easily show the computation of the pool dimensions +// below. +// + +// 4m maximum cache blocksizes +#define BLIS_MAXIMUM_4M_MC_C BLIS_MAXIMUM_MC_S +#define BLIS_MAXIMUM_4M_KC_C ((BLIS_MAXIMUM_KC_S)/2) +#define BLIS_MAXIMUM_4M_NC_C BLIS_MAXIMUM_NC_S + +#define BLIS_MAXIMUM_4M_MC_Z BLIS_MAXIMUM_MC_D +#define BLIS_MAXIMUM_4M_KC_Z ((BLIS_MAXIMUM_KC_D)/2) +#define BLIS_MAXIMUM_4M_NC_Z BLIS_MAXIMUM_NC_D + +// 3m maximum cache blocksizes +#define BLIS_MAXIMUM_3M_MC_C BLIS_MAXIMUM_MC_S +#define BLIS_MAXIMUM_3M_KC_C ((BLIS_MAXIMUM_KC_S)/2) +#define BLIS_MAXIMUM_3M_NC_C BLIS_MAXIMUM_NC_S + +#define BLIS_MAXIMUM_3M_MC_Z BLIS_MAXIMUM_MC_D +#define BLIS_MAXIMUM_3M_KC_Z ((BLIS_MAXIMUM_KC_D)/2) +#define BLIS_MAXIMUM_3M_NC_Z BLIS_MAXIMUM_NC_D + + // // Compute scaling factors for single real. // @@ -200,12 +226,6 @@ // Now, we compute the size of each block/panel of A, B, and C for each // datatype. -// NOTE: In defining each BLIS_*_BLOCK_SIZE_? 
macro below, we assume the -// "worst case" of the register blocking being unit, in which case every row -// of A and column of B would need padding to allow for alignment of every -// packed micro-panel. (This is the worst case since for MR,NR > 1, padding -// is only needed for every few rows of A and columns of B.) - // // Compute memory pool block sizes for single real. // @@ -361,8 +381,7 @@ // -- Maximum block size search ------------------------------------------------ -// In this section, we find the largest of each block size and save the result -// in a new macro for later use in bli_mem.c. +// In this section, we find the largest of each block size. // // Find the largest block size for blocks of A. @@ -468,6 +487,8 @@ // Define each pool's total size using the block sizes determined above. +// These values are used in bli_mem.c to size the static memory pool +// arrays. // // Pool for MC x KC blocks of A.