From 7cd01b71b5e757a6774625b3c9f427f5e7664a76 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 19 Jun 2015 11:31:53 -0500 Subject: [PATCH] Implemented dynamic allocation for packing buffers. Details: - Replaced the old memory allocator, which was based on statically-allocated arrays, with one based on a new internal pool_t type, which, combined with a new bli_pool_*() API, provides a new abstract data type that implements the same memory pool functionality but with blocks from the heap (i.e., malloc() or equivalent). Hiding the details of the pool in a separate API also allows for a much simpler bli_mem.c family of functions. - Added a new internal header, bli_config_macro_defs.h, which enables sane defaults for the values previously found in bli_config.h. Those values can be overridden by #defining them in bli_config.h, the same way kernel defaults can be overridden in bli_kernel.h. This file most resembles what was previously a typical configuration's bli_config.h. - Added a new configuration macro, BLIS_POOL_ADDR_ALIGN_SIZE, which defaults to BLIS_PAGE_SIZE, to specify the alignment of individual blocks in the memory pool. Also added a corresponding query routine to the bli_info API. - Deprecated (once again) the micro-panel alignment feature. Upon further reflection, it seems that the goal of more predictable L1 cache replacement behavior is outweighed by the harm caused by non-contiguous micro-panels when k % kc != 0. I honestly don't think anyone will even miss this feature. - Changed bli_ukr_get_funcs() and bli_ukr_get_ref_funcs() to call bli_cntl_init() instead of bli_init(). - Removed query functions from bli_info.c that are no longer applicable given the dynamic memory allocator. - Removed unnecessary definitions from configurations' bli_config.h files, which are now pleasantly sparse. - Fixed incorrect flop counts in addv, subv, scal2v, scal2m testsuite modules. Thanks to Devangi Parikh for pointing out these miscalculations. 
- Comment, whitespace changes. --- config/armv7a/bli_config.h | 134 ----- config/armv8a/bli_config.h | 133 ----- config/bgq/bli_config.h | 136 ----- config/bulldozer/bli_config.h | 136 +---- config/cortex-a15/bli_config.h | 133 ----- config/cortex-a9/bli_config.h | 133 ----- config/dunnington/bli_config.h | 137 +---- config/emscripten/bli_config.h | 133 ----- config/loongson3a/bli_config.h | 133 ----- config/mic/bli_config.h | 136 ----- config/mic/bli_kernel.h | 2 + config/piledriver/bli_config.h | 139 +---- config/pnacl/bli_config.h | 133 ----- config/power7/bli_config.h | 133 ----- config/reference/bli_config.h | 137 ----- config/sandybridge/bli_config.h | 137 ----- config/template/bli_config.h | 138 +---- frame/1m/packm/bli_packm_init.c | 49 -- frame/3/gemm/bli_gemm_cntl.c | 21 - frame/base/bli_info.c | 57 +- frame/base/bli_info.h | 25 +- frame/base/bli_init.c | 98 ++-- frame/base/bli_mem.c | 498 ++++++++++------ frame/base/bli_mem.c.prev | 366 ++++++++++++ frame/base/bli_mem.h | 18 +- frame/base/bli_pool.c | 344 +++++++++++ .../bli_pool_macro_defs.h => base/bli_pool.h} | 117 +++- frame/include/bli_config_macro_defs.h | 178 ++++++ frame/include/bli_kernel_macro_defs.h | 35 -- frame/include/bli_macro_defs.h | 6 +- frame/include/bli_mem_macro_defs.h | 40 +- frame/include/bli_mem_pool_macro_defs.h | 535 ------------------ frame/include/bli_type_defs.h | 54 +- frame/include/blis.h | 6 +- frame/ind/query/bli_bsv_query.c | 10 +- frame/ind/query/bli_ind_query.c | 4 +- frame/ind/query/bli_ukr_query.c | 16 +- testsuite/src/test_addv.c | 2 +- testsuite/src/test_libblis.c | 54 +- testsuite/src/test_scal2m.c | 2 +- testsuite/src/test_scal2v.c | 2 +- testsuite/src/test_subv.c | 2 +- 42 files changed, 1495 insertions(+), 3207 deletions(-) create mode 100644 frame/base/bli_mem.c.prev create mode 100644 frame/base/bli_pool.c rename frame/{include/bli_pool_macro_defs.h => base/bli_pool.h} (54%) create mode 100644 frame/include/bli_config_macro_defs.h delete mode 100644 
frame/include/bli_mem_pool_macro_defs.h diff --git a/config/armv7a/bli_config.h b/config/armv7a/bli_config.h index a757069e1..21041fc2a 100644 --- a/config/armv7a/bli_config.h +++ b/config/armv7a/bli_config.h @@ -36,142 +36,8 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. #define BLIS_INT_TYPE_SIZE 32 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 1 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 32 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. -#define BLIS_SIMD_ALIGN_SIZE 32 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. #define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - - #endif diff --git a/config/armv8a/bli_config.h b/config/armv8a/bli_config.h index 72cc930e4..127a2bca7 100644 --- a/config/armv8a/bli_config.h +++ b/config/armv8a/bli_config.h @@ -36,142 +36,9 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. #define BLIS_INT_TYPE_SIZE 64 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 1 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. #define BLIS_SIMD_ALIGN_SIZE 16 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. #define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - - #endif diff --git a/config/bgq/bli_config.h b/config/bgq/bli_config.h index 09396b6a0..9810b5c11 100644 --- a/config/bgq/bli_config.h +++ b/config/bgq/bli_config.h @@ -38,144 +38,8 @@ #undef restrict - -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. -#define BLIS_INT_TYPE_SIZE 64 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 64 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS 1 -#define BLIS_NUM_KC_X_NC_BLOCKS 1 -#define BLIS_NUM_MC_X_NC_BLOCKS 1 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. -#define BLIS_SIMD_ALIGN_SIZE 32 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE 32 - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE 64 - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE 64 - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE -#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 32 - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. #define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 -// Fortran-77 name-mangling macros. -// Underscore is left out to work on BGQ systems -#define PASTEF770(name) name //## _ -#define PASTEF77(ch1,name) ch1 ## name //## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name //## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - #endif diff --git a/config/bulldozer/bli_config.h b/config/bulldozer/bli_config.h index 4b220da1e..1f99e7e53 100644 --- a/config/bulldozer/bli_config.h +++ b/config/bulldozer/bli_config.h @@ -36,141 +36,7 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. -#define BLIS_INT_TYPE_SIZE 64 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 4 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. -#define BLIS_SIMD_ALIGN_SIZE 16 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. -#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64 - -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - +#define BLIS_SIMD_ALIGN_SIZE 16 diff --git a/config/cortex-a15/bli_config.h b/config/cortex-a15/bli_config.h index a05f8d082..753712540 100644 --- a/config/cortex-a15/bli_config.h +++ b/config/cortex-a15/bli_config.h @@ -36,142 +36,9 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. #define BLIS_INT_TYPE_SIZE 32 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 1 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. #define BLIS_SIMD_ALIGN_SIZE 16 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. #define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - - #endif diff --git a/config/cortex-a9/bli_config.h b/config/cortex-a9/bli_config.h index a05f8d082..753712540 100644 --- a/config/cortex-a9/bli_config.h +++ b/config/cortex-a9/bli_config.h @@ -36,142 +36,9 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. #define BLIS_INT_TYPE_SIZE 32 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 1 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. #define BLIS_SIMD_ALIGN_SIZE 16 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. #define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - - #endif diff --git a/config/dunnington/bli_config.h b/config/dunnington/bli_config.h index 13cb2c1f6..64392de99 100644 --- a/config/dunnington/bli_config.h +++ b/config/dunnington/bli_config.h @@ -36,142 +36,7 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. -#define BLIS_INT_TYPE_SIZE 64 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 1 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS 1 -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. -#define BLIS_SIMD_ALIGN_SIZE 16 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE 16 - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE 16 - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. -#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64 - -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - +#define BLIS_SIMD_ALIGN_SIZE 16 #endif diff --git a/config/emscripten/bli_config.h b/config/emscripten/bli_config.h index a05f8d082..753712540 100644 --- a/config/emscripten/bli_config.h +++ b/config/emscripten/bli_config.h @@ -36,142 +36,9 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. #define BLIS_INT_TYPE_SIZE 32 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 1 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. #define BLIS_SIMD_ALIGN_SIZE 16 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. #define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - - #endif diff --git a/config/loongson3a/bli_config.h b/config/loongson3a/bli_config.h index 42ae3a4bf..753712540 100644 --- a/config/loongson3a/bli_config.h +++ b/config/loongson3a/bli_config.h @@ -36,142 +36,9 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. #define BLIS_INT_TYPE_SIZE 32 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 8 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS 1 -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. #define BLIS_SIMD_ALIGN_SIZE 16 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE 16 - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE 16 - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE 16 - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. #define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - - #endif diff --git a/config/mic/bli_config.h b/config/mic/bli_config.h index 8d11e5da8..a119a2dde 100644 --- a/config/mic/bli_config.h +++ b/config/mic/bli_config.h @@ -39,146 +39,10 @@ #define BLIS_TREE_BARRIER #define BLIS_TREE_BARRIER_ARITY 4 -#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS -#define BLIS_ENABLE_MULTITHREADING #define BLIS_ENABLE_OPENMP -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. -#define BLIS_INT_TYPE_SIZE 64 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 60 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS 1 -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 256 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. #define BLIS_SIMD_ALIGN_SIZE 32 -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. -#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64 - -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - #endif diff --git a/config/mic/bli_kernel.h b/config/mic/bli_kernel.h index 8e8679122..880e97d35 100644 --- a/config/mic/bli_kernel.h +++ b/config/mic/bli_kernel.h @@ -151,6 +151,8 @@ // -- gemm -- +#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS + #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8 #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_30x16 diff --git a/config/piledriver/bli_config.h b/config/piledriver/bli_config.h index 6bd2262b5..dce91516d 100644 --- a/config/piledriver/bli_config.h +++ b/config/piledriver/bli_config.h @@ -36,148 +36,11 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. -#define BLIS_INT_TYPE_SIZE 64 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 4 - -// Enable multithreading via POSIX threads. 
//#define BLIS_ENABLE_PTHREADS -// Enable multithreading via OpenMP. #define BLIS_ENABLE_OPENMP - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. -#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. -#define BLIS_SIMD_ALIGN_SIZE 16 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. 
-#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? -//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. -#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64 - -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. 
Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). -//#define BLIS_ENABLE_CBLAS - - +#define BLIS_SIMD_ALIGN_SIZE 16 #endif diff --git a/config/pnacl/bli_config.h b/config/pnacl/bli_config.h index a05f8d082..753712540 100644 --- a/config/pnacl/bli_config.h +++ b/config/pnacl/bli_config.h @@ -36,142 +36,9 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. #define BLIS_INT_TYPE_SIZE 32 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. 
-#define BLIS_MAX_NUM_THREADS 1 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. -#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. #define BLIS_SIMD_ALIGN_SIZE 16 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. 
- -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? -//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. #define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - - #endif diff --git a/config/power7/bli_config.h b/config/power7/bli_config.h index 5a3ce1e1b..e1725b9b0 100644 --- a/config/power7/bli_config.h +++ b/config/power7/bli_config.h @@ -36,139 +36,6 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. -#define BLIS_INT_TYPE_SIZE 64 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 24 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS 1 -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. -#define BLIS_SIMD_ALIGN_SIZE 16 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE 16 - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE 16 - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. -#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64 - -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - diff --git a/config/reference/bli_config.h b/config/reference/bli_config.h index 65463146c..5195e61c5 100644 --- a/config/reference/bli_config.h +++ b/config/reference/bli_config.h @@ -36,143 +36,6 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. -#define BLIS_INT_TYPE_SIZE 32 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 1 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. -#define BLIS_SIMD_ALIGN_SIZE 16 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. -#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 - -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - - #endif diff --git a/config/sandybridge/bli_config.h b/config/sandybridge/bli_config.h index 1d2a4d6bb..5f66f6dae 100644 --- a/config/sandybridge/bli_config.h +++ b/config/sandybridge/bli_config.h @@ -35,42 +35,6 @@ #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H - -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. -#define BLIS_INT_TYPE_SIZE 64 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 16 - // Enable multithreading via POSIX threads. //#define BLIS_ENABLE_PTHREADS @@ -79,105 +43,4 @@ -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS 2 -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. -#define BLIS_SIMD_ALIGN_SIZE 32 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. -#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64 - -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - - - #endif diff --git a/config/template/bli_config.h b/config/template/bli_config.h index fc65e5135..cc8e64b5a 100644 --- a/config/template/bli_config.h +++ b/config/template/bli_config.h @@ -36,142 +36,6 @@ #define BLIS_CONFIG_H -// -- OPERATING SYSTEM --------------------------------------------------------- - - - -// -- INTEGER PROPERTIES ------------------------------------------------------- - -// The bit size of the integer type used to track values such as dimensions, -// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed -// integers while 64 results in 64-bit integers. Any other value results in use -// of the C99 type "long int". Note that this ONLY affects integers used -// internally within BLIS as well as those exposed in the native BLAS-like BLIS -// interface. -#define BLIS_INT_TYPE_SIZE 64 - - - -// -- FLOATING-POINT PROPERTIES ------------------------------------------------ - -// Define the number of floating-point types supported, and the size of the -// largest type. -#define BLIS_NUM_FP_TYPES 4 -#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) - -// Enable use of built-in C99 "float complex" and "double complex" types and -// associated overloaded operations and functions? Disabling results in -// scomplex and dcomplex being defined in terms of simple structs. -//#define BLIS_ENABLE_C99_COMPLEX - - - -// -- MULTITHREADING ----------------------------------------------------------- - -// The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 1 - - - -// -- MEMORY ALLOCATION -------------------------------------------------------- - -// -- Contiguous (static) memory allocator -- - -// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the -// contiguous memory pools. 
-#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_MC_X_NC_BLOCKS 0 - -// The maximum preload byte offset is used to pad the end of the contiguous -// memory pools so that the micro-kernel, when computing with the end of the -// last block, can exceed the bounds of the usable portion of the memory -// region without causing a segmentation fault. -#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 - -// -- Memory alignment -- - -// It is sometimes useful to define the various memory alignments in terms -// of some other characteristics of the system, such as the cache line size -// and the page size. -#define BLIS_CACHE_LINE_SIZE 64 -#define BLIS_PAGE_SIZE 4096 - -// Alignment size needed by the instruction set for aligned SIMD/vector -// instructions. -#define BLIS_SIMD_ALIGN_SIZE 16 - -// Alignment size used to align local stack buffers within macro-kernel -// functions. -#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when allocating memory dynamically from the operating -// system (eg: posix_memalign()). To disable heap alignment and just use -// malloc() instead, set this to 1. -#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - -// Alignment size used when sizing leading dimensions of dynamically -// allocated memory. -#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - -// Alignment size used when allocating entire blocks of contiguous memory -// from the contiguous memory allocator. -#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE - - - -// -- MIXED DATATYPE SUPPORT --------------------------------------------------- - -// Basic (homogeneous) datatype support always enabled. - -// Enable mixed domain operations? -//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT - -// Enable extra mixed precision operations? 
-//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT - - - -// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- - -// Stay initialized after auto-initialization, unless and until the user -// explicitly calls bli_finalize(). -#define BLIS_ENABLE_STAY_AUTO_INITIALIZED - - - -// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- - -// Enable the BLAS compatibility layer? -#define BLIS_ENABLE_BLAS2BLIS - -// The bit size of the integer type used to track values such as dimensions and -// leading dimensions (ie: column strides) within the BLAS compatibility layer. -// A value of 32 results in the compatibility layer using 32-bit signed integers -// while 64 results in 64-bit integers. Any other value results in use of the -// C99 type "long int". Note that this ONLY affects integers used within the -// BLAS compatibility layer. -#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64 - -// Fortran-77 name-mangling macros. -#define PASTEF770(name) name ## _ -#define PASTEF77(ch1,name) ch1 ## name ## _ -#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ -#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ - - - -// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ - -// Enable the CBLAS compatibility layer? -// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer -// regardless of whether or not it was explicitly enabled above. Furthermore, -// the CBLAS compatibility layer will use the integer type size definition -// specified above when defining the size of its own integers (regardless of -// whether the BLAS layer was enabled directly or indirectly). 
-//#define BLIS_ENABLE_CBLAS - - - #endif + diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index af8a3c32f..caa4e4839 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -164,9 +164,6 @@ void bli_packm_init( obj_t* a, } -extern blksz_t* gemm_upanel_a_align; -extern blksz_t* gemm_upanel_b_align; - void bli_packm_init_pack( invdiag_t invert_diag, pack_t schema, packord_t pack_ord_if_up, @@ -327,7 +324,6 @@ void bli_packm_init_pack( invdiag_t invert_diag, { dim_t m_panel; dim_t ps_p, ps_p_orig; - dim_t upanel_a_align; // The panel dimension (for each datatype) should be equal to the // register blocksize in the m dimension. @@ -361,9 +357,6 @@ void bli_packm_init_pack( invdiag_t invert_diag, // Preserve this early panel stride value for use later, if needed. ps_p_orig = ps_p; - // Query the micro-panel alignment for A. - upanel_a_align = bli_blksz_get_def( dt, gemm_upanel_a_align ); - // Here, we adjust the panel stride, if necessary. Remember: ps_p is // always interpreted as being in units of the datatype of the object // which is not necessarily how the micro-panels will be stored. For @@ -374,26 +367,12 @@ void bli_packm_init_pack( invdiag_t invert_diag, if ( bli_is_3mi_packed( schema ) ) { ps_p = ( ps_p * 3 ) / 2; - - // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_a_align ); } else if ( bli_is_3ms_packed( schema ) || bli_is_ro_packed( schema ) || bli_is_io_packed( schema ) || bli_is_rpi_packed( schema ) ) { - // Acquire the element size of the the real projection of the - // current complex datatype. - siz_t elem_size_p_real = elem_size_p / 2; - - // Acquire the micro-panel alignment for the real projection of - // the current complex datatype. - upanel_a_align = bli_blksz_get_def( dt_real, gemm_upanel_a_align ); - - // Align the panel stride according to the micro-panel alignment. 
- ps_p = bli_align_dim_to_size( ps_p, elem_size_p_real, upanel_a_align ); - // The division by 2 below assumes that ps_p is an even number. // However, it is possible that, at this point, ps_p is an odd. // If it is indeed odd, we nudge it higher. @@ -408,11 +387,6 @@ void bli_packm_init_pack( invdiag_t invert_diag, // actual stored element, we divide the panel_stride by 2. ps_p = ps_p / 2; } - else - { - // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_a_align ); - } // Set the imaginary stride (in units of fundamental elements) for // 3m and 4m (separated or interleaved). We use ps_p_orig since @@ -442,7 +416,6 @@ void bli_packm_init_pack( invdiag_t invert_diag, { dim_t n_panel; dim_t ps_p, ps_p_orig; - dim_t upanel_b_align; // The panel dimension (for each datatype) should be equal to the // register blocksize in the n dimension. @@ -476,9 +449,6 @@ void bli_packm_init_pack( invdiag_t invert_diag, // Preserve this early panel stride value for use later, if needed. ps_p_orig = ps_p; - // Query the micro-panel alignment for B. - upanel_b_align = bli_blksz_get_def( dt, gemm_upanel_b_align ); - // Here, we adjust the panel stride, if necessary. Remember: ps_p is // always interpreted as being in units of the datatype of the object // which is not necessarily how the micro-panels will be stored. For @@ -489,26 +459,12 @@ void bli_packm_init_pack( invdiag_t invert_diag, if ( bli_is_3mi_packed( schema ) ) { ps_p = ( ps_p * 3 ) / 2; - - // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_b_align ); } else if ( bli_is_3ms_packed( schema ) || bli_is_ro_packed( schema ) || bli_is_io_packed( schema ) || bli_is_rpi_packed( schema ) ) { - // Acquire the element size of the the real projection of the - // current complex datatype. 
- siz_t elem_size_p_real = elem_size_p / 2; - - // Acquire the micro-panel alignment for the real projection of - // the current complex datatype. - upanel_b_align = bli_blksz_get_def( dt_real, gemm_upanel_b_align ); - - // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p_real, upanel_b_align ); - // The division by 2 below assumes that ps_p is an even number. // However, it is possible that, at this point, ps_p is an odd. // If it is indeed odd, we nudge it higher. @@ -523,11 +479,6 @@ void bli_packm_init_pack( invdiag_t invert_diag, // actual stored element, we divide the panel_stride by 2. ps_p = ps_p / 2; } - else - { - // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_b_align ); - } // Set the imaginary stride (in units of fundamental elements) for // 3m and 4m (separated or interleaved). We use ps_p_orig since diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 12e28f451..bf3e60b61 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -43,9 +43,6 @@ blksz_t* gemm_mr; blksz_t* gemm_nr; blksz_t* gemm_kr; -blksz_t* gemm_upanel_a_align; -blksz_t* gemm_upanel_b_align; - func_t* gemm_ukrs; func_t* gemm_ref_ukrs; @@ -100,21 +97,6 @@ void bli_gemm_cntl_init() BLIS_DEFAULT_KR_Z, BLIS_PACKDIM_KR_Z ); - // Create objects for micro-panel alignment (in bytes). - gemm_upanel_a_align - = - bli_blksz_obj_create( BLIS_UPANEL_A_ALIGN_SIZE_S, 0, - BLIS_UPANEL_A_ALIGN_SIZE_D, 0, - BLIS_UPANEL_A_ALIGN_SIZE_C, 0, - BLIS_UPANEL_A_ALIGN_SIZE_Z, 0 ); - gemm_upanel_b_align - = - bli_blksz_obj_create( BLIS_UPANEL_B_ALIGN_SIZE_S, 0, - BLIS_UPANEL_B_ALIGN_SIZE_D, 0, - BLIS_UPANEL_B_ALIGN_SIZE_C, 0, - BLIS_UPANEL_B_ALIGN_SIZE_Z, 0 ); - - // Attach the register blksz_t objects as blocksize multiples to the cache // blksz_t objects. 
bli_blksz_obj_attach_mult_to( gemm_mr, gemm_mc ); @@ -249,9 +231,6 @@ void bli_gemm_cntl_finalize() bli_blksz_obj_free( gemm_nr ); bli_blksz_obj_free( gemm_kr ); - bli_blksz_obj_free( gemm_upanel_a_align ); - bli_blksz_obj_free( gemm_upanel_b_align ); - bli_func_obj_free( gemm_ukrs ); bli_func_obj_free( gemm_ref_ukrs ); diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index 84b863948..941a92c3b 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -57,16 +57,12 @@ char* bli_info_get_int_type_size_str( void ) { return bli_int_type_size_s gint_t bli_info_get_int_type_size( void ) { return BLIS_INT_TYPE_SIZE; } gint_t bli_info_get_num_fp_types( void ) { return BLIS_NUM_FP_TYPES; } gint_t bli_info_get_max_type_size( void ) { return BLIS_MAX_TYPE_SIZE; } -gint_t bli_info_get_max_num_threads( void ) { return BLIS_MAX_NUM_THREADS; } -gint_t bli_info_get_num_mc_x_kc_blocks( void ) { return BLIS_NUM_MC_X_KC_BLOCKS; } -gint_t bli_info_get_num_kc_x_nc_blocks( void ) { return BLIS_NUM_KC_X_NC_BLOCKS; } -gint_t bli_info_get_num_mc_x_nc_blocks( void ) { return BLIS_NUM_MC_X_NC_BLOCKS; } -gint_t bli_info_get_max_preload_byte_offset( void ) { return BLIS_MAX_PRELOAD_BYTE_OFFSET; } gint_t bli_info_get_simd_align_size( void ) { return BLIS_SIMD_ALIGN_SIZE; } +gint_t bli_info_get_page_size( void ) { return BLIS_PAGE_SIZE; } gint_t bli_info_get_stack_buf_align_size( void ) { return BLIS_STACK_BUF_ALIGN_SIZE; } gint_t bli_info_get_heap_addr_align_size( void ) { return BLIS_HEAP_ADDR_ALIGN_SIZE; } gint_t bli_info_get_heap_stride_align_size( void ) { return BLIS_HEAP_STRIDE_ALIGN_SIZE; } -gint_t bli_info_get_contig_addr_align_size( void ) { return BLIS_CONTIG_ADDR_ALIGN_SIZE; } +gint_t bli_info_get_pool_addr_align_size( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE; } gint_t bli_info_get_enable_stay_auto_init( void ) { #ifdef BLIS_ENABLE_STAY_AUTO_INITIALIZED @@ -83,6 +79,14 @@ gint_t bli_info_get_enable_blas2blis( void ) return 0; #endif } +gint_t 
bli_info_get_enable_cblas( void ) +{ +#ifdef BLIS_ENABLE_CBLAS + return 1; +#else + return 0; +#endif +} gint_t bli_info_get_blas2blis_int_type_size( void ) { return BLIS_BLAS2BLIS_INT_TYPE_SIZE; } @@ -112,41 +116,6 @@ gint_t bli_info_get_default_kr( opid_t oper, num_t dt ) { return bli_bsv_get_ava gint_t bli_info_get_packdim_mr( opid_t oper, num_t dt ) { return bli_bsv_get_avail_blksz_max_dt( BLIS_MR, oper, dt ); } gint_t bli_info_get_packdim_nr( opid_t oper, num_t dt ) { return bli_bsv_get_avail_blksz_max_dt( BLIS_NR, oper, dt ); } -// -- Micro-panel alignment -- - -extern blksz_t* gemm_upanel_a_align; -extern blksz_t* gemm_upanel_b_align; - -// Micro-panel alignment of A - -gint_t bli_info_get_upanel_a_align_size( num_t dt ) -{ - if ( bli_is_float ( dt ) ) return bli_info_get_upanel_a_align_size_s(); - else if ( bli_is_double ( dt ) ) return bli_info_get_upanel_a_align_size_d(); - else if ( bli_is_scomplex( dt ) ) return bli_info_get_upanel_a_align_size_c(); - else if ( bli_is_dcomplex( dt ) ) return bli_info_get_upanel_a_align_size_z(); - else return 0; -} -gint_t bli_info_get_upanel_a_align_size_s( void ) { bli_init(); return bli_blksz_get_def( BLIS_FLOAT, gemm_upanel_a_align ); } -gint_t bli_info_get_upanel_a_align_size_d( void ) { bli_init(); return bli_blksz_get_def( BLIS_DOUBLE, gemm_upanel_a_align ); } -gint_t bli_info_get_upanel_a_align_size_c( void ) { bli_init(); return bli_blksz_get_def( BLIS_SCOMPLEX, gemm_upanel_a_align ); } -gint_t bli_info_get_upanel_a_align_size_z( void ) { bli_init(); return bli_blksz_get_def( BLIS_DCOMPLEX, gemm_upanel_a_align ); } - -// Micro-panel alignment of B - -gint_t bli_info_get_upanel_b_align_size( num_t dt ) -{ - if ( bli_is_float ( dt ) ) return bli_info_get_upanel_b_align_size_s(); - else if ( bli_is_double ( dt ) ) return bli_info_get_upanel_b_align_size_d(); - else if ( bli_is_scomplex( dt ) ) return bli_info_get_upanel_b_align_size_c(); - else if ( bli_is_dcomplex( dt ) ) return 
bli_info_get_upanel_b_align_size_z(); - else return 0; -} -gint_t bli_info_get_upanel_b_align_size_s( void ) { bli_init(); return bli_blksz_get_def( BLIS_FLOAT, gemm_upanel_b_align ); } -gint_t bli_info_get_upanel_b_align_size_d( void ) { bli_init(); return bli_blksz_get_def( BLIS_DOUBLE, gemm_upanel_b_align ); } -gint_t bli_info_get_upanel_b_align_size_c( void ) { bli_init(); return bli_blksz_get_def( BLIS_SCOMPLEX, gemm_upanel_b_align ); } -gint_t bli_info_get_upanel_b_align_size_z( void ) { bli_init(); return bli_blksz_get_def( BLIS_DCOMPLEX, gemm_upanel_b_align ); } - // -- Level-2 cache blocksizes -- @@ -268,9 +237,9 @@ char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ) // -- bli_mem_pool_macro_defs.h ------------------------------------------------ -gint_t bli_info_get_mk_pool_size( void ) { return BLIS_MK_POOL_SIZE; } -gint_t bli_info_get_kn_pool_size( void ) { return BLIS_KN_POOL_SIZE; } -gint_t bli_info_get_mn_pool_size( void ) { return BLIS_MN_POOL_SIZE; } +gint_t bli_info_get_mk_pool_size( void ) { return bli_mem_pool_size( BLIS_BUFFER_FOR_A_BLOCK ); } +gint_t bli_info_get_kn_pool_size( void ) { return bli_mem_pool_size( BLIS_BUFFER_FOR_B_PANEL ); } +gint_t bli_info_get_mn_pool_size( void ) { return bli_mem_pool_size( BLIS_BUFFER_FOR_C_PANEL ); } diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index cf8437c41..7d6d80d0a 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -44,18 +44,15 @@ char* bli_info_get_int_type_size_str( void ); gint_t bli_info_get_int_type_size( void ); gint_t bli_info_get_num_fp_types( void ); gint_t bli_info_get_max_type_size( void ); -gint_t bli_info_get_max_num_threads( void ); -gint_t bli_info_get_num_mc_x_kc_blocks( void ); -gint_t bli_info_get_num_kc_x_nc_blocks( void ); -gint_t bli_info_get_num_mc_x_nc_blocks( void ); -gint_t bli_info_get_max_preload_byte_offset( void ); gint_t bli_info_get_simd_align_size( void ); +gint_t bli_info_get_page_size( void ); gint_t 
bli_info_get_stack_buf_align_size( void ); gint_t bli_info_get_heap_addr_align_size( void ); gint_t bli_info_get_heap_stride_align_size( void ); -gint_t bli_info_get_contig_addr_align_size( void ); +gint_t bli_info_get_pool_addr_align_size( void ); gint_t bli_info_get_enable_stay_auto_init( void ); gint_t bli_info_get_enable_blas2blis( void ); +gint_t bli_info_get_enable_cblas( void ); gint_t bli_info_get_blas2blis_int_type_size( void ); @@ -84,22 +81,6 @@ gint_t bli_info_get_default_nr( opid_t oper, num_t dt ); gint_t bli_info_get_packdim_mr( opid_t oper, num_t dt ); gint_t bli_info_get_packdim_nr( opid_t oper, num_t dt ); -// -- Micro-panel alignment for A -- - -gint_t bli_info_get_upanel_a_align_size( num_t dt ); -gint_t bli_info_get_upanel_a_align_size_s( void ); -gint_t bli_info_get_upanel_a_align_size_d( void ); -gint_t bli_info_get_upanel_a_align_size_c( void ); -gint_t bli_info_get_upanel_a_align_size_z( void ); - -// -- Micro-panel alignment for B -- - -gint_t bli_info_get_upanel_b_align_size( num_t dt ); -gint_t bli_info_get_upanel_b_align_size_s( void ); -gint_t bli_info_get_upanel_b_align_size_d( void ); -gint_t bli_info_get_upanel_b_align_size_c( void ); -gint_t bli_info_get_upanel_b_align_size_z( void ); - // -- Level-2 cache blocksizes -- diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index 7a0c0f39b..6e793fa40 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -63,41 +63,44 @@ err_t bli_init( void ) // reasons), the conditional test below MUST be within the critical // section to prevent a race condition of the type described above. - // BEGIN CRITICAL SECTION #ifdef BLIS_ENABLE_OPENMP _Pragma( "omp critical (init)" ) #endif #ifdef BLIS_ENABLE_PTHREADS pthread_mutex_lock( &initialize_mutex ); #endif + + // BEGIN CRITICAL SECTION { - // Proceed with initialization only if BLIS is presently uninitialized. 
- // Since we bli_init() and bli_finalize() use the same named critical - // section, we can be sure that no other thread is either (a) updating - // bli_is_init, or (b) testing bli_is_init within the critical - // section (for the purposes of deciding whether to perform the - // necessary initialization subtasks). - if ( bli_is_init == FALSE ) - { - // Initialize various sub-APIs. - bli_const_init(); - bli_cntl_init(); - bli_error_init(); - bli_mem_init(); - bli_ind_init(); - bli_thread_init(); + // Proceed with initialization only if BLIS is presently uninitialized. + // Since we bli_init() and bli_finalize() use the same named critical + // section, we can be sure that no other thread is either (a) updating + // bli_is_init, or (b) testing bli_is_init within the critical section + // (for the purposes of deciding whether to perform the necessary + // initialization subtasks). + if ( bli_is_init == FALSE ) + { + // Initialize various sub-APIs. + bli_const_init(); + bli_cntl_init(); + bli_error_init(); + bli_mem_init(); + bli_ind_init(); + bli_thread_init(); - // After initialization is complete, mark BLIS as initialized. - bli_is_init = TRUE; + // After initialization is complete, mark BLIS as initialized. + bli_is_init = TRUE; - // Only the thread that actually performs the initialization will - // return "success". - r_val = BLIS_SUCCESS; + //bli_mem_init(); + + // Only the thread that actually performs the initialization will + // return "success". + r_val = BLIS_SUCCESS; + } } - // END CRITICAL SECTION - } + #ifdef BLIS_ENABLE_PTHREADS pthread_mutex_unlock( &initialize_mutex ); #endif @@ -127,41 +130,42 @@ err_t bli_finalize( void ) // reasons), the conditional test below MUST be within the critical // section to prevent a race condition of the type described above. 
- // BEGIN CRITICAL SECTION #ifdef BLIS_ENABLE_OPENMP _Pragma( "omp critical (init)" ) #endif #ifdef BLIS_ENABLE_PTHREADS pthread_mutex_lock( &initialize_mutex ); #endif + + // BEGIN CRITICAL SECTION { - // Proceed with finalization only if BLIS is presently initialized. - // Since we bli_init() and bli_finalize() use the same named critical - // section, we can be sure that no other thread is either (a) updating - // bli_is_init, or (b) testing bli_is_init within the critical - // section (for the purposes of deciding whether to perform the - // necessary finalization subtasks). - if ( bli_is_init == TRUE ) - { - // Finalize various sub-APIs. - bli_const_finalize(); - bli_cntl_finalize(); - bli_error_finalize(); - bli_mem_finalize(); - bli_ind_finalize(); - bli_thread_finalize(); + // Proceed with finalization only if BLIS is presently initialized. + // Since we bli_init() and bli_finalize() use the same named critical + // section, we can be sure that no other thread is either (a) updating + // bli_is_init, or (b) testing bli_is_init within the critical section + // (for the purposes of deciding whether to perform the necessary + // finalization subtasks). + if ( bli_is_init == TRUE ) + { + // Finalize various sub-APIs. + bli_const_finalize(); + bli_cntl_finalize(); + bli_error_finalize(); + bli_mem_finalize(); + bli_ind_finalize(); + bli_thread_finalize(); - // After finalization is complete, mark BLIS as uninitialized. - bli_is_init = FALSE; + // After finalization is complete, mark BLIS as uninitialized. + bli_is_init = FALSE; - // Only the thread that actually performs the finalization will - // return "success". - r_val = BLIS_SUCCESS; + // Only the thread that actually performs the finalization will + // return "success". 
+ r_val = BLIS_SUCCESS; + } } - // END CRITICAL SECTION - } + #ifdef BLIS_ENABLE_PTHREADS pthread_mutex_unlock( &initialize_mutex ); #endif @@ -178,6 +182,8 @@ bool_t bli_is_initialized( void ) return bli_is_init; } +// ----------------------------------------------------------------------------- + void bli_init_auto( err_t* init_result ) { *init_result = bli_init(); diff --git a/frame/base/bli_mem.c b/frame/base/bli_mem.c index 6f13d3c1a..81a7a2c3b 100644 --- a/frame/base/bli_mem.c +++ b/frame/base/bli_mem.c @@ -34,101 +34,71 @@ #include "blis.h" -static bool_t bli_mem_is_init = FALSE; - #ifdef BLIS_ENABLE_PTHREADS pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; #endif // Declare one memory pool structure for each block size/shape we want to // be able to allocate. - static pool_t pools[3]; -// Physically contiguous memory for each pool. -// -// Generally speaking, the pool sizes are computed in a sub-header of blis.h -// as follows: -// -// BLIS_MK_POOL_SIZE = BLIS_MAXIMUM_MC_? * BLIS_MAXIMUM_KC_? * BLIS_SIZEOF_? -// -// where "?" is the datatype that results in the largest pool size. The -// constants BLIS_KN_POOL_SIZE and BLIS_MN_POOL_SIZE are computed in a -// similar manner. All constants are computed with appropriate "padding" -// to ensure enough space given the alignments required by bli_config.h. 
-// - -static void* pool_mk_blk_ptrs[ BLIS_NUM_MC_X_KC_BLOCKS ]; -static char pool_mk_mem[ BLIS_MK_POOL_SIZE ]; - -static void* pool_kn_blk_ptrs[ BLIS_NUM_KC_X_NC_BLOCKS ]; -static char pool_kn_mem[ BLIS_KN_POOL_SIZE ]; - -static void* pool_mn_blk_ptrs[ BLIS_NUM_MC_X_NC_BLOCKS ]; -static char pool_mn_mem[ BLIS_MN_POOL_SIZE ]; - - - +// ----------------------------------------------------------------------------- void bli_mem_acquire_m( siz_t req_size, packbuf_t buf_type, mem_t* mem ) { - siz_t block_size; - dim_t pool_index; pool_t* pool; - void** block_ptrs; - void* block; - gint_t i; + pblk_t* pblk; + dim_t pi; + siz_t block_size; + // Make sure the API is initialized. + bli_mem_init(); if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { // For general-use buffer requests, such as those used by level-2 - // operations, using bli_malloc() is sufficient, since using - // physically contiguous memory is not as important there. - block = bli_malloc( req_size ); + // operations, using bli_malloc() is sufficient. + void* buf_sys = bli_malloc( req_size ); // Initialize the mem_t object with: // - the address of the memory block, // - the buffer type (a packbuf_t value), and // - the size of the requested region. // NOTE: We do not initialize the pool field since this block did not - // come from a contiguous memory pool. - bli_mem_set_buffer( block, mem ); + // come from a memory pool. + bli_mem_set_buffer( buf_sys, mem ); + bli_mem_set_buf_sys( buf_sys, mem ); bli_mem_set_buf_type( buf_type, mem ); bli_mem_set_size( req_size, mem ); } else { // This branch handles cases where the memory block needs to come - // from one of the contiguous memory pools. + // from an internal memory pool, in which blocks are allocated once + // and then recycled. // Map the requested packed buffer type to a zero-based index, which // we then use to select the corresponding memory pool. 
- pool_index = bli_packbuf_index( buf_type ); - pool = &pools[ pool_index ]; + pi = bli_packbuf_index( buf_type ); + pool = &pools[ pi ]; // Unconditionally perform error checking on the memory pool. { err_t e_val; // Make sure that the requested matrix size fits inside of a block - // of the corresponding pool. + // of the corresponding pool. If it does not, the pool was somehow + // initialized improperly. e_val = bli_check_requested_block_size_for_pool( req_size, pool ); bli_check_error_code( e_val ); - - // Make sure that the pool contains at least one block to check out - // to the thread. - e_val = bli_check_if_exhausted_pool( pool ); - bli_check_error_code( e_val ); } - // Access the block pointer array from the memory pool data structure. - block_ptrs = bli_pool_block_ptrs( pool ); - + // Extract the address of the pblk_t struct within the mem_t. + pblk = bli_mem_pblk( mem ); #ifdef BLIS_ENABLE_OPENMP _Pragma( "omp critical (mem)" ) @@ -139,20 +109,13 @@ void bli_mem_acquire_m( siz_t req_size, // BEGIN CRITICAL SECTION { - // Query the index of the contiguous memory block that resides at the - // "top" of the pool. - i = bli_pool_top_index( pool ); - - // Extract the address of the top block from the block pointer array. - block = block_ptrs[i]; - - // Clear the entry from the block pointer array. (This is actually not - // necessary.) - //block_ptrs[i] = NULL; - - // Decrement the top of the memory pool. - bli_pool_dec_top_index( pool ); - + // Checkout a block from the pool. If the pool is exhausted, + // either because it is still empty or because all blocks have + // been checked out already, additional blocks will be allocated + // automatically, as-needed. Note that the addresses are stored + // directly into the mem_t struct since pblk is the address of + // the struct's pblk_t field. 
+ bli_pool_checkout_block( pblk, pool ); } // END CRITICAL SECTION @@ -162,16 +125,17 @@ void bli_mem_acquire_m( siz_t req_size, #endif // Query the size of the blocks in the pool so we can store it in the - // mem_t object. + // mem_t object. At this point, it is guaranteed to be at least as + // large as req_size. block_size = bli_pool_block_size( pool ); // Initialize the mem_t object with: - // - the address of the memory block, // - the buffer type (a packbuf_t value), // - the address of the memory pool to which it belongs, and // - the size of the contiguous memory block (NOT the size of the // requested region). - bli_mem_set_buffer( block, mem ); + // The actual addresses (system and aligned) are already stored in + // the mem_t struct's pblk_t field bli_mem_set_buf_type( buf_type, mem ); bli_mem_set_pool( pool, mem ); bli_mem_set_size( block_size, mem ); @@ -183,33 +147,30 @@ void bli_mem_release( mem_t* mem ) { packbuf_t buf_type; pool_t* pool; - void** block_ptrs; - void* block; - gint_t i; + pblk_t* pblk; - // Extract the address of the memory block we are trying to - // release. - block = bli_mem_buffer( mem ); + // Make sure the API is initialized. + bli_mem_init(); // Extract the buffer type so we know what kind of memory was allocated. buf_type = bli_mem_buf_type( mem ); if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { + void* buf_sys = bli_mem_buf_sys( mem ); + // For general-use buffers, we allocate with bli_malloc(), and so // here we need to call bli_free(). - bli_free( block ); + bli_free( buf_sys ); } else { - // This branch handles cases where the memory block came from one - // of the contiguous memory pools. - - // Extract the pool from which the block was allocated. + // Extract the address of the pool from which the memory was + // allocated. pool = bli_mem_pool( mem ); - // Extract the block pointer array associated with the pool. 
- block_ptrs = bli_pool_block_ptrs( pool ); + // Extract the address of the pblk_t struct within the mem_t struct. + pblk = bli_mem_pblk( mem ); #ifdef BLIS_ENABLE_OPENMP _Pragma( "omp critical (mem)" ) @@ -221,14 +182,8 @@ void bli_mem_release( mem_t* mem ) // BEGIN CRITICAL SECTION { - // Increment the top of the memory pool. - bli_pool_inc_top_index( pool ); - - // Query the newly incremented top index. - i = bli_pool_top_index( pool ); - - // Place the address of the block back onto the top of the memory pool. - block_ptrs[i] = block; + // Check the block back into the pool. + bli_pool_checkin_block( pblk, pool ); } // END CRITICAL SECTION @@ -238,16 +193,13 @@ void bli_mem_release( mem_t* mem ) #endif } - - // Clear the mem_t object so that it appears unallocated. We clear: - // - the buffer field, - // - the pool field, and - // - the size field. + // Clear the mem_t object so that it appears unallocated. This clears: + // - the pblk_t struct's fields (ie: the buffer addresses) + // - the pool field + // - the size field // NOTE: We do not clear the buf_type field since there is no // "uninitialized" value for packbuf_t. - bli_mem_set_buffer( NULL, mem ); - bli_mem_set_pool( NULL, mem ); - bli_mem_set_size( 0, mem ); + bli_mem_clear( mem ); } @@ -260,20 +212,44 @@ void bli_mem_acquire_v( siz_t req_size, } +siz_t bli_mem_pool_size( packbuf_t buf_type ) +{ + siz_t r_val; + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + // We don't (yet) track the amount of general-purpose + // memory that is currently allocated. + r_val = 0; + } + else + { + dim_t index; + pool_t* pool; + + // Acquire the pointer to the pool corresponding to the buf_type + // provided. + index = bli_packbuf_index( buf_type ); + pool = &(pools[index]); + + // Compute the pool "size" as the product of the block size + // and the number of blocks in the pool. 
+ r_val = bli_pool_block_size( pool ) * + bli_pool_num_blocks( pool ); + } + + return r_val; +} + +// ----------------------------------------------------------------------------- + +static bool_t bli_mem_is_init = FALSE; void bli_mem_init( void ) { - // If the API is already initialized, return early. - if ( bli_mem_is_initialized() ) return; - - dim_t index_a; - dim_t index_b; - dim_t index_c; - - // Map each of the packbuf_t values to an index starting at zero. - index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); - index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); - index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); + // If the initialization flag is TRUE, we know the API is already + // initialized, so we can return early. + if ( bli_mem_is_init == TRUE ) return; #ifdef BLIS_ENABLE_OPENMP _Pragma( "omp critical (mem)" ) @@ -284,91 +260,32 @@ void bli_mem_init( void ) // BEGIN CRITICAL SECTION { + // Here, we test the initialization flag again. NOTE: THIS IS NOT + // REDUNDANT. This additional test is needed so that other threads + // that may be waiting to acquire the lock do not perform any + // initialization actions once they are finally allowed into this + // critical section. + if ( bli_mem_is_init == FALSE ) + { + // Initialize the memory pools. + bli_mem_init_pools(); - // Initialize contiguous memory pool for MC x KC blocks. - bli_mem_init_pool( pool_mk_mem, - BLIS_MK_BLOCK_SIZE, - BLIS_NUM_MC_X_KC_BLOCKS, - pool_mk_blk_ptrs, - &pools[ index_a ] ); - - // Initialize contiguous memory pool for KC x NC blocks. - bli_mem_init_pool( pool_kn_mem, - BLIS_KN_BLOCK_SIZE, - BLIS_NUM_KC_X_NC_BLOCKS, - pool_kn_blk_ptrs, - &pools[ index_b ] ); - - // Initialize contiguous memory pool for MC x NC blocks. - bli_mem_init_pool( pool_mn_mem, - BLIS_MN_BLOCK_SIZE, - BLIS_NUM_MC_X_NC_BLOCKS, - pool_mn_blk_ptrs, - &pools[ index_c ] ); - + // After initialization, mark the API as initialized. 
+ bli_mem_is_init = TRUE; + } } // END CRITICAL SECTION #ifdef BLIS_ENABLE_PTHREADS pthread_mutex_unlock( &mem_manager_mutex ); #endif - - // Mark API as initialized. - bli_mem_is_init = TRUE; } - -void bli_mem_init_pool( char* pool_mem, - siz_t block_size, - dim_t num_blocks, - void** block_ptrs, - pool_t* pool ) -{ - const siz_t align_size = BLIS_CONTIG_ADDR_ALIGN_SIZE; - dim_t i; - - // If the pool starting address is not already aligned, advance it - // accordingly. - if ( bli_is_unaligned_to( ( uintptr_t )pool_mem, ( uintptr_t )align_size ) ) - { - // Notice that this works even if the alignment is not a power of two. - pool_mem += ( ( uintptr_t )align_size - - ( ( uintptr_t )pool_mem % align_size ) ); - } - - // Step through the memory pool, beginning with the aligned address - // determined above, assigning pointers to the beginning of each block_size - // bytes to the ith element of the block_ptrs array. - for ( i = 0; i < num_blocks; ++i ) - { - // Save the address of pool, which is guaranteed to be aligned. - block_ptrs[i] = pool_mem; - - // Advance pool by one block. - pool_mem += block_size; - - // Advance pool a bit further if needed in order to get to the - // beginning of an alignment boundary. - if ( bli_is_unaligned_to( ( uintptr_t )pool_mem, ( uintptr_t )align_size ) ) - { - pool_mem += ( ( uintptr_t )align_size - - ( ( uintptr_t )pool_mem % align_size ) ); - } - } - - // Now that we have initialized the array of pointers to the individual - // blocks in the pool, we initialize a pool_t data structure so that we - // can easily manage this pool. - bli_pool_init( num_blocks, - block_size, - block_ptrs, - pool ); -} - - - void bli_mem_finalize( void ) { + // If the initialization flag is FALSE, we know the API is already + // uninitialized, so we can return early. 
+ if ( bli_mem_is_init == FALSE ) return; #ifdef BLIS_ENABLE_OPENMP _Pragma( "omp critical (mem)" ) @@ -379,20 +296,25 @@ void bli_mem_finalize( void ) // BEGIN CRITICAL SECTION { - // Do nothing. + // Here, we test the initialization flag again. NOTE: THIS IS NOT + // REDUNDANT. This additional test is needed so that other threads + // that may be waiting to acquire the lock do not perform any + // finalization actions once they are finally allowed into this + // critical section. + if ( bli_mem_is_init == TRUE ) + { + // Finalize the memory pools. + bli_mem_finalize_pools(); + + // After finalization, mark the API as uninitialized. + bli_mem_is_init = FALSE; + } } // END CRITICAL SECTION #ifdef BLIS_ENABLE_PTHREADS pthread_mutex_unlock( &mem_manager_mutex ); #endif - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_destroy( &mem_manager_mutex ); -#endif - - // Mark API as uninitialized. - bli_mem_is_init = FALSE; } bool_t bli_mem_is_initialized( void ) @@ -400,3 +322,203 @@ bool_t bli_mem_is_initialized( void ) return bli_mem_is_init; } +// ----------------------------------------------------------------------------- + +void bli_mem_init_pools( void ) +{ + // Map each of the packbuf_t values to an index starting at zero. + dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); + dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); + dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); + + dim_t num_blocks_a = 0; + dim_t num_blocks_b = 0; + dim_t num_blocks_c = 0; + + siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE; + + siz_t block_size_a = 0; + siz_t block_size_b = 0; + siz_t block_size_c = 0; + + ind_t im; + num_t dt; + + // Compute pool block sizes for datatype and each implemented + // method and find the maximum size for each pool. This is done + // so that new pools do not need to be allocated if the user + // switches datatypes or methods later on. 
+ for ( im = 0; im < BLIS_NUM_IND_METHODS; ++im ) + { + for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) + { + // Avoid considering induced methods for real datatypes. + if ( bli_is_complex( dt ) || im == BLIS_NAT ) + { + siz_t bs_a, bs_b, bs_c; +//printf( "bli_mem_init_pools: considering dt %lu im %lu\n", dt, im ); + bli_mem_compute_pool_block_sizes_dt( dt, im, + &bs_a, + &bs_b, + &bs_c ); + +//printf( "bli_mem_init_pools: bs a b c = %lu %lu %lu\n", bs_a, bs_b, bs_c ); + block_size_a = bli_max( bs_a, block_size_a ); + block_size_b = bli_max( bs_b, block_size_b ); + block_size_c = bli_max( bs_c, block_size_c ); + } + } + } + + // Initialize the memory pools for A, B, and C. + bli_pool_init( num_blocks_a, block_size_a, align_size, &pools[ index_a ] ); + bli_pool_init( num_blocks_b, block_size_b, align_size, &pools[ index_b ] ); + bli_pool_init( num_blocks_c, block_size_c, align_size, &pools[ index_c ] ); +} + +void bli_mem_finalize_pools( void ) +{ + // Map each of the packbuf_t values to an index starting at zero. + dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); + dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); + dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); + + // Finalize the memory pools for A, B, and C. 
+ bli_pool_finalize( &pools[ index_a ] ); + bli_pool_finalize( &pools[ index_b ] ); + bli_pool_finalize( &pools[ index_c ] ); +} + +// ----------------------------------------------------------------------------- + +void bli_mem_compute_pool_block_sizes_dt( num_t dt, + ind_t method, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c ) +{ + siz_t size_dt = bli_datatype_size( dt ); + + blksz_t* mr; + blksz_t* nr; + + blksz_t* mc; + blksz_t* kc; + blksz_t* nc; + + dim_t mr_dt; + dim_t nr_dt; + dim_t max_mnr_dt; + + dim_t mc_max_dt; + dim_t kc_max_dt; + dim_t nc_max_dt; + + dim_t packmr_dt, packnr_dt; + dim_t max_packmnr_dt; + + dim_t scale_num_dt; + dim_t scale_den_dt; + + dim_t pool_mc_dt, left_mc_dt; + dim_t pool_nc_dt, left_nc_dt; + dim_t pool_kc_dt; + + // + // Find the larger of the two register blocksizes. + // + + // Query the mr and nr blksz_t objects for the given method of + // execution. + mr = bli_bsv_get_blksz( BLIS_MR, method ); + nr = bli_bsv_get_blksz( BLIS_NR, method ); + + // Extract the mr and nr values specific to the current datatype. + mr_dt = bli_blksz_get_def( dt, mr ); + nr_dt = bli_blksz_get_def( dt, nr ); + + // Find the maximum of mr and nr. + max_mnr_dt = bli_max( mr_dt, nr_dt ); + + // + // Define local maximum cache blocksizes. + // + + // Query the mc, kc, and nc blksz_t objects for native execution. + mc = bli_bsv_get_blksz( BLIS_MC, method ); + kc = bli_bsv_get_blksz( BLIS_KC, method ); + nc = bli_bsv_get_blksz( BLIS_NC, method ); + + // Extract the maximum mc, kc, and nc values specific to the current + // datatype. + mc_max_dt = bli_blksz_get_max( dt, mc ); + kc_max_dt = bli_blksz_get_max( dt, kc ); + nc_max_dt = bli_blksz_get_max( dt, nc ); + + // Add max(mr,nr) to kc to make room for the nudging of kc at + // runtime to be a multiple of mr or nr for triangular operations + // trmm, trmm3, and trsm. + kc_max_dt += max_mnr_dt; + + // + // Compute scaling factors. 
+ // + + // Compute integer scaling factors (numerator and denominator) used + // to account for situations when the packing register blocksizes are + // larger than the regular register blocksizes. + + // In order to compute the scaling factors, we first have to determine + // whether ( packmr / mr ) is greater than ( packnr / nr ). This is + // needed ONLY because the amount of space allocated for a block of A + // and a panel of B needs to be such that MR and NR can be swapped (ie: + // A is packed with NR and B is packed with MR). This transformation is + // needed for right-side trsm when inducing an algorithm that (a) has + // favorable access patterns for column-stored C and (b) allows the + // macro-kernel to reuse the existing left-side fused gemmtrsm micro- + // kernels. We avoid integer division by cross-multiplying: + // + // ( packmr / mr ) >= ( packnr / nr ) + // ( packmr / mr ) * nr >= packnr + // packmr * nr >= packnr * mr + // + // So, if packmr * nr >= packnr * mr, then we will use packmr and mr as + // our scaling factors. Otherwise, we'll use packnr and nr. + + packmr_dt = bli_blksz_get_max( dt, mr ); + packnr_dt = bli_blksz_get_max( dt, nr ); + + if ( packmr_dt * nr_dt >= + packnr_dt * mr_dt ) { scale_num_dt = packmr_dt; + scale_den_dt = mr_dt; } + else { scale_num_dt = packnr_dt; + scale_den_dt = nr_dt; } + + // + // Compute pool block dimensions. + // + + pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt; + left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt; + + pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt; + left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt; + + pool_kc_dt = ( kc_max_dt ); + + if ( left_mc_dt > 0 ) pool_mc_dt += 1; + if ( left_nc_dt > 0 ) pool_nc_dt += 1; + + // + // Compute pool block sizes + // + + // We add an extra micro-panel of space to the block sizes for A and B + // just to be sure any pre-loading performed by the micro-kernel does + // not cause a segmentation fault. 
+ max_packmnr_dt = bli_max( packmr_dt, packnr_dt ); + + *bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; + *bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; + *bs_c = ( pool_mc_dt ) * pool_nc_dt * size_dt; +} diff --git a/frame/base/bli_mem.c.prev b/frame/base/bli_mem.c.prev new file mode 100644 index 000000000..7a16e8732 --- /dev/null +++ b/frame/base/bli_mem.c.prev @@ -0,0 +1,366 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_PTHREADS +extern pthread_mutex_t mem_manager_mutex; +#endif + +// Declare one memory pool structure for each block size/shape we want to +// be able to allocate. + +static pool_t pools[3]; + + +// Physically contiguous memory for each pool. +// +// Generally speaking, the pool sizes are computed in a sub-header of blis.h +// as follows: +// +// BLIS_MK_POOL_SIZE = BLIS_MAXIMUM_MC_? * BLIS_MAXIMUM_KC_? * BLIS_SIZEOF_? +// +// where "?" is the datatype that results in the largest pool size. The +// constants BLIS_KN_POOL_SIZE and BLIS_MN_POOL_SIZE are computed in a +// similar manner. All constants are computed with appropriate "padding" +// to ensure enough space given the alignments required by bli_config.h. 
+// + +static void* pool_mk_blk_ptrs[ BLIS_NUM_MC_X_KC_BLOCKS ]; +static void* pool_kn_blk_ptrs[ BLIS_NUM_KC_X_NC_BLOCKS ]; +static void* pool_mn_blk_ptrs[ BLIS_NUM_MC_X_NC_BLOCKS ]; + +#define BLIS_USE_HEAP + +#ifdef BLIS_USE_HEAP +static char* pool_mk_mem = NULL; +static char* pool_kn_mem = NULL; +static char* pool_mn_mem = NULL; +#else +static char pool_mk_mem[ BLIS_MK_POOL_SIZE ]; +static char pool_kn_mem[ BLIS_KN_POOL_SIZE ]; +static char pool_mn_mem[ BLIS_MN_POOL_SIZE ]; +#endif + + + +void bli_mem_acquire_m( siz_t req_size, + packbuf_t buf_type, + mem_t* mem ) +{ + siz_t block_size; + dim_t pool_index; + pool_t* pool; + void** block_ptrs; + void* block; + gint_t i; + + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + // For general-use buffer requests, such as those used by level-2 + // operations, using bli_malloc() is sufficient, since using + // physically contiguous memory is not as important there. + block = bli_malloc( req_size ); + + // Initialize the mem_t object with: + // - the address of the memory block, + // - the buffer type (a packbuf_t value), and + // - the size of the requested region. + // NOTE: We do not initialize the pool field since this block did not + // come from a contiguous memory pool. + bli_mem_set_buffer( block, mem ); + bli_mem_set_buf_type( buf_type, mem ); + bli_mem_set_size( req_size, mem ); + } + else + { + // This branch handles cases where the memory block needs to come + // from one of the contiguous memory pools. + + // Map the requested packed buffer type to a zero-based index, which + // we then use to select the corresponding memory pool. + pool_index = bli_packbuf_index( buf_type ); + pool = &pools[ pool_index ]; + + // Unconditionally perform error checking on the memory pool. + { + err_t e_val; + + // Make sure that the requested matrix size fits inside of a block + // of the corresponding pool. 
+ e_val = bli_check_requested_block_size_for_pool( req_size, pool ); + bli_check_error_code( e_val ); + + // Make sure that the pool contains at least one block to check out + // to the thread. + e_val = bli_check_if_exhausted_pool( pool ); + bli_check_error_code( e_val ); + } + + // Access the block pointer array from the memory pool data structure. + block_ptrs = bli_pool_block_ptrs( pool ); + + + // BEGIN CRITICAL SECTION +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + { + + // Query the index of the contiguous memory block that resides at the + // "top" of the pool. + i = bli_pool_top_index( pool ); + + // Extract the address of the top block from the block pointer array. + block = block_ptrs[i]; + + // Clear the entry from the block pointer array. (This is actually not + // necessary.) + //block_ptrs[i] = NULL; + + // Decrement the top of the memory pool. + bli_pool_dec_top_index( pool ); + + + // END CRITICAL SECTION + } +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif + + // Query the size of the blocks in the pool so we can store it in the + // mem_t object. + block_size = bli_pool_block_size( pool ); + + // Initialize the mem_t object with: + // - the address of the memory block, + // - the buffer type (a packbuf_t value), + // - the address of the memory pool to which it belongs, and + // - the size of the contiguous memory block (NOT the size of the + // requested region). + bli_mem_set_buffer( block, mem ); + bli_mem_set_buf_type( buf_type, mem ); + bli_mem_set_pool( pool, mem ); + bli_mem_set_size( block_size, mem ); + } +} + + +void bli_mem_release( mem_t* mem ) +{ + packbuf_t buf_type; + pool_t* pool; + void** block_ptrs; + void* block; + gint_t i; + + // Extract the address of the memory block we are trying to + // release. 
+ block = bli_mem_buffer( mem ); + + // Extract the buffer type so we know what kind of memory was allocated. + buf_type = bli_mem_buf_type( mem ); + + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) + { + // For general-use buffers, we allocate with bli_malloc(), and so + // here we need to call bli_free(). + bli_free( block ); + } + else + { + // This branch handles cases where the memory block came from one + // of the contiguous memory pools. + + // Extract the pool from which the block was allocated. + pool = bli_mem_pool( mem ); + + // Extract the block pointer array associated with the pool. + block_ptrs = bli_pool_block_ptrs( pool ); + + + // BEGIN CRITICAL SECTION +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + { + + // Increment the top of the memory pool. + bli_pool_inc_top_index( pool ); + + // Query the newly incremented top index. + i = bli_pool_top_index( pool ); + + // Place the address of the block back onto the top of the memory pool. + block_ptrs[i] = block; + + + // END CRITICAL SECTION + } +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif + } + + + // Clear the mem_t object so that it appears unallocated. We clear: + // - the buffer field, + // - the pool field, and + // - the size field. + // NOTE: We do not clear the buf_type field since there is no + // "uninitialized" value for packbuf_t. 
+ bli_mem_set_buffer( NULL, mem ); + bli_mem_set_pool( NULL, mem ); + bli_mem_set_size( 0, mem ); +} + + +void bli_mem_acquire_v( siz_t req_size, + mem_t* mem ) +{ + bli_mem_acquire_m( req_size, + BLIS_BUFFER_FOR_GEN_USE, + mem ); +} + + + +void bli_mem_init() +{ + dim_t index_a; + dim_t index_b; + dim_t index_c; + +#ifdef BLIS_USE_HEAP + pool_mk_mem = bli_malloc( BLIS_MK_POOL_SIZE ); + pool_kn_mem = bli_malloc( BLIS_KN_POOL_SIZE ); + pool_mn_mem = bli_malloc( BLIS_MN_POOL_SIZE ); +#endif + + // Map each of the packbuf_t values to an index starting at zero. + index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); + index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); + index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); + + // Initialize contiguous memory pool for MC x KC blocks. + bli_mem_init_pool( pool_mk_mem, + BLIS_MK_BLOCK_SIZE, + BLIS_NUM_MC_X_KC_BLOCKS, + pool_mk_blk_ptrs, + &pools[ index_a ] ); + + // Initialize contiguous memory pool for KC x NC blocks. + bli_mem_init_pool( pool_kn_mem, + BLIS_KN_BLOCK_SIZE, + BLIS_NUM_KC_X_NC_BLOCKS, + pool_kn_blk_ptrs, + &pools[ index_b ] ); + + // Initialize contiguous memory pool for MC x NC blocks. + bli_mem_init_pool( pool_mn_mem, + BLIS_MN_BLOCK_SIZE, + BLIS_NUM_MC_X_NC_BLOCKS, + pool_mn_blk_ptrs, + &pools[ index_c ] ); +} + + +void bli_mem_init_pool( char* pool_mem, + siz_t block_size, + dim_t num_blocks, + void** block_ptrs, + pool_t* pool ) +{ + const siz_t align_size = BLIS_CONTIG_ADDR_ALIGN_SIZE; + dim_t i; + + // If the pool starting address is not already aligned, advance it + // accordingly. + if ( bli_is_unaligned_to( ( uintptr_t )pool_mem, ( uintptr_t )align_size ) ) + { + // Notice that this works even if the alignment is not a power of two. 
+ pool_mem += ( ( uintptr_t )align_size - + ( ( uintptr_t )pool_mem % align_size ) ); + } + + // Step through the memory pool, beginning with the aligned address + // determined above, assigning pointers to the beginning of each block_size + // bytes to the ith element of the block_ptrs array. + for ( i = 0; i < num_blocks; ++i ) + { + // Save the address of pool, which is guaranteed to be aligned. + block_ptrs[i] = pool_mem; + + // Advance pool by one block. + pool_mem += block_size; + + // Advance pool a bit further if needed in order to get to the + // beginning of an alignment boundary. + if ( bli_is_unaligned_to( ( uintptr_t )pool_mem, ( uintptr_t )align_size ) ) + { + pool_mem += ( ( uintptr_t )align_size - + ( ( uintptr_t )pool_mem % align_size ) ); + } + } + + // Now that we have initialized the array of pointers to the individual + // blocks in the pool, we initialize a pool_t data structure so that we + // can easily manage this pool. + bli_pool_init( num_blocks, + block_size, + block_ptrs, + pool ); +} + + + +void bli_mem_finalize() +{ + // Nothing to do. 
+ +#ifdef BLIS_USE_HEAP + bli_free( pool_mk_mem ); + bli_free( pool_kn_mem ); + bli_free( pool_mn_mem ); +#endif + +} + diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h index 8fc6d85d6..59572ad41 100644 --- a/frame/base/bli_mem.h +++ b/frame/base/bli_mem.h @@ -36,6 +36,7 @@ void bli_mem_init( void ); void bli_mem_finalize( void ); bool_t bli_mem_is_initialized( void ); +// ----------------------------------------------------------------------------- void bli_mem_acquire_m( siz_t req_size, packbuf_t buf_type, @@ -46,9 +47,16 @@ void bli_mem_acquire_v( siz_t req_size, void bli_mem_release( mem_t* mem ); -void bli_mem_init_pool( char* pool_mem, - siz_t block_size, - dim_t n_blocks, - void** block_ptrs, - pool_t* pool_struct ); +siz_t bli_mem_pool_size( packbuf_t buf_type ); + +// ----------------------------------------------------------------------------- + +void bli_mem_init_pools( void ); +void bli_mem_finalize_pools( void ); + +void bli_mem_compute_pool_block_sizes_dt( num_t dt, + ind_t method, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c ); diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c new file mode 100644 index 000000000..9c7230faa --- /dev/null +++ b/frame/base/bli_pool.c @@ -0,0 +1,344 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_pool_init( dim_t num_blocks_init, + siz_t block_size, + siz_t align_size, + pool_t* pool ) +{ + pblk_t* block_ptrs; + dim_t i; + + // Allocate the block_ptrs array. + block_ptrs = bli_malloc( num_blocks_init * sizeof( pblk_t ) ); + + // Allocate and initialize each entry in the block_ptrs array. + for ( i = 0; i < num_blocks_init; ++i ) + { + bli_pool_alloc_block( block_size, align_size, &(block_ptrs[i]) ); + } + + // Initialize the pool_t structure. + bli_pool_set_block_ptrs( block_ptrs, pool ); + bli_pool_set_block_ptrs_len( num_blocks_init, pool ); + bli_pool_set_num_blocks( num_blocks_init, pool ); + bli_pool_set_top_index( 0, pool ); + bli_pool_set_block_size( block_size, pool ); + bli_pool_set_align_size( align_size, pool ); +} + +void bli_pool_finalize( pool_t* pool ) +{ + pblk_t* block_ptrs; + dim_t num_blocks; + dim_t i; + + // NOTE: This implementation assumes that all blocks have been + // checked in by all threads. 
+ + // Query the current block_ptrs array and total number of blocks + // presently allocated. + block_ptrs = bli_pool_block_ptrs( pool ); + num_blocks = bli_pool_num_blocks( pool ); + + // Free the individual blocks. + for ( i = 0; i < num_blocks; ++i ) + { + bli_pool_free_block( &(block_ptrs[i]) ); + } + + // Free the block_ptrs array. + bli_free( block_ptrs ); +} + +void bli_pool_checkout_block( pblk_t* block, pool_t* pool ) +{ + pblk_t* block_ptrs; + dim_t top_index; + + // If the pool is exhausted, add a block. + if ( bli_pool_is_exhausted( pool ) ) + { + bli_pool_grow( 1, pool ); + } + + // At this point, at least one block is guaranteed to be available. + + // Query the current block_ptrs array. + block_ptrs = bli_pool_block_ptrs( pool ); + + // Query the top_index of the pool. + top_index = bli_pool_top_index( pool ); + + // Copy the block at top_index to the caller's pblk_t struct. + //bli_pblk_copy( *(block_ptrs[top_index]), *block ); + *block = block_ptrs[top_index]; + + // Notice that we don't actually need to clear the contents of + // block_ptrs[top_index]. It will get overwritten eventually when + // the block is checked back in. + bli_pblk_clear( &block_ptrs[top_index] ); + + // Increment the pool's top_index. + bli_pool_set_top_index( top_index + 1, pool ); +} + +void bli_pool_checkin_block( pblk_t* block, pool_t* pool ) +{ + pblk_t* block_ptrs; + dim_t top_index; + + // Query the current block_ptrs array. + block_ptrs = bli_pool_block_ptrs( pool ); + + // Query the top_index of the pool. + top_index = bli_pool_top_index( pool ); + + // Copy the caller's pblk_t struct to the block at top_index - 1. + //bli_pblk_copy( *(block_ptrs[top_index-1]), *block ); + block_ptrs[top_index-1] = *block; + + // Decrement the pool's top_index. 
+ bli_pool_set_top_index( top_index - 1, pool ); +} + +void bli_pool_grow( dim_t num_blocks_add, pool_t* pool ) +{ + pblk_t* block_ptrs_cur; + dim_t block_ptrs_len_cur; + dim_t num_blocks_cur; + + pblk_t* block_ptrs_new; + dim_t num_blocks_new; + + siz_t block_size; + siz_t align_size; + dim_t top_index; + + dim_t i; + + // If the requested increase is zero (or negative), return early. + if ( num_blocks_add < 1 ) return; + + // Query the allocated length of the block_ptrs array and also the + // total number of blocks allocated. + block_ptrs_len_cur = bli_pool_block_ptrs_len( pool ); + num_blocks_cur = bli_pool_num_blocks( pool ); + + // Compute the total number of allocated blocks that will exist + // after we grow the pool. + num_blocks_new = num_blocks_cur + num_blocks_add; + + // If the new total number of allocated blocks is larger than the + // allocated length of the block_ptrs array, we need to allocate + // a new (larger) block_ptrs array. + if ( num_blocks_new > block_ptrs_len_cur ) + { + // Query the current block_ptrs array. + block_ptrs_cur = bli_pool_block_ptrs( pool ); + + // Allocate a new block_ptrs array of length num_blocks_new. + block_ptrs_new = bli_malloc( num_blocks_new * sizeof( pblk_t ) ); + + // Query the top_index of the pool. + top_index = bli_pool_top_index( pool ); + + // Copy the contents of the old block_ptrs array to the new/resized + // array. Notice that we can begin with top_index since all entries + // from 0 to top_index-1 have been checked out to threads. + for ( i = top_index; i < num_blocks_cur; ++i ) + { +//printf( "bli_pool_grow: copying from %lu\n", top_index ); + block_ptrs_new[i] = block_ptrs_cur[i]; + } + +//printf( "bli_pool_grow: bp_cur: %p\n", block_ptrs_cur ); + // Free the old block_ptrs array. + bli_free( block_ptrs_cur ); + + // Update the pool_t struct with the new block_ptrs array and + // record its allocated length. 
+ bli_pool_set_block_ptrs( block_ptrs_new, pool ); + bli_pool_set_block_ptrs_len( num_blocks_new, pool ); + } + + // At this point, we are guaranteed to have enough unused elements + // in the block_ptrs array to accommodate an additional num_blocks_add + // blocks. + + // Query the current block_ptrs array (which was possibly just resized). + block_ptrs_cur = bli_pool_block_ptrs( pool ); + + // Query the block size and alignment size of the current pool. + block_size = bli_pool_block_size( pool ); + align_size = bli_pool_align_size( pool ); + + // Allocate the requested additional blocks in the resized array. + for ( i = num_blocks_cur; i < num_blocks_new; ++i ) + { +//printf( "libblis: growing pool, block_size = %lu\n", block_size ); fflush( stdout ); + + bli_pool_alloc_block( block_size, align_size, &(block_ptrs_cur[i]) ); + } + + // Update the pool_t struct with the new number of allocated blocks. + // Notice that top_index remains unchanged, as do the block_size and + // align_size fields. + bli_pool_set_num_blocks( num_blocks_new, pool ); +} + +void bli_pool_shrink( dim_t num_blocks_sub, pool_t* pool ) +{ + pblk_t* block_ptrs; + dim_t num_blocks; + dim_t num_blocks_avail; + dim_t num_blocks_new; + + dim_t top_index; + + dim_t i; + + // Query the total number of blocks presently allocated. + num_blocks = bli_pool_num_blocks( pool ); + + // Query the top_index of the pool. + top_index = bli_pool_top_index( pool ); + + // Compute the number of blocks available to be checked out + // (and thus available for removal). + num_blocks_avail = num_blocks - top_index; + + // If the requested decrease is more than the number of available + // blocks in the pool, only remove the number of blocks available. + if ( num_blocks_avail < num_blocks_sub ) + num_blocks_sub = num_blocks_avail; + + // If the effective requested decrease is zero (or the requested + // decrease was negative), return early. + if ( num_blocks_sub < 1 ) return; + + // Query the current block_ptrs array. 
+ block_ptrs = bli_pool_block_ptrs( pool ); + + // Compute the new total number of blocks. + num_blocks_new = num_blocks - num_blocks_sub; + + // Free the individual blocks. + for ( i = num_blocks_new; i < num_blocks; ++i ) + { + bli_pool_free_block( &(block_ptrs[i]) ); + } + + // Update the pool_t struct. + bli_pool_set_num_blocks( num_blocks_new, pool ); + + // Note that after shrinking the pool, num_blocks < block_ptrs_len. + // This means the pool can grow again by num_blocks_sub before + // a re-allocation of block_ptrs is triggered. +} + +void bli_pool_alloc_block( siz_t block_size, + siz_t align_size, + pblk_t* block ) +{ + void* buf_sys; + void* buf_align; + + // Allocate the block. We add the alignment size to ensure we will + // have enough usable space after alignment. + buf_sys = bli_malloc( block_size + align_size ); + buf_align = buf_sys; + + // Advance the pointer (via char*, since arithmetic on void* is not + // standard C) to achieve the necessary alignment, if unaligned. + if ( bli_is_unaligned_to( ( uintptr_t )buf_sys, ( uintptr_t )align_size ) ) + { + // Notice that this works even if the alignment is not a power of two. + buf_align = ( char* )buf_sys + ( ( uintptr_t )align_size - + ( ( uintptr_t )buf_sys % align_size ) ); + } + + // Save the results in the pblk_t structure. + bli_pblk_set_buf_sys( buf_sys, block ); + bli_pblk_set_buf_align( buf_align, block ); +} + +void bli_pool_free_block( pblk_t* block ) +{ + void* buf_sys; + + // Extract the pointer to the block that was originally provided by + // the operating system. + buf_sys = bli_pblk_buf_sys( block ); + + // Free the block.
+ bli_free( buf_sys ); +} + +void bli_pool_print( pool_t* pool ) +{ + pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); + dim_t block_ptrs_len = bli_pool_block_ptrs_len( pool ); + dim_t top_index = bli_pool_top_index( pool ); + dim_t num_blocks = bli_pool_num_blocks( pool ); + siz_t block_size = bli_pool_block_size( pool ); + siz_t align_size = bli_pool_align_size( pool ); + dim_t i; + // Cast explicitly: dim_t and siz_t widths vary by config; %p needs void*. + printf( "pool struct ---------------\n" ); + printf( " block_ptrs: %p\n", ( void* )block_ptrs ); + printf( " block_ptrs_len: %ld\n", ( long )block_ptrs_len ); + printf( " top_index: %ld\n", ( long )top_index ); + printf( " num_blocks: %ld\n", ( long )num_blocks ); + printf( " block_size: %lu\n", ( unsigned long )block_size ); + printf( " align_size: %lu\n", ( unsigned long )align_size ); + printf( " pblks sys align\n" ); + for ( i = 0; i < num_blocks; ++i ) + { + printf( " %ld: %p %p\n", ( long )i, bli_pblk_buf_sys( &block_ptrs[i] ), + bli_pblk_buf_align( &block_ptrs[i] ) ); + } +} + +void bli_pblk_print( pblk_t* pblk ) +{ + void* buf_sys = bli_pblk_buf_sys( pblk ); + void* buf_align = bli_pblk_buf_align( pblk ); + + printf( "pblk struct ---------------\n" ); + printf( " %p %p\n", buf_sys, buf_align ); +} + diff --git a/frame/include/bli_pool_macro_defs.h b/frame/base/bli_pool.h similarity index 54% rename from frame/include/bli_pool_macro_defs.h rename to frame/base/bli_pool.h index c958595ae..611c0feb0 100644 --- a/frame/include/bli_pool_macro_defs.h +++ b/frame/base/bli_pool.h @@ -32,8 +32,63 @@ */ -#ifndef BLIS_POOL_MACRO_DEFS_H -#define BLIS_POOL_MACRO_DEFS_H +#ifndef BLIS_POOL_H +#define BLIS_POOL_H + +// -- Pool block type -- + +/* +typedef struct +{ + void* buf_sys; + void* buf_align; +} pblk_t; +*/ + +// -- Pool type -- + +/* +typedef struct +{ + pblk_t* block_ptrs; + dim_t block_ptrs_len; + + dim_t top_index; + dim_t num_blocks; + + siz_t block_size; + siz_t align_size; +} pool_t; +*/ + + +// Pool block query + +#define bli_pblk_buf_sys( pblk_p ) \ +\ + ( (pblk_p)->buf_sys ) + +#define bli_pblk_buf_align( pblk_p ) \ +\ + ( (pblk_p)->buf_align )
+ +// Pool block modification + +#define bli_pblk_set_buf_sys( buf_sys0, pblk_p ) \ +{ \ + (pblk_p)->buf_sys = buf_sys0; \ +} + +#define bli_pblk_set_buf_align( buf_align0, pblk_p ) \ +{ \ + (pblk_p)->buf_align = buf_align0; \ +} + +#define bli_pblk_clear( pblk_p ) \ +{ \ + bli_pblk_set_buf_sys( NULL, pblk_p ); \ + bli_pblk_set_buf_align( NULL, pblk_p ); \ +} // Pool entry query @@ -42,6 +97,10 @@ \ ( (pool_p)->block_ptrs ) +#define bli_pool_block_ptrs_len( pool_p ) \ +\ + ( (pool_p)->block_ptrs_len ) + #define bli_pool_num_blocks( pool_p ) \ \ ( (pool_p)->num_blocks ) @@ -50,14 +109,18 @@ \ ( (pool_p)->block_size ) +#define bli_pool_align_size( pool_p ) \ +\ + ( (pool_p)->align_size ) + #define bli_pool_top_index( pool_p ) \ \ ( (pool_p)->top_index ) #define bli_pool_is_exhausted( pool_p ) \ \ - ( bli_pool_top_index( pool_p ) == -1 ) - + ( bli_pool_top_index( pool_p ) == \ + bli_pool_num_blocks( pool_p ) ) // Pool entry modification @@ -66,6 +129,11 @@ (pool_p)->block_ptrs = block_ptrs0; \ } +#define bli_pool_set_block_ptrs_len( block_ptrs_len0, pool_p ) \ +{ \ + (pool_p)->block_ptrs_len = block_ptrs_len0; \ +} + #define bli_pool_set_num_blocks( num_blocks0, pool_p ) \ { \ (pool_p)->num_blocks = num_blocks0; \ @@ -76,28 +144,37 @@ (pool_p)->block_size = block_size0; \ } +#define bli_pool_set_align_size( align_size0, pool_p ) \ +{ \ + (pool_p)->align_size = align_size0; \ +} + #define bli_pool_set_top_index( top_index0, pool_p ) \ { \ (pool_p)->top_index = top_index0; \ } -#define bli_pool_dec_top_index( pool_p ) \ -{ \ - ((pool_p)->top_index)--; \ -} +#endif -#define bli_pool_inc_top_index( pool_p ) \ -{ \ - ((pool_p)->top_index)++; \ -} +// ----------------------------------------------------------------------------- -#define bli_pool_init( num_blocks, block_size, block_ptrs, pool_p ) \ -{ \ - bli_pool_set_num_blocks( num_blocks, pool_p ); \ - bli_pool_set_block_size( block_size, pool_p ); \ - bli_pool_set_block_ptrs( block_ptrs, pool_p ); \ - 
bli_pool_set_top_index( num_blocks - 1, pool_p ); \ -} +void bli_pool_init( dim_t num_blocks_init, + siz_t block_size, + siz_t align_size, + pool_t* pool ); +void bli_pool_finalize( pool_t* pool ); +void bli_pool_checkout_block( pblk_t* block, pool_t* pool ); +void bli_pool_checkin_block( pblk_t* block, pool_t* pool ); + +void bli_pool_grow( dim_t num_blocks_add, pool_t* pool ); +void bli_pool_shrink( dim_t num_blocks_sub, pool_t* pool ); + +void bli_pool_alloc_block( siz_t block_size, + siz_t align_size, + pblk_t* block ); +void bli_pool_free_block( pblk_t* block ); + +void bli_pool_print( pool_t* pool ); +void bli_pblk_print( pblk_t* pblk ); -#endif diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h new file mode 100644 index 000000000..3f33c7e66 --- /dev/null +++ b/frame/include/bli_config_macro_defs.h @@ -0,0 +1,186 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_CONFIG_MACRO_DEFS_H +#define BLIS_CONFIG_MACRO_DEFS_H + + +// -- INTEGER PROPERTIES ------------------------------------------------------- + +// The bit size of the integer type used to track values such as dimensions, +// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed +// integers while 64 results in 64-bit integers. Any other value results in use +// of the C99 type "long int". Note that this ONLY affects integers used +// internally within BLIS as well as those exposed in the native BLAS-like BLIS +// interface. +#ifndef BLIS_INT_TYPE_SIZE +#define BLIS_INT_TYPE_SIZE 64 +#endif + + +// -- FLOATING-POINT PROPERTIES ------------------------------------------------ + +// Enable use of built-in C99 "float complex" and "double complex" types and +// associated overloaded operations and functions? Disabling results in +// scomplex and dcomplex being defined in terms of simple structs. +// NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. +#ifdef BLIS_ENABLE_C99_COMPLEX + // No additional definitions needed. +#else + // Default behavior is disabled. +#endif + + +// -- MULTITHREADING ----------------------------------------------------------- + +// Enable multithreading via POSIX threads. +#ifdef BLIS_ENABLE_PTHREADS + // No additional definitions needed. +#else + // Default behavior is disabled. +#endif + +// Enable multithreading via OpenMP.
+#ifdef BLIS_ENABLE_OPENMP + // No additional definitions needed. +#else + // Default behavior is disabled. +#endif + + +// -- MEMORY ALLOCATION -------------------------------------------------------- + +// Size of a virtual memory page. This is used to align certain memory +// buffers which are allocated and used internally. +#ifndef BLIS_PAGE_SIZE +#define BLIS_PAGE_SIZE 4096 +#endif + +// Alignment size (in bytes) needed by the instruction set for aligned +// SIMD/vector instructions. +#ifndef BLIS_SIMD_ALIGN_SIZE +#define BLIS_SIMD_ALIGN_SIZE 32 +#endif + +// Alignment size used to align local stack buffers within macro-kernel +// functions. +#ifndef BLIS_STACK_BUF_ALIGN_SIZE +#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE +#endif + +// Alignment size used when allocating memory dynamically from the operating +// system (eg: posix_memalign()). To disable heap alignment and just use +// malloc() instead, set this to 1. +#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE +#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE +#endif + +// Alignment size used when sizing leading dimensions of dynamically +// allocated memory. +#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE +#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE +#endif + +// Alignment size used when allocating blocks to the internal memory +// pool (for packing buffers). +#ifndef BLIS_POOL_ADDR_ALIGN_SIZE +#define BLIS_POOL_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE +#endif + + +// -- MIXED DATATYPE SUPPORT --------------------------------------------------- + +// Basic (homogeneous) datatype support always enabled. + +// AVOID ENABLING MIXED DATATYPE SUPPORT! IT IS PROBABLY BROKEN. + +// Enable mixed domain operations? +//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT + +// Enable extra mixed precision operations? +//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT + + +// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- + +// Stay initialized after auto-initialization, unless and until the user +// explicitly calls bli_finalize(). +#ifdef BLIS_DISABLE_STAY_AUTO_INITIALIZED + #undef BLIS_ENABLE_STAY_AUTO_INITIALIZED +#else + // Default behavior is enabled.
+ #undef BLIS_ENABLE_STAY_AUTO_INITIALIZED // In case user explicitly enabled. + #define BLIS_ENABLE_STAY_AUTO_INITIALIZED +#endif + + +// -- BLAS COMPATIBILITY LAYER ------------------------------------------------- + +// Enable the BLAS compatibility layer? +#ifdef BLIS_DISABLE_BLAS2BLIS + #undef BLIS_ENABLE_BLAS2BLIS +#else + // Default behavior is enabled. + #undef BLIS_ENABLE_BLAS2BLIS // In case user explicitly enabled. + #define BLIS_ENABLE_BLAS2BLIS +#endif + +// The bit size of the integer type used to track values such as dimensions and +// leading dimensions (ie: column strides) within the BLAS compatibility layer. +// A value of 32 results in the compatibility layer using 32-bit signed integers +// while 64 results in 64-bit integers. Any other value results in use of the +// C99 type "long int". Note that this ONLY affects integers used within the +// BLAS compatibility layer. +#ifndef BLIS_BLAS2BLIS_INT_TYPE_SIZE +#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64 +#endif + + +// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ + +// Enable the CBLAS compatibility layer? +// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer +// regardless of whether or not it was explicitly enabled above. Furthermore, +// the CBLAS compatibility layer will use the integer type size definition +// specified above when defining the size of its own integers (regardless of +// whether the BLAS layer was enabled directly or indirectly). +#ifdef BLIS_ENABLE_CBLAS + // No additional definitions needed. +#else + // Default behavior is disabled. 
+#endif + + +#endif + diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index e94b818c2..5622a098c 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -1245,41 +1245,6 @@ #endif -// -- Define micro-panel alignment --------------------------------------------- - -// In this section, we consider each datatype-specific alignment sizes for -// micro-panels of A and B. If any definition is undefined, we define it to -// a safe default value (the size of the datatype). - -// Alignment for micro-panels of A -#ifndef BLIS_UPANEL_A_ALIGN_SIZE_S -#define BLIS_UPANEL_A_ALIGN_SIZE_S BLIS_SIZEOF_S -#endif -#ifndef BLIS_UPANEL_A_ALIGN_SIZE_D -#define BLIS_UPANEL_A_ALIGN_SIZE_D BLIS_SIZEOF_D -#endif -#ifndef BLIS_UPANEL_A_ALIGN_SIZE_C -#define BLIS_UPANEL_A_ALIGN_SIZE_C BLIS_SIZEOF_C -#endif -#ifndef BLIS_UPANEL_A_ALIGN_SIZE_Z -#define BLIS_UPANEL_A_ALIGN_SIZE_Z BLIS_SIZEOF_Z -#endif - -// Alignment for micro-panels of B -#ifndef BLIS_UPANEL_B_ALIGN_SIZE_S -#define BLIS_UPANEL_B_ALIGN_SIZE_S BLIS_SIZEOF_S -#endif -#ifndef BLIS_UPANEL_B_ALIGN_SIZE_D -#define BLIS_UPANEL_B_ALIGN_SIZE_D BLIS_SIZEOF_D -#endif -#ifndef BLIS_UPANEL_B_ALIGN_SIZE_C -#define BLIS_UPANEL_B_ALIGN_SIZE_C BLIS_SIZEOF_C -#endif -#ifndef BLIS_UPANEL_B_ALIGN_SIZE_Z -#define BLIS_UPANEL_B_ALIGN_SIZE_Z BLIS_SIZEOF_Z -#endif - - // -- Kernel blocksize checks -------------------------------------------------- // Verify that cache blocksizes are whole multiples of register blocksizes. diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index 4611df4f9..907a07d83 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -92,6 +92,11 @@ #define MKSTR(s1) #s1 #define STRINGIFY_INT( s ) MKSTR( s ) +// Fortran-77 name-mangling macros. 
+#define PASTEF770(name) name ## _ +#define PASTEF77(ch1,name) ch1 ## name ## _ +#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ +#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ // -- Include other groups of macros @@ -100,7 +105,6 @@ #include "bli_gentprot_macro_defs.h" #include "bli_mem_macro_defs.h" -#include "bli_pool_macro_defs.h" #include "bli_obj_macro_defs.h" #include "bli_param_macro_defs.h" #include "bli_complex_macro_defs.h" diff --git a/frame/include/bli_mem_macro_defs.h b/frame/include/bli_mem_macro_defs.h index b603fdc63..51840b712 100644 --- a/frame/include/bli_mem_macro_defs.h +++ b/frame/include/bli_mem_macro_defs.h @@ -38,17 +38,25 @@ // Mem entry query +#define bli_mem_pblk( mem_p ) \ +\ + ( &((mem_p)->pblk) ) + #define bli_mem_buffer( mem_p ) \ \ - ( (mem_p)->buf ) + ( bli_pblk_buf_align( bli_mem_pblk( mem_p ) ) ) + +#define bli_mem_buf_sys( mem_p ) \ +\ + ( bli_pblk_buf_sys( bli_mem_pblk( mem_p ) ) ) #define bli_mem_buf_type( mem_p ) \ \ - ( (mem_p)->buf_type ) + ( (mem_p)->buf_type ) #define bli_mem_pool( mem_p ) \ \ - ( (mem_p)->pool ) + ( (mem_p)->pool ) #define bli_mem_size( mem_p ) \ \ @@ -65,24 +73,42 @@ // Mem entry modification +#define bli_mem_set_pblk( pblk_p, mem_p ) \ +{ \ + mem_p->pblk = *(pblk_p); \ +} + #define bli_mem_set_buffer( buf0, mem_p ) \ { \ - mem_p->buf = buf0; \ + bli_pblk_set_buf_align( buf0, &(mem_p->pblk) ); \ +} + +#define bli_mem_set_buf_sys( buf0, mem_p ) \ +{ \ + bli_pblk_set_buf_sys( buf0, &(mem_p->pblk) ); \ } #define bli_mem_set_buf_type( buf_type0, mem_p ) \ { \ - mem_p->buf_type = buf_type0; \ + mem_p->buf_type = buf_type0; \ } #define bli_mem_set_pool( pool0, mem_p ) \ { \ - mem_p->pool = pool0; \ + mem_p->pool = pool0; \ } #define bli_mem_set_size( size0, mem_p ) \ { \ - mem_p->size = size0; \ + mem_p->size = size0; \ +} + +#define bli_mem_clear( mem_p ) \ +{ \ + bli_mem_set_buffer( NULL, mem_p ); \ + bli_mem_set_buf_sys( NULL, mem_p ); \ + bli_mem_set_pool( NULL, mem_p ); \ + 
bli_mem_set_size( 0, mem_p ); \ } diff --git a/frame/include/bli_mem_pool_macro_defs.h b/frame/include/bli_mem_pool_macro_defs.h deleted file mode 100644 index 6b3a6bc81..000000000 --- a/frame/include/bli_mem_pool_macro_defs.h +++ /dev/null @@ -1,535 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#ifndef BLIS_POOL_BLOCKS_MACRO_DEFS_H -#define BLIS_POOL_BLOCKS_MACRO_DEFS_H - - -// -- Memory pool block sizing macros ------------------------------------------ - -// In this file, we compute the memory pool block sizes for A, B, and C for -// each floating-point datatype, and then search for and save the maximum. -// The reason we settle on the largest is to prevent a developer from -// implementing a micro-kernel for one datatype (say, single real) and then -// end up in a situation where the memory pool is not large enough because -// the cache blocksize value of the datatype used to size the pool (e.g. -// double) was not set accordingly. - -// First we compute possible scaling factors for each datatype. These -// scaling factors actually take the form of numerator and denominator -// since we want stay in integer arithmetic. The purpose of the scaling -// factors is to increase the amount of space we reserve for the memory -// pool blocks if one of the packed micro-panels has a "leading dimension" -// that is larger than the register blocksize. (In this case, the leading -// dimension of a micro-panel is the packing register blocksize.) - -// Note that when computing the scaling factor, we have to determine which -// of PACKDIM_MR/DEFAULT_MR and PACKDIM_NR/DEFAULT_NR is greater so that -// the pair of values can be used to scale MAXIMUM_MC and MAXIMUM_NC. This -// is needed ONLY because the amount of space allocated for a block of A -// and a panel of B needs to be such that MR and NR can be swapped (ie: A -// is packed with NR and B is packed with MR). This transformation is -// needed for right-side trsm when inducing an algorithm that (a) has -// favorable access patterns for column-stored C and (b) allows the -// macro-kernel to reuse the existing left-side fused gemmtrsm micro-kernels. -// We cross-multiply so that the comparison can stay in integer arithmetic. - - -// -// Find the larger register blocksize for each datatype. 
-// -#if BLIS_DEFAULT_MR_S > BLIS_DEFAULT_NR_S -#define BLIS_MAX_MNR_S BLIS_DEFAULT_MR_S -#else -#define BLIS_MAX_MNR_S BLIS_DEFAULT_NR_S -#endif -#if BLIS_DEFAULT_MR_D > BLIS_DEFAULT_NR_D -#define BLIS_MAX_MNR_D BLIS_DEFAULT_MR_D -#else -#define BLIS_MAX_MNR_D BLIS_DEFAULT_NR_D -#endif -#if BLIS_DEFAULT_MR_C > BLIS_DEFAULT_NR_C -#define BLIS_MAX_MNR_C BLIS_DEFAULT_MR_C -#else -#define BLIS_MAX_MNR_C BLIS_DEFAULT_NR_C -#endif -#if BLIS_DEFAULT_MR_Z > BLIS_DEFAULT_NR_Z -#define BLIS_MAX_MNR_Z BLIS_DEFAULT_MR_Z -#else -#define BLIS_MAX_MNR_Z BLIS_DEFAULT_NR_Z -#endif - -// -// Define local maximum cache blocksizes -// - -// NOTE: We define these values here just to more concisely capture the -// increasing of the kc dimension blocksizes by the maximum register -// blocksize, which we do to make room for the nudging up of kc at -// runtime to be a multiple of MR or NR for triangular operations trmm, -// trmm3, and trsm. Also, we divide the induced values by 2 since they are -// defined in terms of real elements, but used (later, when computing -// pool block sizes) in terms of complex elements. 
- -#define BLIS_MAXIMUM_ASM_MC_S (BLIS_MAXIMUM_MC_S) -#define BLIS_MAXIMUM_ASM_KC_S ((BLIS_MAXIMUM_KC_S + BLIS_MAX_MNR_S)/2) -#define BLIS_MAXIMUM_ASM_NC_S (BLIS_MAXIMUM_NC_S) - -#define BLIS_MAXIMUM_ASM_MC_D (BLIS_MAXIMUM_MC_D) -#define BLIS_MAXIMUM_ASM_KC_D ((BLIS_MAXIMUM_KC_D + BLIS_MAX_MNR_D)/2) -#define BLIS_MAXIMUM_ASM_NC_D (BLIS_MAXIMUM_NC_D) - -#define BLIS_MAXIMUM_ASM_MC_C (BLIS_MAXIMUM_MC_C) -#define BLIS_MAXIMUM_ASM_KC_C ((BLIS_MAXIMUM_KC_C + BLIS_MAX_MNR_C)/2) -#define BLIS_MAXIMUM_ASM_NC_C (BLIS_MAXIMUM_NC_C) - -#define BLIS_MAXIMUM_ASM_MC_Z (BLIS_MAXIMUM_MC_Z) -#define BLIS_MAXIMUM_ASM_KC_Z ((BLIS_MAXIMUM_KC_Z + BLIS_MAX_MNR_Z)/2) -#define BLIS_MAXIMUM_ASM_NC_Z (BLIS_MAXIMUM_NC_Z) - -#define BLIS_MAXIMUM_IND_MC_C (BLIS_MAXIMUM_MC_S) -#define BLIS_MAXIMUM_IND_KC_C ((BLIS_MAXIMUM_KC_S + BLIS_MAX_MNR_S)/2) -#define BLIS_MAXIMUM_IND_NC_C (BLIS_MAXIMUM_NC_S) - -#define BLIS_MAXIMUM_IND_MC_Z (BLIS_MAXIMUM_MC_D) -#define BLIS_MAXIMUM_IND_KC_Z ((BLIS_MAXIMUM_KC_D + BLIS_MAX_MNR_D)/2) -#define BLIS_MAXIMUM_IND_NC_Z (BLIS_MAXIMUM_NC_D) - - -// -// Compute scaling factors for single real. -// -#if ( BLIS_PACKDIM_MR_S * BLIS_DEFAULT_NR_S ) >= \ - ( BLIS_PACKDIM_NR_S * BLIS_DEFAULT_MR_S ) - #define BLIS_PACKDIM_MAXR_S BLIS_PACKDIM_MR_S - #define BLIS_DEFAULT_MAXR_S BLIS_DEFAULT_MR_S -#else - #define BLIS_PACKDIM_MAXR_S BLIS_PACKDIM_NR_S - #define BLIS_DEFAULT_MAXR_S BLIS_DEFAULT_NR_S -#endif - -// -// Compute scaling factors for double real. -// -#if ( BLIS_PACKDIM_MR_D * BLIS_DEFAULT_NR_D ) >= \ - ( BLIS_PACKDIM_NR_D * BLIS_DEFAULT_MR_D ) - #define BLIS_PACKDIM_MAXR_D BLIS_PACKDIM_MR_D - #define BLIS_DEFAULT_MAXR_D BLIS_DEFAULT_MR_D -#else - #define BLIS_PACKDIM_MAXR_D BLIS_PACKDIM_NR_D - #define BLIS_DEFAULT_MAXR_D BLIS_DEFAULT_NR_D -#endif - -// -// Compute scaling factors for single complex. 
-// -#if ( BLIS_PACKDIM_MR_C * BLIS_DEFAULT_NR_C ) >= \ - ( BLIS_PACKDIM_NR_C * BLIS_DEFAULT_MR_C ) - #define BLIS_PACKDIM_MAXR_C BLIS_PACKDIM_MR_C - #define BLIS_DEFAULT_MAXR_C BLIS_DEFAULT_MR_C -#else - #define BLIS_PACKDIM_MAXR_C BLIS_PACKDIM_NR_C - #define BLIS_DEFAULT_MAXR_C BLIS_DEFAULT_NR_C -#endif - -// -// Compute scaling factors for double complex. -// -#if ( BLIS_PACKDIM_MR_Z * BLIS_DEFAULT_NR_Z ) >= \ - ( BLIS_PACKDIM_NR_Z * BLIS_DEFAULT_MR_Z ) - #define BLIS_PACKDIM_MAXR_Z BLIS_PACKDIM_MR_Z - #define BLIS_DEFAULT_MAXR_Z BLIS_DEFAULT_MR_Z -#else - #define BLIS_PACKDIM_MAXR_Z BLIS_PACKDIM_NR_Z - #define BLIS_DEFAULT_MAXR_Z BLIS_DEFAULT_NR_Z -#endif - - -// Next, we define the dimensions of the pool blocks for each datatype. - -// -// Compute pool dimensions for single real -// -#define BLIS_POOL_ASM_MC_S ( ( BLIS_MAXIMUM_ASM_MC_S * BLIS_PACKDIM_MAXR_S ) \ - / BLIS_DEFAULT_MAXR_S ) -#define BLIS_POOL_ASM_NC_S ( ( BLIS_MAXIMUM_ASM_NC_S * BLIS_PACKDIM_MAXR_S ) \ - / BLIS_DEFAULT_MAXR_S ) -#define BLIS_POOL_ASM_KC_S ( ( BLIS_MAXIMUM_ASM_KC_S * BLIS_PACKDIM_KR_S ) \ - / BLIS_DEFAULT_KR_S ) - -// -// Compute pool dimensions for double real -// -#define BLIS_POOL_ASM_MC_D ( ( BLIS_MAXIMUM_ASM_MC_D * BLIS_PACKDIM_MAXR_D ) \ - / BLIS_DEFAULT_MAXR_D ) -#define BLIS_POOL_ASM_NC_D ( ( BLIS_MAXIMUM_ASM_NC_D * BLIS_PACKDIM_MAXR_D ) \ - / BLIS_DEFAULT_MAXR_D ) -#define BLIS_POOL_ASM_KC_D ( ( BLIS_MAXIMUM_ASM_KC_D * BLIS_PACKDIM_KR_D ) \ - / BLIS_DEFAULT_KR_D ) - -// -// Compute pool dimensions for single complex (native) -// -#define BLIS_POOL_ASM_MC_C ( ( BLIS_MAXIMUM_ASM_MC_C * BLIS_PACKDIM_MAXR_C ) \ - / BLIS_DEFAULT_MAXR_C ) -#define BLIS_POOL_ASM_NC_C ( ( BLIS_MAXIMUM_ASM_NC_C * BLIS_PACKDIM_MAXR_C ) \ - / BLIS_DEFAULT_MAXR_C ) -#define BLIS_POOL_ASM_KC_C ( ( BLIS_MAXIMUM_ASM_KC_C * BLIS_PACKDIM_KR_C ) \ - / BLIS_DEFAULT_KR_C ) - -// -// Compute pool dimensions for double complex (native) -// -#define BLIS_POOL_ASM_MC_Z ( ( BLIS_MAXIMUM_ASM_MC_Z * 
BLIS_PACKDIM_MAXR_Z ) \ - / BLIS_DEFAULT_MAXR_Z ) -#define BLIS_POOL_ASM_NC_Z ( ( BLIS_MAXIMUM_ASM_NC_Z * BLIS_PACKDIM_MAXR_Z ) \ - / BLIS_DEFAULT_MAXR_Z ) -#define BLIS_POOL_ASM_KC_Z ( ( BLIS_MAXIMUM_ASM_KC_Z * BLIS_PACKDIM_KR_Z ) \ - / BLIS_DEFAULT_KR_Z ) - -// -// Compute pool dimensions for single complex (induced) -// -#define BLIS_POOL_IND_MC_C ( ( BLIS_MAXIMUM_IND_MC_C * BLIS_PACKDIM_MAXR_S ) \ - / BLIS_DEFAULT_MAXR_S ) -#define BLIS_POOL_IND_NC_C ( ( BLIS_MAXIMUM_IND_NC_C * BLIS_PACKDIM_MAXR_S ) \ - / BLIS_DEFAULT_MAXR_S ) -#define BLIS_POOL_IND_KC_C ( ( BLIS_MAXIMUM_IND_KC_C * BLIS_PACKDIM_KR_S ) \ - / BLIS_DEFAULT_KR_S ) - -// -// Compute pool dimensions for double complex (induced) -// -#define BLIS_POOL_IND_MC_Z ( ( BLIS_MAXIMUM_IND_MC_Z * BLIS_PACKDIM_MAXR_D ) \ - / BLIS_DEFAULT_MAXR_D ) -#define BLIS_POOL_IND_NC_Z ( ( BLIS_MAXIMUM_IND_NC_Z * BLIS_PACKDIM_MAXR_D ) \ - / BLIS_DEFAULT_MAXR_D ) -#define BLIS_POOL_IND_KC_Z ( ( BLIS_MAXIMUM_IND_KC_Z * BLIS_PACKDIM_KR_D ) \ - / BLIS_DEFAULT_KR_D ) - - -// Now, we compute the size of each block/panel of A, B, and C for each -// datatype. - -// NOTE: We assume the worst case of unit register blocksizes, and -// therefore add a full micro-panel alignment value to KC. This can -// result in quite a bit of unused space, but it's better than the -// alternative of being bitten by the absolute black plague that -// would result from overflowing a block within the pool. - -// -// Compute memory pool block sizes for single real. 
-// - -#define BLIS_MK_BLOCK_SIZE_ASM_S ( BLIS_POOL_ASM_MC_S * \ - ( BLIS_POOL_ASM_KC_S + \ - ( BLIS_UPANEL_A_ALIGN_SIZE_S / \ - BLIS_SIZEOF_S ) \ - ) * \ - BLIS_SIZEOF_S \ - ) -#define BLIS_KN_BLOCK_SIZE_ASM_S ( \ - ( BLIS_POOL_ASM_KC_S + \ - ( BLIS_UPANEL_B_ALIGN_SIZE_S / \ - BLIS_SIZEOF_S ) \ - ) * \ - BLIS_POOL_ASM_NC_S * \ - BLIS_SIZEOF_S \ - ) -#define BLIS_MN_BLOCK_SIZE_ASM_S ( BLIS_POOL_ASM_MC_S * \ - BLIS_POOL_ASM_NC_S * \ - BLIS_SIZEOF_S \ - ) - -// -// Compute memory pool block sizes for double real. -// - -#define BLIS_MK_BLOCK_SIZE_ASM_D ( BLIS_POOL_ASM_MC_D * \ - ( BLIS_POOL_ASM_KC_D + \ - ( BLIS_UPANEL_A_ALIGN_SIZE_D / \ - BLIS_SIZEOF_D ) \ - ) * \ - BLIS_SIZEOF_D \ - ) -#define BLIS_KN_BLOCK_SIZE_ASM_D ( \ - ( BLIS_POOL_ASM_KC_D + \ - ( BLIS_UPANEL_B_ALIGN_SIZE_D / \ - BLIS_SIZEOF_D ) \ - ) * \ - BLIS_POOL_ASM_NC_D * \ - BLIS_SIZEOF_D \ - ) -#define BLIS_MN_BLOCK_SIZE_ASM_D ( BLIS_POOL_ASM_MC_D * \ - BLIS_POOL_ASM_NC_D * \ - BLIS_SIZEOF_D \ - ) - -// -// Compute memory pool block sizes for single complex. -// - -#define BLIS_MK_BLOCK_SIZE_ASM_C ( BLIS_POOL_ASM_MC_C * \ - ( BLIS_POOL_ASM_KC_C + \ - ( BLIS_UPANEL_A_ALIGN_SIZE_C / \ - BLIS_SIZEOF_C ) \ - ) * \ - BLIS_SIZEOF_C \ - ) -#define BLIS_KN_BLOCK_SIZE_ASM_C ( \ - ( BLIS_POOL_ASM_KC_C + \ - ( BLIS_UPANEL_B_ALIGN_SIZE_C / \ - BLIS_SIZEOF_C ) \ - ) * \ - BLIS_POOL_ASM_NC_C * \ - BLIS_SIZEOF_C \ - ) -#define BLIS_MN_BLOCK_SIZE_ASM_C ( BLIS_POOL_ASM_MC_C * \ - BLIS_POOL_ASM_NC_C * \ - BLIS_SIZEOF_C \ - ) - -// -// Compute memory pool block sizes for double complex. 
-// - -#define BLIS_MK_BLOCK_SIZE_ASM_Z ( BLIS_POOL_ASM_MC_Z * \ - ( BLIS_POOL_ASM_KC_Z + \ - ( BLIS_UPANEL_A_ALIGN_SIZE_Z / \ - BLIS_SIZEOF_Z ) \ - ) * \ - BLIS_SIZEOF_Z \ - ) -#define BLIS_KN_BLOCK_SIZE_ASM_Z ( \ - ( BLIS_POOL_ASM_KC_Z + \ - ( BLIS_UPANEL_B_ALIGN_SIZE_Z / \ - BLIS_SIZEOF_Z ) \ - ) * \ - BLIS_POOL_ASM_NC_Z * \ - BLIS_SIZEOF_Z \ - ) -#define BLIS_MN_BLOCK_SIZE_ASM_Z ( BLIS_POOL_ASM_MC_Z * \ - BLIS_POOL_ASM_NC_Z * \ - BLIS_SIZEOF_Z \ - ) - -// -// Compute memory pool block sizes for single complex (induced). -// - -// NOTE: We scale by 3/2 because 3m1 requires 50% more space than other -// algorithms. - -#define BLIS_MK_BLOCK_SIZE_IND_C ( BLIS_POOL_IND_MC_C * \ - ( BLIS_POOL_IND_KC_C + \ - ( BLIS_UPANEL_A_ALIGN_SIZE_C / \ - BLIS_SIZEOF_C ) \ - ) * \ - ( BLIS_SIZEOF_C * \ - 3 \ - ) / 2 \ - ) -#define BLIS_KN_BLOCK_SIZE_IND_C ( \ - ( BLIS_POOL_IND_KC_C + \ - ( BLIS_UPANEL_B_ALIGN_SIZE_C / \ - BLIS_SIZEOF_C ) \ - ) * \ - BLIS_POOL_IND_NC_C * \ - ( BLIS_SIZEOF_C * \ - 3 \ - ) / 2 \ - ) -#define BLIS_MN_BLOCK_SIZE_IND_C ( BLIS_POOL_IND_MC_C * \ - BLIS_POOL_IND_NC_C * \ - ( BLIS_SIZEOF_C * \ - 3 \ - ) / 2 \ - ) - -// -// Compute memory pool block sizes for double complex (induced). -// - -// NOTE: We scale by 3/2 because 3m1 requires 50% more space than other -// algorithms. - -#define BLIS_MK_BLOCK_SIZE_IND_Z ( BLIS_POOL_IND_MC_Z * \ - ( BLIS_POOL_IND_KC_Z + \ - ( BLIS_UPANEL_A_ALIGN_SIZE_Z / \ - BLIS_SIZEOF_Z ) \ - ) * \ - ( BLIS_SIZEOF_Z * \ - 3 \ - ) / 2 \ - ) -#define BLIS_KN_BLOCK_SIZE_IND_Z ( \ - ( BLIS_POOL_IND_KC_Z + \ - ( BLIS_UPANEL_B_ALIGN_SIZE_Z / \ - BLIS_SIZEOF_Z ) \ - ) * \ - BLIS_POOL_IND_NC_Z * \ - ( BLIS_SIZEOF_Z * \ - 3 \ - ) / 2 \ - ) -#define BLIS_MN_BLOCK_SIZE_IND_Z ( BLIS_POOL_IND_MC_Z * \ - BLIS_POOL_IND_NC_Z * \ - ( BLIS_SIZEOF_Z * \ - 3 \ - ) / 2 \ - ) - - -// -- Maximum block size search ------------------------------------------------ - -// In this section, we find the largest of each block size. 
- -// -// Find the largest block size for blocks of A. -// -#define BLIS_MK_BLOCK_SIZE BLIS_MK_BLOCK_SIZE_ASM_S -#if BLIS_MK_BLOCK_SIZE_ASM_D > BLIS_MK_BLOCK_SIZE -#undef BLIS_MK_BLOCK_SIZE -#define BLIS_MK_BLOCK_SIZE BLIS_MK_BLOCK_SIZE_ASM_D -#endif -#if BLIS_MK_BLOCK_SIZE_ASM_C > BLIS_MK_BLOCK_SIZE -#undef BLIS_MK_BLOCK_SIZE -#define BLIS_MK_BLOCK_SIZE BLIS_MK_BLOCK_SIZE_ASM_C -#endif -#if BLIS_MK_BLOCK_SIZE_ASM_Z > BLIS_MK_BLOCK_SIZE -#undef BLIS_MK_BLOCK_SIZE -#define BLIS_MK_BLOCK_SIZE BLIS_MK_BLOCK_SIZE_ASM_Z -#endif -#if BLIS_MK_BLOCK_SIZE_IND_C > BLIS_MK_BLOCK_SIZE -#undef BLIS_MK_BLOCK_SIZE -#define BLIS_MK_BLOCK_SIZE BLIS_MK_BLOCK_SIZE_IND_C -#endif -#if BLIS_MK_BLOCK_SIZE_IND_Z > BLIS_MK_BLOCK_SIZE -#undef BLIS_MK_BLOCK_SIZE -#define BLIS_MK_BLOCK_SIZE BLIS_MK_BLOCK_SIZE_IND_Z -#endif - -// -// Find the largest block size for panels of B. -// -#define BLIS_KN_BLOCK_SIZE BLIS_KN_BLOCK_SIZE_ASM_S -#if BLIS_KN_BLOCK_SIZE_ASM_D > BLIS_KN_BLOCK_SIZE -#undef BLIS_KN_BLOCK_SIZE -#define BLIS_KN_BLOCK_SIZE BLIS_KN_BLOCK_SIZE_ASM_D -#endif -#if BLIS_KN_BLOCK_SIZE_ASM_C > BLIS_KN_BLOCK_SIZE -#undef BLIS_KN_BLOCK_SIZE -#define BLIS_KN_BLOCK_SIZE BLIS_KN_BLOCK_SIZE_ASM_C -#endif -#if BLIS_KN_BLOCK_SIZE_ASM_Z > BLIS_KN_BLOCK_SIZE -#undef BLIS_KN_BLOCK_SIZE -#define BLIS_KN_BLOCK_SIZE BLIS_KN_BLOCK_SIZE_ASM_Z -#endif -#if BLIS_KN_BLOCK_SIZE_IND_C > BLIS_KN_BLOCK_SIZE -#undef BLIS_KN_BLOCK_SIZE -#define BLIS_KN_BLOCK_SIZE BLIS_KN_BLOCK_SIZE_IND_C -#endif -#if BLIS_KN_BLOCK_SIZE_IND_Z > BLIS_KN_BLOCK_SIZE -#undef BLIS_KN_BLOCK_SIZE -#define BLIS_KN_BLOCK_SIZE BLIS_KN_BLOCK_SIZE_IND_Z -#endif - -// -// Find the largest block size for panels of C. 
-// -#define BLIS_MN_BLOCK_SIZE BLIS_MN_BLOCK_SIZE_ASM_S -#if BLIS_MN_BLOCK_SIZE_ASM_D > BLIS_MN_BLOCK_SIZE -#undef BLIS_MN_BLOCK_SIZE -#define BLIS_MN_BLOCK_SIZE BLIS_MN_BLOCK_SIZE_ASM_D -#endif -#if BLIS_MN_BLOCK_SIZE_ASM_C > BLIS_MN_BLOCK_SIZE -#undef BLIS_MN_BLOCK_SIZE -#define BLIS_MN_BLOCK_SIZE BLIS_MN_BLOCK_SIZE_ASM_C -#endif -#if BLIS_MN_BLOCK_SIZE_ASM_Z > BLIS_MN_BLOCK_SIZE -#undef BLIS_MN_BLOCK_SIZE -#define BLIS_MN_BLOCK_SIZE BLIS_MN_BLOCK_SIZE_ASM_Z -#endif -#if BLIS_MN_BLOCK_SIZE_IND_C > BLIS_MN_BLOCK_SIZE -#undef BLIS_MN_BLOCK_SIZE -#define BLIS_MN_BLOCK_SIZE BLIS_MN_BLOCK_SIZE_IND_C -#endif -#if BLIS_MN_BLOCK_SIZE_IND_Z > BLIS_MN_BLOCK_SIZE -#undef BLIS_MN_BLOCK_SIZE -#define BLIS_MN_BLOCK_SIZE BLIS_MN_BLOCK_SIZE_IND_Z -#endif - - -// -- Compute pool sizes ------------------------------------------------------- - - -// Define each pool's total size using the block sizes determined above. -// These values are used in bli_mem.c to size the static memory pool -// arrays. - -// -// Pool for MC x KC blocks of A. -// -#define BLIS_MK_POOL_SIZE ( \ - BLIS_NUM_MC_X_KC_BLOCKS * \ - ( BLIS_MK_BLOCK_SIZE + \ - BLIS_CONTIG_ADDR_ALIGN_SIZE \ - ) + \ - BLIS_MAX_PRELOAD_BYTE_OFFSET \ - ) - -// -// Pool for KC x NC panels of B. -// -#define BLIS_KN_POOL_SIZE ( \ - BLIS_NUM_KC_X_NC_BLOCKS * \ - ( BLIS_KN_BLOCK_SIZE + \ - BLIS_CONTIG_ADDR_ALIGN_SIZE \ - ) + \ - BLIS_MAX_PRELOAD_BYTE_OFFSET \ - ) - -// -// Pool for MC x NC panels of C. 
-// -#define BLIS_MN_POOL_SIZE ( \ - BLIS_NUM_MC_X_NC_BLOCKS * \ - ( BLIS_MN_BLOCK_SIZE + \ - BLIS_CONTIG_ADDR_ALIGN_SIZE \ - ) + \ - BLIS_MAX_PRELOAD_BYTE_OFFSET \ - ) - - -#endif diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index e5d83bef3..a8f5bcb5b 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -91,6 +91,11 @@ typedef guint_t objbits_t; // object information bit field // -- Real types -- +// Define the number of floating-point types supported, and the size of the +// largest type. +#define BLIS_NUM_FP_TYPES 4 +#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) + // There are some places where we need to use sizeof() inside of a C // preprocessor #if conditional, and so here we define the various sizes // for those purposes. @@ -487,26 +492,41 @@ typedef enum // -- BLIS misc. structure types ----------------------------------------------- // -// -- Memory pool type -- +// -- Pool block type -- typedef struct { - void** block_ptrs; - gint_t top_index; - siz_t num_blocks; - siz_t block_size; + void* buf_sys; + void* buf_align; +} pblk_t; + +// -- Pool type -- + +typedef struct +{ + pblk_t* block_ptrs; + dim_t block_ptrs_len; + + dim_t top_index; + dim_t num_blocks; + + siz_t block_size; + siz_t align_size; } pool_t; // -- Memory object type -- typedef struct mem_s { - void* buf; + pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; +// -- Memory block type -- + + // -- Blocksize object type -- typedef struct blksz_s @@ -711,17 +731,17 @@ typedef enum typedef enum { - BLIS_MACH_EPS = 0, - BLIS_MACH_SFMIN, - BLIS_MACH_BASE, - BLIS_MACH_PREC, - BLIS_MACH_NDIGMANT, - BLIS_MACH_RND, - BLIS_MACH_EMIN, - BLIS_MACH_RMIN, - BLIS_MACH_EMAX, - BLIS_MACH_RMAX, - BLIS_MACH_EPS2 + BLIS_MACH_EPS = 0, + BLIS_MACH_SFMIN, + BLIS_MACH_BASE, + BLIS_MACH_PREC, + BLIS_MACH_NDIGMANT, + BLIS_MACH_RND, + BLIS_MACH_EMIN, + BLIS_MACH_RMIN, + BLIS_MACH_EMAX, + BLIS_MACH_RMAX, + BLIS_MACH_EPS2 } machval_t; #define 
BLIS_NUM_MACH_PARAMS 11 diff --git a/frame/include/blis.h b/frame/include/blis.h index 045cdc894..2b2790802 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -54,6 +54,7 @@ extern "C" { // KNOW WHAT YOU ARE DOING. #include "bli_config.h" +#include "bli_config_macro_defs.h" // -- System headers -- @@ -98,7 +99,7 @@ extern "C" { // -- BLIS memory pool definitions -- -#include "bli_mem_pool_macro_defs.h" +//#include "bli_mem_pool_macro_defs.h" // -- Base operation prototypes -- @@ -108,6 +109,8 @@ extern "C" { #include "bli_malloc.h" #include "bli_obj.h" #include "bli_obj_scalar.h" +#include "bli_ind.h" +#include "bli_pool.h" #include "bli_mem.h" #include "bli_part.h" #include "bli_query.h" @@ -123,7 +126,6 @@ extern "C" { #include "bli_opid.h" #include "bli_flops.h" #include "bli_cntl.h" -#include "bli_ind.h" #include "bli_info.h" diff --git a/frame/ind/query/bli_bsv_query.c b/frame/ind/query/bli_bsv_query.c index 5124a1993..73f4a7738 100644 --- a/frame/ind/query/bli_bsv_query.c +++ b/frame/ind/query/bli_bsv_query.c @@ -167,11 +167,11 @@ blksz_t* bli_bsv_get_avail_blksz( bszid_t bsv, opid_t oper, num_t dt ) blksz_t* bli_bsv_get_blksz( bszid_t bsv, ind_t method ) { - // Initialize BLIS, if it isn't already initialized. This is - // needed because we have to ensure that the blksz_t objects - // have been created, otherwise this function could return a - // NULL (or garbage) address. - bli_init(); + // Initialize the cntl API, if it isn't already initialized. This is + // needed because we have to ensure that the blksz_t objects have + // been created, otherwise this function could return a NULL (or + // garbage) address. 
+ bli_cntl_init(); return *(bli_bsizes[ method ][ bsv ]); } diff --git a/frame/ind/query/bli_ind_query.c b/frame/ind/query/bli_ind_query.c index 1e0f56397..d07abc7aa 100644 --- a/frame/ind/query/bli_ind_query.c +++ b/frame/ind/query/bli_ind_query.c @@ -34,8 +34,6 @@ #include "blis.h" -static bool_t bli_ind_is_init = FALSE; - static void* bli_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] = { /* gemm hemm herk her2k symm syrk, syr2k trmm3 trmm trsm */ @@ -213,6 +211,8 @@ char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ) // ----------------------------------------------------------------------------- +static bool_t bli_ind_is_init = FALSE; + void bli_ind_init( void ) { // If the API is already initialized, return early. diff --git a/frame/ind/query/bli_ukr_query.c b/frame/ind/query/bli_ukr_query.c index 464016300..61d07d6fc 100644 --- a/frame/ind/query/bli_ukr_query.c +++ b/frame/ind/query/bli_ukr_query.c @@ -200,10 +200,10 @@ func_t* bli_ukr_get_funcs( l3ukr_t ukr, ind_t method ) { func_t** p = bli_ukrs[ method ][ ukr ]; - // Initialize BLIS, if it isn't already initialized. This is - // needed because we have to ensure that the ukr func_t objects - // have been created (and thus contain valid function pointers). - bli_init(); + // Initialize the cntl API, if it isn't already initialized. This is + // needed because we have to ensure that the ukr func_t objects have + // been created (and thus contain valid function pointers). + bli_cntl_init(); // Avoid dereferencing NULL pointers. (A NULL pointer indicates that // there is no kernel for the requested kernel type and method.) @@ -215,10 +215,10 @@ func_t* bli_ukr_get_ref_funcs( l3ukr_t ukr ) { func_t** p = bli_ref_ukrs[ ukr ]; - // Initialize BLIS, if it isn't already initialized. This is - // needed because we have to ensure that the ukr func_t objects - // have been created (and thus contain valid function pointers). 
- bli_init(); + // Initialize the cntl API, if it isn't already initialized. This is + // needed because we have to ensure that the ukr func_t objects have + // been created (and thus contain valid function pointers). + bli_cntl_init(); // Avoid dereferencing NULL pointers. (A NULL pointer indicates that // there is no reference kernel for the requested kernel type.) diff --git a/testsuite/src/test_addv.c b/testsuite/src/test_addv.c index b8909944a..8f2cf880e 100644 --- a/testsuite/src/test_addv.c +++ b/testsuite/src/test_addv.c @@ -164,7 +164,7 @@ void libblis_test_addv_experiment( test_params_t* params, } // Estimate the performance of the best experiment repeat. - *perf = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF; + *perf = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF; if ( bli_obj_is_complex( x ) ) *perf *= 2.0; // Perform checks. diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index a493a3a85..e6ae32648 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -595,39 +595,27 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- BLIS library info -------------------------------------\n" ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "version string %s\n", bli_info_get_version_str() ); + libblis_test_fprintf_c( os, "version string %s\n", bli_info_get_version_str() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "--- BLIS config header ---\n" ); + libblis_test_fprintf_c( os, "--- BLIS configuration info ---\n" ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "integer type size (bits) %d\n", ( int )int_type_size ); - libblis_test_fprintf_c( os, "# of floating-point types %d\n", ( int )bli_info_get_num_fp_types() ); - libblis_test_fprintf_c( os, "maximum type size %d\n", ( int )bli_info_get_max_type_size() ); + libblis_test_fprintf_c( os, "integer type size 
(bits) %d\n", ( int )int_type_size ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "maximum number of threads %d\n", ( int )bli_info_get_max_num_threads() ); + libblis_test_fprintf_c( os, "SIMD alignment (bytes) %d\n", ( int )bli_info_get_simd_align_size() ); + libblis_test_fprintf_c( os, "Page size (bytes) %d\n", ( int )bli_info_get_page_size() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "SIMD alignment (bytes) %d\n", ( int )bli_info_get_simd_align_size() ); + libblis_test_fprintf_c( os, "memory alignment (bytes) \n" ); + libblis_test_fprintf_c( os, " stack address (def: simd) %d\n", ( int )bli_info_get_stack_buf_align_size() ); + libblis_test_fprintf_c( os, " obj_t address (def: simd) %d\n", ( int )bli_info_get_heap_addr_align_size() ); + libblis_test_fprintf_c( os, " obj_t stride (def: simd) %d\n", ( int )bli_info_get_heap_stride_align_size() ); + libblis_test_fprintf_c( os, " pool block addr (def: page) %d\n", ( int )bli_info_get_pool_addr_align_size() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "stack memory allocation \n" ); - libblis_test_fprintf_c( os, " address alignment (bytes) %d\n", ( int )bli_info_get_stack_buf_align_size() ); + libblis_test_fprintf_c( os, "BLAS compatibility layer \n" ); + libblis_test_fprintf_c( os, " enabled? 
%d\n", ( int )bli_info_get_enable_blas2blis() ); + libblis_test_fprintf_c( os, " integer type size (bits) %d\n", ( int )bli_info_get_blas2blis_int_type_size() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "dynamic memory allocation \n" ); - libblis_test_fprintf_c( os, " address alignment %d\n", ( int )bli_info_get_heap_addr_align_size() ); - libblis_test_fprintf_c( os, " stride alignment %d\n", ( int )bli_info_get_heap_stride_align_size() ); - libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "contiguous memory allocation \n" ); - libblis_test_fprintf_c( os, " # of mc x kc blocks %d\n", ( int )bli_info_get_num_mc_x_kc_blocks() ); - libblis_test_fprintf_c( os, " # of kc x nc blocks %d\n", ( int )bli_info_get_num_kc_x_nc_blocks() ); - libblis_test_fprintf_c( os, " # of mc x nc blocks %d\n", ( int )bli_info_get_num_mc_x_nc_blocks() ); - libblis_test_fprintf_c( os, " block address alignment %d\n", ( int )bli_info_get_contig_addr_align_size() ); - libblis_test_fprintf_c( os, " max preload byte offset %d\n", ( int )bli_info_get_max_preload_byte_offset() ); - libblis_test_fprintf_c( os, " actual pool sizes (bytes) \n" ); - libblis_test_fprintf_c( os, " for mc x kc blocks of A %d\n", ( int )bli_info_get_mk_pool_size() ); - libblis_test_fprintf_c( os, " for kc x nc panels of B %d\n", ( int )bli_info_get_kn_pool_size() ); - libblis_test_fprintf_c( os, " for mc x nc panels of C %d\n", ( int )bli_info_get_mn_pool_size() ); - libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "BLAS compatibility layer \n" ); - libblis_test_fprintf_c( os, " enabled? %d\n", ( int )bli_info_get_enable_blas2blis() ); - libblis_test_fprintf_c( os, " integer type size (bits) %d\n", ( int )bli_info_get_blas2blis_int_type_size() ); + libblis_test_fprintf_c( os, "CBLAS compatibility layer \n" ); + libblis_test_fprintf_c( os, " enabled? 
%d\n", ( int )bli_info_get_enable_cblas() ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "floating-point types s d c z \n" ); libblis_test_fprintf_c( os, " sizes (bytes) %7u %7u %7u %7u\n", sizeof(float), @@ -862,18 +850,6 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- BLIS misc. other info ---\n" ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "micro-panel alignment (bytes) s d c z \n" ); - libblis_test_fprintf_c( os, " A (left matrix) %7d %7d %7d %7d\n", - ( int )bli_info_get_upanel_a_align_size_s(), - ( int )bli_info_get_upanel_a_align_size_d(), - ( int )bli_info_get_upanel_a_align_size_c(), - ( int )bli_info_get_upanel_a_align_size_z() ); - libblis_test_fprintf_c( os, " B (right matrix) %7d %7d %7d %7d\n", - ( int )bli_info_get_upanel_b_align_size_s(), - ( int )bli_info_get_upanel_b_align_size_d(), - ( int )bli_info_get_upanel_b_align_size_c(), - ( int )bli_info_get_upanel_b_align_size_z() ); - libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "level-2 cache blocksizes s d c z \n" ); libblis_test_fprintf_c( os, " m dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_l2_mc_s(), diff --git a/testsuite/src/test_scal2m.c b/testsuite/src/test_scal2m.c index 2816c8a08..8cd1fdd2b 100644 --- a/testsuite/src/test_scal2m.c +++ b/testsuite/src/test_scal2m.c @@ -181,7 +181,7 @@ void libblis_test_scal2m_experiment( test_params_t* params, } // Estimate the performance of the best experiment repeat. - *perf = ( 2.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF; + *perf = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF; if ( bli_obj_is_complex( y ) ) *perf *= 4.0; // Perform checks. 
diff --git a/testsuite/src/test_scal2v.c b/testsuite/src/test_scal2v.c index 184b50067..803e5e2cf 100644 --- a/testsuite/src/test_scal2v.c +++ b/testsuite/src/test_scal2v.c @@ -179,7 +179,7 @@ void libblis_test_scal2v_experiment( test_params_t* params, } // Estimate the performance of the best experiment repeat. - *perf = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF; + *perf = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF; if ( bli_obj_is_complex( y ) ) *perf *= 4.0; // Perform checks. diff --git a/testsuite/src/test_subv.c b/testsuite/src/test_subv.c index 2b7f8c3ce..c6379bf1e 100644 --- a/testsuite/src/test_subv.c +++ b/testsuite/src/test_subv.c @@ -165,7 +165,7 @@ void libblis_test_subv_experiment( test_params_t* params, } // Estimate the performance of the best experiment repeat. - *perf = ( 2.0 * m ) / time_min / FLOPS_PER_UNIT_PERF; + *perf = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF; if ( bli_obj_is_complex( x ) ) *perf *= 2.0; // Perform checks.