From b6ef84fad1c9884c84b7f1350a0bcdfe1737e8f2 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sun, 21 Apr 2013 15:00:24 -0500 Subject: [PATCH] Allow ldim of packed micro-panels != MR, NR. Details: - Made substantial changes throughout the framework to decouple the leading dimension (row or column stride) used within each packed micro-panel from the corresponding register blocksize. It appears advantageous on some systems to use, for example, packed micro-panels of A where the column stride is greater than MR (whereas previously it was always equal to MR). - Changes include: - Added BLIS_EXTEND_[MNK]R_? macros, which specify how much extra padding to use when packing micro-panels of A and B. - Adjusted all packing routines and macro-kernels to use PACKMR and PACKNR where appropriate, instead of MR and NR. - Added pd field (panel dimension) to obj_t. - New interface to bli_packm_cntl_obj_create(). - Renamed bli_obj_packed_length()/_width() macros to bli_obj_padded_length()/_width(). - Removed local #defines for cache/register blocksizes in level-3 *_cntl.c. - Print out new cache and register blocksize extensions in test suite. - Also added new BLIS_EXTEND_[MNK]C_? macros for future use in using a larger blocksize for edge cases, which can improve performance at the margins. 
--- config/clarksville/bli_kernel.h | 51 ++- config/flame/bli_config.h | 137 +++++++ config/flame/bli_kernel.h | 335 ++++++++++++++++++ config/flame/kernels | 1 + config/flame/make_defs.mk | 104 ++++++ config/reference/bli_kernel.h | 49 ++- frame/1/packv/bli_packv_init.c | 4 +- frame/1m/packm/bli_packm_blk_var2.c | 24 +- frame/1m/packm/bli_packm_blk_var2.h | 3 +- frame/1m/packm/bli_packm_blk_var3.c | 36 +- frame/1m/packm/bli_packm_blk_var3.h | 3 +- frame/1m/packm/bli_packm_cntl.c | 103 ++---- frame/1m/packm/bli_packm_cntl.h | 24 +- frame/1m/packm/bli_packm_init.c | 61 ++-- frame/1m/packm/bli_packm_init.h | 6 +- frame/1m/packm/bli_packm_part.c | 12 +- frame/1m/packm/bli_packm_unb_var1.c | 4 +- frame/1m/packm/old/bli_packm_blk_var1.c | 4 +- frame/3/gemm/bli_gemm_cntl.c | 127 +++---- frame/3/gemm/bli_gemm_ker_var2.c | 7 +- frame/3/gemm/ukernels/bli_gemm_ref_mxn.c | 16 +- frame/3/hemm/bli_hemm_cntl.c | 127 +++---- frame/3/her2k/bli_her2k_cntl.c | 127 +++---- frame/3/herk/bli_herk_cntl.c | 127 +++---- frame/3/herk/bli_herk_l_ker_var2.c | 2 +- frame/3/herk/bli_herk_u_ker_var2.c | 2 +- frame/3/trmm/bli_trmm_cntl.c | 131 +++---- frame/3/trmm/bli_trmm_l_ker_var2.c | 7 +- frame/3/trmm/bli_trmm_u_ker_var2.c | 7 +- frame/3/trmm3/bli_trmm3_cntl.c | 131 +++---- frame/3/trsm/bli_trsm_cntl.c | 131 +++---- frame/3/trsm/bli_trsm_l_ker_var2.c | 12 +- frame/3/trsm/bli_trsm_u_ker_var2.c | 12 +- .../3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c | 4 +- .../3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c | 4 +- frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c | 11 +- frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c | 11 +- frame/base/bli_blocksize.c | 139 ++++++++ frame/base/bli_mem.c | 6 +- frame/base/bli_obj.c | 4 +- frame/include/bli_kernel_macro_defs.h | 226 ++++++++++-- frame/include/bli_obj_macro_defs.h | 36 +- frame/include/bli_type_defs.h | 11 +- test/test_blis2.c | 8 +- test/test_gemm.c | 12 +- test/test_hemm.c | 8 +- test/test_her2k.c | 8 +- test/test_herk.c | 8 +- test/test_trmm.c | 8 +- 
test/test_trsm.c | 8 +- testsuite/src/test_libblis.c | 29 ++ 51 files changed, 1669 insertions(+), 799 deletions(-) create mode 100644 config/flame/bli_config.h create mode 100644 config/flame/bli_kernel.h create mode 120000 config/flame/kernels create mode 100644 config/flame/make_defs.mk diff --git a/config/clarksville/bli_kernel.h b/config/clarksville/bli_kernel.h index 6f0e8b97f..e6aa0feb0 100644 --- a/config/clarksville/bli_kernel.h +++ b/config/clarksville/bli_kernel.h @@ -77,7 +77,29 @@ #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 8192 -// -- Default register blocksizes for inner kernel -- +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. + +// NOTE: These values are not yet used. +#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) + +// -- Default register blocksizes for micro-kernel -- // NOTE: When using the reference configuration, these register blocksizes // in the m and n dimensions should all be equal to the size expected by @@ -104,6 +126,31 @@ #define BLIS_DEFAULT_KR_C 1 #define BLIS_DEFAULT_KR_Z 1 +// -- Register blocksize extensions (for packed micro-panels) -- + +// NOTE: These register blocksize "extensions" determine whether the +// leading dimensions used within the packed micro-panels are equal to +// or greater than their 
corresponding register blocksizes above. + +#define BLIS_EXTEND_MR_S 0 +#define BLIS_EXTEND_NR_S 0 + +#define BLIS_EXTEND_MR_D 0 +#define BLIS_EXTEND_NR_D 0 + +#define BLIS_EXTEND_MR_C 0 +#define BLIS_EXTEND_NR_C 0 + +#define BLIS_EXTEND_MR_Z 0 +#define BLIS_EXTEND_NR_Z 0 + +// Register blocksize extensions in the k dimension are not used. + +#define BLIS_EXTEND_KR_S 0 +#define BLIS_EXTEND_KR_D 0 +#define BLIS_EXTEND_KR_C 0 +#define BLIS_EXTEND_KR_Z 0 + // -- Number of elements per vector register -- // NOTE: These constants are typically only used to determine the amount @@ -162,7 +209,7 @@ #define BLIS_DEFAULT_L2_NC_C 1000 #define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 +#define BLIS_DEFAULT_L2_NC_Z 1000 diff --git a/config/flame/bli_config.h b/config/flame/bli_config.h new file mode 100644 index 000000000..65065bfa2 --- /dev/null +++ b/config/flame/bli_config.h @@ -0,0 +1,137 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_CONFIG_H +#define BLIS_CONFIG_H + + +// -- OPERATING SYSTEM --------------------------------------------------------- + + + +// -- FLOATING-POINT PROPERTIES ------------------------------------------------ + +#define BLIS_NUM_FP_TYPES 4 +#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) + + + +// -- MULTITHREADING ----------------------------------------------------------- + +// The maximum number of BLIS threads that will run concurrently. +#define BLIS_MAX_NUM_THREADS 24 + + + +// -- MEMORY ALLOCATION -------------------------------------------------------- + +// -- Contiguous (static) memory allocator -- + +// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the +// contiguous memory pools. +#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS +#define BLIS_NUM_KC_X_NC_BLOCKS 1 +#define BLIS_NUM_MC_X_NC_BLOCKS 0 + +// The maximum preload byte offset is used to pad the end of the contiguous +// memory pools so that the micro-kernel, when computing with the end of the +// last block, can exceed the bounds of the usable portion of the memory +// region without causing a segmentation fault. 
+#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 + +// -- Memory alignment -- + +// It is sometimes useful to define the various memory alignments in terms +// of some other characteristics of the system, such as the cache line size +// and the page size. +#define BLIS_CACHE_LINE_SIZE 64 +#define BLIS_PAGE_SIZE 4096 + +// Alignment size used to align local stack buffers within macro-kernel +// functions. +#define BLIS_STACK_BUF_ALIGN_SIZE 16 + +// Alignment size used when allocating memory dynamically from the operating +// system (eg: posix_memalign()). To disable heap alignment and just use +// malloc() instead, set this to 1. +#define BLIS_HEAP_ADDR_ALIGN_SIZE 16 + +// Alignment size used when sizing leading dimensions of dynamically +// allocated memory. +#define BLIS_HEAP_STRIDE_ALIGN_SIZE 16 + +// Alignment size used when allocating entire blocks of contiguous memory +// from the contiguous memory allocator. +#define BLIS_CONTIG_ADDR_ALIGN_SIZE 16 + +// Alignment size used when sizing strides (eg: of packed micro-panels) +// within a block of contiguous memory. +#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 16 + + + +// -- MIXED DATATYPE SUPPORT --------------------------------------------------- + +// Basic (homogeneous) datatype support always enabled. + +// Enable mixed domain operations? +//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT + +// Enable extra mixed precision operations? +//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT + + + +// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- + +// Stay initialized after auto-initialization, unless and until the user +// explicitly calls bli_finalize(). +#define BLIS_ENABLE_STAY_AUTO_INITIALIZED + + + +// -- BLAS-to-BLIS COMPATIBILITY LAYER ----------------------------------------- + +// Enable the BLAS compatibility layer? +#define BLIS_ENABLE_BLAS2BLIS + +// Fortran-77 name-mangling macros. 
+#define PASTEF77(ch1,name) ch1 ## name ## _ +#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ + + + + +#endif + diff --git a/config/flame/bli_kernel.h b/config/flame/bli_kernel.h new file mode 100644 index 000000000..1c606aed1 --- /dev/null +++ b/config/flame/bli_kernel.h @@ -0,0 +1,335 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_H +#define BLIS_KERNEL_H + + +// -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- + +// -- Default cache blocksizes -- + +// +// Constraints: +// +// (1) MC must be a multiple of: +// (a) MR (for zero-padding purposes) +// (2) NC must be a multiple of +// (a) NR (for zero-padding purposes) +// (3) KC must be a multiple of +// (a) MR and +// (b) NR +// for triangular operations such as trmm and trsm. +// +// NOTE: For BLIS libraries built on block-panel macro-kernels, constraint (3b) +// is relaxed. In this case, (3a) is needed for operations where matrix A is +// triangular (trmm, trsm), because we want the diagonal offset of any packed +// panel of matrix A to be a multiple of MR. If, instead, the library were to +// be built on panel-block macro-kernels, the matrix with structure would be +// on the right, rather than the left, and thus it would be constraint (3b) +// that would be needed instead of (3a). +// + +#define BLIS_DEFAULT_MC_S 256 +#define BLIS_DEFAULT_KC_S 256 +#define BLIS_DEFAULT_NC_S 8192 + +#define BLIS_DEFAULT_MC_D 128 +#define BLIS_DEFAULT_KC_D 256 +#define BLIS_DEFAULT_NC_D 4096 + +#define BLIS_DEFAULT_MC_C 128 +#define BLIS_DEFAULT_KC_C 256 +#define BLIS_DEFAULT_NC_C 4096 + +#define BLIS_DEFAULT_MC_Z 64 +#define BLIS_DEFAULT_KC_Z 256 +#define BLIS_DEFAULT_NC_Z 2048 + +//#define BLIS_EDGECASE_HACK 1 + +// -- Default register blocksizes for inner kernel -- + +// NOTE: When using the reference configuration, these register blocksizes +// in the m and n dimensions should all be equal to the size expected by +// the reference micro-kernel(s). 
+ +#define BLIS_DEFAULT_MR_S 8 +#define BLIS_DEFAULT_NR_S 4 + +#define BLIS_DEFAULT_MR_D 4 +#define BLIS_DEFAULT_NR_D 2 + +#define BLIS_DEFAULT_MR_C 8 +#define BLIS_DEFAULT_NR_C 4 + +#define BLIS_DEFAULT_MR_Z 8 +#define BLIS_DEFAULT_NR_Z 4 + +// NOTE: If the micro-kernel, which is typically unrolled to a factor +// of f, handles leftover edge cases (ie: when k % f > 0) then these +// register blocksizes in the k dimension can be defined to 1. + +#define BLIS_DEFAULT_KR_S 1 +#define BLIS_DEFAULT_KR_D 1 +#define BLIS_DEFAULT_KR_C 1 +#define BLIS_DEFAULT_KR_Z 1 + +// -- Number of elements per vector register -- + +// NOTE: These constants are typically only used to determine the amount +// of duplication needed when configuring level-3 macro-kernels that +// copy and duplicate elements of B to a temporary duplication buffer +// (so that element-wise vector multiplication and addition instructions +// can be used). + +#define BLIS_NUM_ELEM_PER_REG_S 4 +#define BLIS_NUM_ELEM_PER_REG_D 2 +#define BLIS_NUM_ELEM_PER_REG_C 2 +#define BLIS_NUM_ELEM_PER_REG_Z 1 + +// -- Default switch for duplication of B -- + +// NOTE: Setting these values to 1 disables duplication. Any value +// d > 1 results in a d-1 duplicates created within special macro-kernel +// buffer of dimension k x NR*d. + +//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S +//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D +//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C +//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z +#define BLIS_DEFAULT_NUM_DUPL_S 1 +#define BLIS_DEFAULT_NUM_DUPL_D 2 +#define BLIS_DEFAULT_NUM_DUPL_C 1 +#define BLIS_DEFAULT_NUM_DUPL_Z 1 + +// -- Default incremental packing blocksizes (n dimension) -- + +// NOTE: These incremental packing blocksizes (for the n dimension) are only +// used by certain blocked variants. But when they *are* used, they MUST +// be an integer multiple of NR! 
+ +#define BLIS_DEFAULT_NI_FAC 16 +#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S) +#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D) +#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C) +#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z) + + + +// -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- + +// NOTE: These values determine high-level cache blocking for level-2 +// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and +// MC = NC = 1000, then a total of four unblocked (or unblocked fused) +// gemv subproblems are called. The blocked algorithms are only useful in +// that they provide the opportunity for packing vectors. (Matrices can also +// be packed here, but this tends to be much too expensive in practice to +// actually employ.) + +#define BLIS_DEFAULT_L2_MC_S 1000 +#define BLIS_DEFAULT_L2_NC_S 1000 + +#define BLIS_DEFAULT_L2_MC_D 1000 +#define BLIS_DEFAULT_L2_NC_D 1000 + +#define BLIS_DEFAULT_L2_MC_C 1000 +#define BLIS_DEFAULT_L2_NC_C 1000 + +#define BLIS_DEFAULT_L2_MC_Z 1000 +#define BLIS_DEFAULT_L2_NC_Z 1000 + + + +// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ + +// -- Default fusing factors for level-1f operations -- + +// NOTE: Default fusing factors are not used by the reference implementations +// of level-1f operations. They are here only for use when these operations +// are optimized. + +#define BLIS_DEFAULT_FUSING_FACTOR_S 8 +#define BLIS_DEFAULT_FUSING_FACTOR_D 4 +#define BLIS_DEFAULT_FUSING_FACTOR_C 4 +#define BLIS_DEFAULT_FUSING_FACTOR_Z 2 + + + +// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ + +// -- Default register blocksizes for vectors -- + +// NOTE: Register blocksizes for vectors are used when packing +// non-contiguous vectors. Similar to that of KR, they can +// typically be set to 1. 
+ +#define BLIS_DEFAULT_VR_S 1 +#define BLIS_DEFAULT_VR_D 1 +#define BLIS_DEFAULT_VR_C 1 +#define BLIS_DEFAULT_VR_Z 1 + + + +// -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- + +#include "bli_gemm_opt_d4x2.h" + +// -- dupl -- + +#define DUPL_KERNEL dupl_unb_var1 + +// -- gemm -- + +//#define GEMM_UKERNEL gemm_ref_4x4 +#define GEMM_UKERNEL gemm_opt_d4x2 + +// -- trsm-related -- + +//#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_4x4 +//#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_4x4 +#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn +#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn + +//#define TRSM_L_UKERNEL trsm_l_ref_4x4 +//#define TRSM_U_UKERNEL trsm_u_ref_4x4 +#define TRSM_L_UKERNEL trsm_l_ref_mxn +#define TRSM_U_UKERNEL trsm_u_ref_mxn + + + +// -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- + +// -- packm -- + +#define PACKM_2XK_KERNEL packm_ref_2xk +#define PACKM_4XK_KERNEL packm_ref_4xk +#define PACKM_6XK_KERNEL packm_ref_6xk +#define PACKM_8XK_KERNEL packm_ref_8xk +#define PACKM_10XK_KERNEL packm_ref_10xk +#define PACKM_12XK_KERNEL packm_ref_12xk +#define PACKM_14XK_KERNEL packm_ref_14xk +#define PACKM_16XK_KERNEL packm_ref_16xk + +// -- unpackm -- + +#define UNPACKM_2XK_KERNEL unpackm_ref_2xk +#define UNPACKM_4XK_KERNEL unpackm_ref_4xk +#define UNPACKM_6XK_KERNEL unpackm_ref_6xk +#define UNPACKM_8XK_KERNEL unpackm_ref_8xk +#define UNPACKM_10XK_KERNEL unpackm_ref_10xk +#define UNPACKM_12XK_KERNEL unpackm_ref_12xk +#define UNPACKM_14XK_KERNEL unpackm_ref_14xk +#define UNPACKM_16XK_KERNEL unpackm_ref_16xk + + + +// -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- + +// -- axpy2v -- + +#define AXPY2V_KERNEL axpy2v_unb_var1 + +// -- dotaxpyv -- + +#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 + +// -- axpyf -- + +#define AXPYF_KERNEL axpyf_unb_var1 + +// -- dotxf -- + +#define DOTXF_KERNEL dotxf_unb_var1 + +// -- dotxaxpyf -- + +#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 + + + +// -- 
LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- + +// -- addv -- + +#define ADDV_KERNEL addv_unb_var1 + +// -- axpyv -- + +#define AXPYV_KERNEL axpyv_unb_var1 + +// -- copynzv -- + +#define COPYNZV_KERNEL copynzv_unb_var1 + +// -- copyv -- + +#define COPYV_KERNEL copyv_unb_var1 + +// -- dotv -- + +#define DOTV_KERNEL dotv_unb_var1 + +// -- dotxv -- + +#define DOTXV_KERNEL dotxv_unb_var1 + +// -- invertv -- + +#define INVERTV_KERNEL invertv_unb_var1 + +// -- scal2v -- + +#define SCAL2V_KERNEL scal2v_unb_var1 + +// -- scalv -- + +#define SCALV_KERNEL scalv_unb_var1 + +// -- setv -- + +#define SETV_KERNEL setv_unb_var1 + +// -- subv -- + +#define SUBV_KERNEL subv_unb_var1 + +// -- swapv -- + +#define SWAPV_KERNEL swapv_unb_var1 + + + +#endif + diff --git a/config/flame/kernels b/config/flame/kernels new file mode 120000 index 000000000..39a653f79 --- /dev/null +++ b/config/flame/kernels @@ -0,0 +1 @@ +../../kernels/x86/3/ \ No newline at end of file diff --git a/config/flame/make_defs.mk b/config/flame/make_defs.mk new file mode 100644 index 000000000..9dca5347f --- /dev/null +++ b/config/flame/make_defs.mk @@ -0,0 +1,104 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2013, The University of Texas +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# - Neither the name of The University of Texas nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# Only include this block of code once. +ifndef MAKE_DEFS_MK_INCLUDED +MAKE_DEFS_MK_INCLUDED := yes + + + +# +# --- Build definitions -------------------------------------------------------- +# + +# Variables corresponding to other configure-time options. +BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no +BLIS_ENABLE_STATIC_BUILD := yes +BLIS_ENABLE_DYNAMIC_BUILD := no + + + +# +# --- Utility program definitions ---------------------------------------------- +# + +SH := /bin/sh +MV := mv +MKDIR := mkdir -p +RM_F := rm -f +RM_RF := rm -rf +SYMLINK := ln -sf +FIND := find +XARGS := xargs +RANLIB := ranlib +INSTALL := install -c + +# Used to refresh CHANGELOG. +GIT := git +GIT_LOG := $(GIT) log --decorate + + + +# +# --- Development tools definitions -------------------------------------------- +# + +# --- Determine the C compiler and related flags --- +CC := gcc +# Enable IEEE Standard 1003.1-2004 (POSIX.1d). +# NOTE: This is needed to enable posix_memalign(). 
+CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L +CMISCFLAGS := -std=c99 # -fopenmp -pg +CDBGFLAGS := -g +CWARNFLAGS := -Wall +COPTFLAGS := -O2 -malign-double -funroll-loops +CVECFLAGS := -msse3 -march=native # -mfpmath=sse + +# Aggregate all of the flags into two groups: one for optimizable code, and +# one for code that should not be optimized. +CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) +CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) + +# --- Determine the archiver and related flags --- +AR := ar +ARFLAGS := cru + +# --- Determine the linker and related flags --- +LINKER := $(CC) +LDFLAGS := + + + +# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block +endif diff --git a/config/reference/bli_kernel.h b/config/reference/bli_kernel.h index 0a148289c..8bc00f496 100644 --- a/config/reference/bli_kernel.h +++ b/config/reference/bli_kernel.h @@ -77,7 +77,29 @@ #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 2048 -// -- Default register blocksizes for inner kernel -- +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. + +// NOTE: These values are not yet used. 
+#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) + +// -- Default register blocksizes for micro-kernel -- // NOTE: When using the reference configuration, these register blocksizes // in the m and n dimensions should all be equal to the size expected by @@ -104,6 +126,31 @@ #define BLIS_DEFAULT_KR_C 1 #define BLIS_DEFAULT_KR_Z 1 +// -- Register blocksize extensions (for packed micro-panels) -- + +// NOTE: These register blocksize "extensions" determine whether the +// leading dimensions used within the packed micro-panels are equal to +// or greater than their corresponding register blocksizes above. + +#define BLIS_EXTEND_MR_S 0 +#define BLIS_EXTEND_NR_S 0 + +#define BLIS_EXTEND_MR_D 0 +#define BLIS_EXTEND_NR_D 0 + +#define BLIS_EXTEND_MR_C 0 +#define BLIS_EXTEND_NR_C 0 + +#define BLIS_EXTEND_MR_Z 0 +#define BLIS_EXTEND_NR_Z 0 + +// Register blocksize extensions in the k dimension are not used. 
+ +#define BLIS_EXTEND_KR_S 0 +#define BLIS_EXTEND_KR_D 0 +#define BLIS_EXTEND_KR_C 0 +#define BLIS_EXTEND_KR_Z 0 + // -- Number of elements per vector register -- // NOTE: These constants are typically only used to determine the amount diff --git a/frame/1/packv/bli_packv_init.c b/frame/1/packv/bli_packv_init.c index 34e3f8cd3..8bc1532a6 100644 --- a/frame/1/packv/bli_packv_init.c +++ b/frame/1/packv/bli_packv_init.c @@ -175,7 +175,7 @@ void bli_packv_init_pack( pack_t pack_schema, } // Save the padded (packed) dimensions into the packed object. - bli_obj_set_packed_dims( m_p_pad, 1, *p ); + bli_obj_set_padded_dims( m_p_pad, 1, *p ); // Grab the buffer address from the mem_t object and copy it to the // main object buffer field. (Sometimes this buffer address will be @@ -193,7 +193,7 @@ void bli_packv_init_pack( pack_t pack_schema, // how much space beyond the vector would need to be zero-padded, if // zero-padding was needed. rs_p = 1; - cs_p = bli_obj_packed_length( *p ); + cs_p = bli_obj_padded_length( *p ); bli_obj_set_incs( rs_p, cs_p, *p ); } diff --git a/frame/1m/packm/bli_packm_blk_var2.c b/frame/1m/packm/bli_packm_blk_var2.c index 7468192fa..ad4e67e09 100644 --- a/frame/1m/packm/bli_packm_blk_var2.c +++ b/frame/1m/packm/bli_packm_blk_var2.c @@ -48,7 +48,8 @@ typedef void (*FUNCPTR_T)( dim_t n_max, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p + void* p, inc_t rs_p, inc_t cs_p, + dim_t pd_p, inc_t ps_p ); static FUNCPTR_T GENARRAY(ftypes,packm_blk_var2); @@ -68,8 +69,8 @@ void bli_packm_blk_var2( obj_t* beta, dim_t m_p = bli_obj_length( *p ); dim_t n_p = bli_obj_width( *p ); - dim_t m_max_p = bli_obj_packed_length( *p ); - dim_t n_max_p = bli_obj_packed_width( *p ); + dim_t m_max_p = bli_obj_padded_length( *p ); + dim_t n_max_p = bli_obj_padded_width( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); @@ -78,6 +79,7 @@ void bli_packm_blk_var2( obj_t* beta, void* buf_p = 
bli_obj_buffer_at_off( *p ); inc_t rs_p = bli_obj_row_stride( *p ); inc_t cs_p = bli_obj_col_stride( *p ); + dim_t pd_p = bli_obj_panel_dim( *p ); inc_t ps_p = bli_obj_panel_stride( *p ); void* buf_beta = bli_obj_scalar_buffer( dt_cp, *beta ); @@ -100,7 +102,8 @@ void bli_packm_blk_var2( obj_t* beta, n_max_p, buf_beta, buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, ps_p ); + buf_p, rs_p, cs_p, + pd_p, ps_p ); } @@ -119,7 +122,8 @@ void PASTEMAC(ch,varname )( \ dim_t n_max, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p \ ) \ { \ ctype* restrict beta_cast = beta; \ @@ -190,12 +194,12 @@ void PASTEMAC(ch,varname )( \ /* Prepare to pack to column panels. */ \ iter_dim = n; \ panel_len = m; \ - panel_dim = rs_p; \ + panel_dim = pd_p; \ incc = cs_c; \ ldc = rs_c; \ vs_c = cs_c; \ diagoffc_inc = -( doff_t)panel_dim; \ - ldp = panel_dim; \ + ldp = rs_p; \ m_panel = &m; \ n_panel = &panel_dim_i; \ m_panel_max = m_max; \ @@ -206,12 +210,12 @@ void PASTEMAC(ch,varname )( \ /* Prepare to pack to row panels. 
*/ \ iter_dim = m; \ panel_len = n; \ - panel_dim = cs_p; \ + panel_dim = pd_p; \ incc = rs_c; \ ldc = cs_c; \ vs_c = rs_c; \ diagoffc_inc = ( doff_t )panel_dim; \ - ldp = panel_dim; \ + ldp = cs_p; \ m_panel = &panel_dim_i; \ n_panel = &n; \ m_panel_max = panel_dim; \ @@ -433,7 +437,7 @@ void PASTEMAC(ch,varname )( \ /* if ( rs_p == 1 ) \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: a copied", m_panel_max, n_panel_max, \ - p_begin, 1, panel_dim, "%4.1f", "" ); \ + p_begin, 1, cs_p, "%4.1f", "" ); \ if ( cs_p == 1 ) \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: b copied", m_panel_max, n_panel_max, \ p_begin, panel_dim, 1, "%6.3f", "" ); \ diff --git a/frame/1m/packm/bli_packm_blk_var2.h b/frame/1m/packm/bli_packm_blk_var2.h index c5d3acca0..8022f7ff6 100644 --- a/frame/1m/packm/bli_packm_blk_var2.h +++ b/frame/1m/packm/bli_packm_blk_var2.h @@ -52,7 +52,8 @@ void PASTEMAC(ch,varname)( \ dim_t n_max, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p \ ); INSERT_GENTPROT_BASIC( packm_blk_var2 ) diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c index 3a81fa139..de5c7074b 100644 --- a/frame/1m/packm/bli_packm_blk_var3.c +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -51,7 +51,8 @@ typedef void (*FUNCPTR_T)( dim_t n_max, void* beta, void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p + void* p, inc_t rs_p, inc_t cs_p, + dim_t pd_p, inc_t ps_p ); static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3); @@ -74,8 +75,8 @@ void bli_packm_blk_var3( obj_t* beta, dim_t m_p = bli_obj_length( *p ); dim_t n_p = bli_obj_width( *p ); - dim_t m_max_p = bli_obj_packed_length( *p ); - dim_t n_max_p = bli_obj_packed_width( *p ); + dim_t m_max_p = bli_obj_padded_length( *p ); + dim_t n_max_p = bli_obj_padded_width( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); @@ -84,6 +85,7 @@ 
void bli_packm_blk_var3( obj_t* beta, void* buf_p = bli_obj_buffer_at_off( *p ); inc_t rs_p = bli_obj_row_stride( *p ); inc_t cs_p = bli_obj_col_stride( *p ); + dim_t pd_p = bli_obj_panel_dim( *p ); inc_t ps_p = bli_obj_panel_stride( *p ); void* buf_beta = bli_obj_scalar_buffer( dt_cp, *beta ); @@ -109,7 +111,8 @@ void bli_packm_blk_var3( obj_t* beta, n_max_p, buf_beta, buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, ps_p ); + buf_p, rs_p, cs_p, + pd_p, ps_p ); } @@ -131,7 +134,8 @@ void PASTEMAC(ch,varname )( \ dim_t n_max, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p \ ) \ { \ ctype* restrict beta_cast = beta; \ @@ -159,7 +163,7 @@ void PASTEMAC(ch,varname )( \ dim_t panel_off_i; \ inc_t vs_c; \ inc_t incc, ldc; \ - inc_t p_inc; \ + inc_t ldp, p_inc; \ dim_t* m_panel; \ dim_t* n_panel; \ dim_t m_panel_use; \ @@ -199,11 +203,12 @@ void PASTEMAC(ch,varname )( \ iter_dim = n; \ panel_len = m; \ panel_len_max = m_max; \ - panel_dim = rs_p; \ + panel_dim = pd_p; \ incc = cs_c; \ ldc = rs_c; \ vs_c = cs_c; \ diagoffc_inc = -( doff_t)panel_dim; \ + ldp = rs_p; \ m_panel = &m; \ n_panel = &panel_dim_i; \ } \ @@ -213,11 +218,12 @@ void PASTEMAC(ch,varname )( \ iter_dim = m; \ panel_len = n; \ panel_len_max = n_max; \ - panel_dim = cs_p; \ + panel_dim = pd_p; \ incc = rs_c; \ ldc = cs_c; \ vs_c = rs_c; \ diagoffc_inc = ( doff_t )panel_dim; \ + ldp = cs_p; \ m_panel = &panel_dim_i; \ n_panel = &n; \ } \ @@ -303,7 +309,7 @@ void PASTEMAC(ch,varname )( \ panel_len_i, \ beta_cast, \ c_use, incc, ldc, \ - p_use, panel_dim ); \ + p_use, ldp ); \ \ /* If the diagonal of C is implicitly unit, set the diagonal of the packed panel to unit. 
*/ \ @@ -351,7 +357,7 @@ void PASTEMAC(ch,varname )( \ p_use, rs_p, cs_p ); \ } \ \ - p_inc = panel_dim * panel_len_max_i; \ + p_inc = ldp * panel_len_max_i; \ } \ else \ { \ @@ -369,9 +375,9 @@ void PASTEMAC(ch,varname )( \ panel_len_i, \ beta_cast, \ c_use, incc, ldc, \ - p_use, panel_dim ); \ + p_use, ldp ); \ \ - p_inc = panel_dim * panel_len_max_i; \ + p_inc = ldp * panel_len_max_i; \ } \ \ /* If necessary, zero-pad at the edge of the panel dimension (ie: the @@ -382,7 +388,7 @@ void PASTEMAC(ch,varname )( \ dim_t m_edge = panel_dim - i; \ dim_t n_edge = panel_len_max_i; \ inc_t rs_pe = 1; \ - inc_t cs_pe = panel_dim; \ + inc_t cs_pe = ldp; \ ctype* p_edge = p_begin + (i )*rs_pe; \ \ PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ @@ -402,7 +408,7 @@ void PASTEMAC(ch,varname )( \ dim_t m_edge = panel_dim; \ dim_t n_edge = panel_len_max_i - j; \ inc_t rs_pe = 1; \ - inc_t cs_pe = panel_dim; \ + inc_t cs_pe = ldp; \ ctype* p_edge = p_begin + (j )*cs_pe; \ \ PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ @@ -427,7 +433,7 @@ void PASTEMAC(ch,varname )( \ dim_t m_br = panel_dim - i; \ dim_t n_br = panel_len_max_i - j; \ inc_t rs_pe = 1; \ - inc_t cs_pe = panel_dim; \ + inc_t cs_pe = ldp; \ ctype* p_edge = p_begin + (i )*rs_pe + (j )*cs_pe; \ \ PASTEMAC2(ch,ch,setd_unb_var1)( 0, \ diff --git a/frame/1m/packm/bli_packm_blk_var3.h b/frame/1m/packm/bli_packm_blk_var3.h index 0cb8e9be8..dd0ca1a09 100644 --- a/frame/1m/packm/bli_packm_blk_var3.h +++ b/frame/1m/packm/bli_packm_blk_var3.h @@ -55,7 +55,8 @@ void PASTEMAC(ch,varname)( \ dim_t n_max, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p \ ); INSERT_GENTPROT_BASIC( packm_blk_var3 ) diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index bb3f71001..66ee98177 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -50,6 +50,9 @@ packm_t* packm_cntl_scale; blksz_t* 
packm_mult_ldim; blksz_t* packm_mult_nvec; +blksz_t* packm_mult_mext; +blksz_t* packm_mult_next; + void bli_packm_cntl_init() { // Create blocksize objects for m and n register blocking. We will attach @@ -70,6 +73,11 @@ void bli_packm_cntl_init() BLIS_DEFAULT_NR_C, BLIS_DEFAULT_NR_Z ); + // Create blocksize extensions that simply contain zero, as these + // fields are not used except by level-3 operations. + packm_mult_mext = bli_blksz_obj_create( 0, 0, 0, 0 ); + packm_mult_next = bli_blksz_obj_create( 0, 0, 0, 0 ); + // Generally speaking, the BLIS_PACKED_ROWS and BLIS_PACKED_COLUMNS // are used by the level-2 operations, and thus densification is not // necessary. These schemas amount to simple copies to row or column @@ -89,7 +97,9 @@ void bli_packm_cntl_init() bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, // When packing to rows: packm_mult_nvec, // - nvec multiple is used for m dimension + packm_mult_mext, // - m extension is zero / unused packm_mult_ldim, // - ldim multiple is used for n dimension + packm_mult_next, // - n extension is zero / unused FALSE, // do NOT scale FALSE, // do NOT densify structure FALSE, // do NOT invert diagonal @@ -102,7 +112,9 @@ void bli_packm_cntl_init() bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, // When packing to rows: packm_mult_nvec, // - nvec multiple is used for m dimension + packm_mult_mext, // - m extension is zero / unused packm_mult_ldim, // - ldim multiple is used for n dimension + packm_mult_next, // - n extension is zero / unused TRUE, // do scale FALSE, // do NOT densify structure FALSE, // do NOT invert diagonal @@ -118,7 +130,9 @@ void bli_packm_cntl_init() bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, // When packing to columns: packm_mult_ldim, // - ldim multiple is used for m dimension + packm_mult_mext, // - m extension is zero / unused packm_mult_nvec, // - nvec multiple is used for n dimension + packm_mult_next, // - n extension is zero / unused FALSE, // do NOT scale 
FALSE, // do NOT densify structure FALSE, // do NOT invert diagonal @@ -131,7 +145,9 @@ void bli_packm_cntl_init() bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, // When packing to columns: packm_mult_ldim, // - ldim multiple is used for m dimension + packm_mult_mext, // - m extension is zero / unused packm_mult_nvec, // - nvec multiple is used for n dimension + packm_mult_next, // - n extension is zero / unused TRUE, // do scale FALSE, // do NOT densify structure FALSE, // do NOT invert diagonal @@ -141,64 +157,6 @@ void bli_packm_cntl_init() BLIS_BUFFER_FOR_GEN_USE ); - // Create control trees to pack by row panels (with and without scaling). - packm_cntl_rpn_noscale - = - bli_packm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, // When packing to row panels: - packm_mult_nvec, // - nvec multiple is used for panel length - packm_mult_ldim, // - ldim multiple is used for panel width - FALSE, // do NOT scale - TRUE, // densify structure - FALSE, // do NOT invert diagonal - FALSE, // do NOT iterate backwards if upper - FALSE, // do NOT iterate backwards if lower - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_GEN_USE ); - packm_cntl_rpn_scale - = - bli_packm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, // When packing to row panels: - packm_mult_nvec, // - nvec multiple is used for panel length - packm_mult_ldim, // - ldim multiple is used for panel width - TRUE, // do scale - TRUE, // densify structure - FALSE, // do NOT invert diagonal - FALSE, // do NOT iterate backwards if upper - FALSE, // do NOT iterate backwards if lower - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_GEN_USE ); - - - // Create control trees to pack by column panels (with and without scaling). 
- packm_cntl_cpn_noscale - = - bli_packm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, // When packing to column panels: - packm_mult_ldim, // - ldim multiple is used for panel length - packm_mult_nvec, // - nvec multiple is used for panel width - FALSE, // do NOT scale - TRUE, // densify structure - FALSE, // do NOT invert diagonal - FALSE, // do NOT iterate backwards if upper - FALSE, // do NOT iterate backwards if lower - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_GEN_USE ); - packm_cntl_cpn_scale - = - bli_packm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, // When packing to column panels: - packm_mult_ldim, // - ldim multiple is used for panel length - packm_mult_nvec, // - nvec multiple is used for panel width - TRUE, // do scale - TRUE, // densify structure - FALSE, // do NOT invert diagonal - FALSE, // do NOT iterate backwards if upper - FALSE, // do NOT iterate backwards if lower - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_GEN_USE ); - - // Set defaults when we don't care whether the packing is by rows or // by columns. 
packm_cntl_noscale = packm_cntl_col_noscale; @@ -212,19 +170,16 @@ void bli_packm_cntl_finalize() bli_cntl_obj_free( packm_cntl_col_noscale ); bli_cntl_obj_free( packm_cntl_col_scale ); - bli_cntl_obj_free( packm_cntl_rpn_noscale ); - bli_cntl_obj_free( packm_cntl_rpn_scale ); - bli_cntl_obj_free( packm_cntl_cpn_noscale ); - bli_cntl_obj_free( packm_cntl_cpn_scale ); - bli_blksz_obj_free( packm_mult_ldim ); bli_blksz_obj_free( packm_mult_nvec ); } packm_t* bli_packm_cntl_obj_create( impl_t impl_type, varnum_t var_num, - blksz_t* mult_m, - blksz_t* mult_n, + blksz_t* mr_def, + blksz_t* mr_ext, + blksz_t* nr_def, + blksz_t* nr_ext, bool_t does_scale, bool_t does_densify, bool_t does_invert_diag, @@ -239,8 +194,10 @@ packm_t* bli_packm_cntl_obj_create( impl_t impl_type, cntl->impl_type = impl_type; cntl->var_num = var_num; - cntl->mult_m = mult_m; - cntl->mult_n = mult_n; + cntl->mr_def = mr_def; + cntl->mr_ext = mr_ext; + cntl->nr_def = nr_def; + cntl->nr_ext = nr_ext; cntl->does_scale = does_scale; cntl->does_densify = does_densify; cntl->does_invert_diag = does_invert_diag; @@ -255,8 +212,10 @@ packm_t* bli_packm_cntl_obj_create( impl_t impl_type, void bli_packm_cntl_obj_init( packm_t* cntl, impl_t impl_type, varnum_t var_num, - blksz_t* mult_m, - blksz_t* mult_n, + blksz_t* mr_def, + blksz_t* mr_ext, + blksz_t* nr_def, + blksz_t* nr_ext, bool_t does_scale, bool_t does_densify, bool_t does_invert_diag, @@ -267,8 +226,10 @@ void bli_packm_cntl_obj_init( packm_t* cntl, { cntl->impl_type = impl_type; cntl->var_num = var_num; - cntl->mult_m = mult_m; - cntl->mult_n = mult_n; + cntl->mr_def = mr_def; + cntl->mr_ext = mr_ext; + cntl->nr_def = nr_def; + cntl->nr_ext = nr_ext; cntl->does_scale = does_scale; cntl->does_densify = does_densify; cntl->does_invert_diag = does_invert_diag; diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index e4cfa2160..bc8c15c38 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ 
-36,8 +36,10 @@ struct packm_s { impl_t impl_type; varnum_t var_num; - blksz_t* mult_m; - blksz_t* mult_n; + blksz_t* mr_def; + blksz_t* mr_ext; + blksz_t* nr_def; + blksz_t* nr_ext; bool_t does_scale; bool_t does_densify; bool_t does_invert_diag; @@ -48,8 +50,10 @@ struct packm_s }; typedef struct packm_s packm_t; -#define cntl_mult_m( cntl ) cntl->mult_m -#define cntl_mult_n( cntl ) cntl->mult_n +#define cntl_mr_def( cntl ) cntl->mr_def +#define cntl_mr_ext( cntl ) cntl->mr_ext +#define cntl_nr_def( cntl ) cntl->nr_def +#define cntl_nr_ext( cntl ) cntl->nr_ext #define cntl_does_scale( cntl ) cntl->does_scale #define cntl_does_densify( cntl ) cntl->does_densify @@ -71,8 +75,10 @@ void bli_packm_cntl_init( void ); void bli_packm_cntl_finalize( void ); packm_t* bli_packm_cntl_obj_create( impl_t impl_type, varnum_t var_num, - blksz_t* mult_m, - blksz_t* mult_n, + blksz_t* mr_def, + blksz_t* mr_ext, + blksz_t* nr_def, + blksz_t* nr_ext, bool_t does_scale, bool_t does_densify, bool_t does_invert_diag, @@ -83,8 +89,10 @@ packm_t* bli_packm_cntl_obj_create( impl_t impl_type, void bli_packm_cntl_obj_init( packm_t* cntl, impl_t impl_type, varnum_t var_num, - blksz_t* mult_m, - blksz_t* mult_n, + blksz_t* mr_def, + blksz_t* mr_ext, + blksz_t* nr_def, + blksz_t* nr_ext, bool_t does_scale, bool_t does_densify, bool_t does_invert_diag, diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index e942a6b0d..64e67b34f 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -50,8 +50,10 @@ void bli_packm_init( obj_t* a, packord_t pack_ord_if_up; packord_t pack_ord_if_lo; packbuf_t pack_buf_type; - blksz_t* mult_m; - blksz_t* mult_n; + blksz_t* mr_def; + blksz_t* mr_ext; + blksz_t* nr_def; + blksz_t* nr_ext; obj_t c; // Check parameters. 
@@ -126,8 +128,10 @@ void bli_packm_init( obj_t* a, needs_densify = cntl_does_densify( cntl ); pack_schema = cntl_pack_schema( cntl ); pack_buf_type = cntl_pack_buf_type( cntl ); - mult_m = cntl_mult_m( cntl ); - mult_n = cntl_mult_n( cntl ); + mr_def = cntl_mr_def( cntl ); + mr_ext = cntl_mr_ext( cntl ); + nr_def = cntl_nr_def( cntl ); + nr_ext = cntl_nr_ext( cntl ); if ( cntl_does_invert_diag( cntl ) ) invert_diag = BLIS_INVERT_DIAG; else invert_diag = BLIS_NO_INVERT_DIAG; @@ -145,8 +149,8 @@ void bli_packm_init( obj_t* a, pack_ord_if_up, pack_ord_if_lo, pack_buf_type, - mult_m, - mult_n, + mr_def, mr_ext, + nr_def, nr_ext, &c, p ); @@ -160,8 +164,10 @@ void bli_packm_init_pack( bool_t densify, packord_t pack_ord_if_up, packord_t pack_ord_if_lo, packbuf_t pack_buf_type, - blksz_t* mult_m, - blksz_t* mult_n, + blksz_t* mr_def, + blksz_t* mr_ext, + blksz_t* nr_def, + blksz_t* nr_ext, obj_t* c, obj_t* p ) { @@ -169,8 +175,13 @@ void bli_packm_init_pack( bool_t densify, trans_t transc = bli_obj_trans_status( *c ); dim_t m_c = bli_obj_length( *c ); dim_t n_c = bli_obj_width( *c ); - dim_t mult_m_dim = bli_blksz_for_type( datatype, mult_m ); - dim_t mult_n_dim = bli_blksz_for_type( datatype, mult_n ); + dim_t mr_def_dim = bli_blksz_for_type( datatype, mr_def ); + dim_t mr_ext_dim = bli_blksz_for_type( datatype, mr_ext ); + dim_t nr_def_dim = bli_blksz_for_type( datatype, nr_def ); + dim_t nr_ext_dim = bli_blksz_for_type( datatype, nr_ext ); + + dim_t mr_pack_dim = mr_def_dim + mr_ext_dim; + dim_t nr_pack_dim = nr_def_dim + nr_ext_dim; mem_t* mem_p; dim_t m_p_pad, n_p_pad; @@ -227,13 +238,13 @@ void bli_packm_init_pack( bool_t densify, // in p) and aligning them to the dimension multiples (typically equal // to register blocksizes). This does waste a little bit of space for // level-2 operations, but that's okay with us. 
- m_p_pad = bli_align_dim_to_mult( bli_obj_length( *p ), mult_m_dim ); - n_p_pad = bli_align_dim_to_mult( bli_obj_width( *p ), mult_n_dim ); + m_p_pad = bli_align_dim_to_mult( bli_obj_length( *p ), mr_def_dim ); + n_p_pad = bli_align_dim_to_mult( bli_obj_width( *p ), nr_def_dim ); // Save the padded dimensions into the packed object. It is important // to save these dimensions since they represent the actual dimensions // of the zero-padded matrix. - bli_obj_set_packed_dims( m_p_pad, n_p_pad, *p ); + bli_obj_set_padded_dims( m_p_pad, n_p_pad, *p ); // Now we prepare to compute strides, align them, and compute the // total number of bytes needed for the packed buffer. After that, @@ -294,13 +305,13 @@ void bli_packm_init_pack( bool_t densify, dim_t ps_p; // The maximum panel length (for each datatype) should be equal to - // the m dimension multiple. - m_panel = mult_m_dim; + // the register blocksize in the m dimension. + m_panel = mr_def_dim; // The "column stride" of a row panel packed object is interpreted as // the column stride WITHIN a panel. Thus, this is equal to the panel - // length. - cs_p = m_panel; + // dimension plus an extension (which may be zero). + cs_p = mr_pack_dim; // The "row stride" of a row panel packed object is interpreted // as the row stride WITHIN a panel. Thus, it is unit. @@ -319,8 +330,9 @@ void bli_packm_init_pack( bool_t densify, ps_p = bli_align_dim_to_size( ps_p, elem_size_p, BLIS_CONTIG_STRIDE_ALIGN_SIZE ); - // Store the strides in p. + // Store the strides and panel dimension in p. bli_obj_set_incs( rs_p, cs_p, *p ); + bli_obj_set_panel_dim( m_panel, *p ); bli_obj_set_panel_stride( ps_p, *p ); // Compute the size of the packed buffer. @@ -332,13 +344,13 @@ void bli_packm_init_pack( bool_t densify, dim_t ps_p; // The maximum panel width (for each datatype) should be equal to - // the n dimension multiple. - n_panel = mult_n_dim; + // the register blocksize in the n dimension. 
+ n_panel = nr_def_dim; // The "row stride" of a column panel packed object is interpreted as - // the row stride WITHIN a panel. Thus, it is equal to the panel - // width. - rs_p = n_panel; + // the row stride WITHIN a panel. Thus, this is equal to the panel + // dimension plus an extension (which may be zero). + rs_p = nr_pack_dim; // The "column stride" of a column panel packed object is interpreted // as the column stride WITHIN a panel. Thus, it is unit. @@ -357,8 +369,9 @@ void bli_packm_init_pack( bool_t densify, ps_p = bli_align_dim_to_size( ps_p, elem_size_p, BLIS_CONTIG_STRIDE_ALIGN_SIZE ); - // Store the strides in p. + // Store the strides and panel dimension in p. bli_obj_set_incs( rs_p, cs_p, *p ); + bli_obj_set_panel_dim( n_panel, *p ); bli_obj_set_panel_stride( ps_p, *p ); // Compute the size of the packed buffer. diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index 98553af66..660828c2f 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -42,8 +42,10 @@ void bli_packm_init_pack( bool_t densify, packord_t pack_ord_if_up, packord_t pack_ord_if_lo, packbuf_t pack_buf_type, - blksz_t* mult_m, - blksz_t* mult_n, + blksz_t* mr_def, + blksz_t* mr_ext, + blksz_t* nr_def, + blksz_t* nr_ext, obj_t* c, obj_t* p ); diff --git a/frame/1m/packm/bli_packm_part.c b/frame/1m/packm/bli_packm_part.c index 84b60675d..966184afb 100644 --- a/frame/1m/packm/bli_packm_part.c +++ b/frame/1m/packm/bli_packm_part.c @@ -76,7 +76,7 @@ void bli_packm_acquire_mpart_t2b( subpart_t requested_part, // Modify offsets and dimensions of requested partition. bli_obj_set_dims( b, n, *sub_obj ); - // Tweak the packed length of the subpartition to trick the underlying + // Tweak the padded length of the subpartition to trick the underlying // implementation into only zero-padding for the narrow submatrix of // interest. 
Usually, the value we want is b (for non-edge cases), but // at the edges, we want the remainder of the mem_t region in the m @@ -86,13 +86,13 @@ void bli_packm_acquire_mpart_t2b( subpart_t requested_part, // b for the edge iteration). In these cases, we arrive at the new // packed length by simply subtracting off i. { - dim_t m_pack_max = bli_obj_packed_length( *sub_obj ); + dim_t m_pack_max = bli_obj_padded_length( *sub_obj ); dim_t m_pack_cur; if ( i + b == m ) m_pack_cur = m_pack_max - i; else m_pack_cur = b; - bli_obj_set_packed_length( m_pack_cur, *sub_obj ); + bli_obj_set_padded_length( m_pack_cur, *sub_obj ); } // Translate the desired offsets to a panel offset and adjust the @@ -152,7 +152,7 @@ void bli_packm_acquire_mpart_l2r( subpart_t requested_part, // Modify offsets and dimensions of requested partition. bli_obj_set_dims( m, b, *sub_obj ); - // Tweak the packed width of the subpartition to trick the underlying + // Tweak the padded width of the subpartition to trick the underlying // implementation into only zero-padding for the narrow submatrix of // interest. Usually, the value we want is b (for non-edge cases), but // at the edges, we want the remainder of the mem_t region in the n @@ -162,13 +162,13 @@ void bli_packm_acquire_mpart_l2r( subpart_t requested_part, // b for the edge iteration). In these cases, we arrive at the new // packed width by simply subtracting off j. 
{ - dim_t n_pack_max = bli_obj_packed_width( *sub_obj ); + dim_t n_pack_max = bli_obj_padded_width( *sub_obj ); dim_t n_pack_cur; if ( j + b == n ) n_pack_cur = n_pack_max - j; else n_pack_cur = b; - bli_obj_set_packed_width( n_pack_cur, *sub_obj ); + bli_obj_set_padded_width( n_pack_cur, *sub_obj ); } // Translate the desired offsets to a panel offset and adjust the diff --git a/frame/1m/packm/bli_packm_unb_var1.c b/frame/1m/packm/bli_packm_unb_var1.c index 1cb29fb5c..be0627f39 100644 --- a/frame/1m/packm/bli_packm_unb_var1.c +++ b/frame/1m/packm/bli_packm_unb_var1.c @@ -70,8 +70,8 @@ void bli_packm_unb_var1( obj_t* beta, dim_t m_p = bli_obj_length( *p ); dim_t n_p = bli_obj_width( *p ); - dim_t m_max_p = bli_obj_packed_length( *p ); - dim_t n_max_p = bli_obj_packed_width( *p ); + dim_t m_max_p = bli_obj_padded_length( *p ); + dim_t n_max_p = bli_obj_padded_width( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); diff --git a/frame/1m/packm/old/bli_packm_blk_var1.c b/frame/1m/packm/old/bli_packm_blk_var1.c index 33a1ea265..7a0d3bfae 100644 --- a/frame/1m/packm/old/bli_packm_blk_var1.c +++ b/frame/1m/packm/old/bli_packm_blk_var1.c @@ -71,8 +71,8 @@ void bli_packm_blk_var1( obj_t* beta, dim_t m_p = bli_obj_length( *p ); dim_t n_p = bli_obj_width( *p ); - dim_t m_max_p = bli_obj_packed_length( *p ); - dim_t n_max_p = bli_obj_packed_width( *p ); + dim_t m_max_p = bli_obj_padded_length( *p ); + dim_t n_max_p = bli_obj_padded_width( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 286f1e956..8ed26fb18 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -54,87 +54,64 @@ blksz_t* gemm_kc; blksz_t* gemm_mr; blksz_t* gemm_nr; blksz_t* gemm_kr; +blksz_t* gemm_extmr; +blksz_t* gemm_extnr; +blksz_t* gemm_extkr; blksz_t* gemm_ni; -// Cache blocksizes. 
- -#define BLIS_GEMM_KC_S BLIS_DEFAULT_KC_S -#define BLIS_GEMM_KC_D BLIS_DEFAULT_KC_D -#define BLIS_GEMM_KC_C BLIS_DEFAULT_KC_C -#define BLIS_GEMM_KC_Z BLIS_DEFAULT_KC_Z - -#define BLIS_GEMM_MC_S BLIS_DEFAULT_MC_S -#define BLIS_GEMM_MC_D BLIS_DEFAULT_MC_D -#define BLIS_GEMM_MC_C BLIS_DEFAULT_MC_C -#define BLIS_GEMM_MC_Z BLIS_DEFAULT_MC_Z - -#define BLIS_GEMM_NC_S BLIS_DEFAULT_NC_S -#define BLIS_GEMM_NC_D BLIS_DEFAULT_NC_D -#define BLIS_GEMM_NC_C BLIS_DEFAULT_NC_C -#define BLIS_GEMM_NC_Z BLIS_DEFAULT_NC_Z - -// Register blocking - -#define BLIS_GEMM_KR_S BLIS_DEFAULT_KR_S -#define BLIS_GEMM_KR_D BLIS_DEFAULT_KR_D -#define BLIS_GEMM_KR_C BLIS_DEFAULT_KR_C -#define BLIS_GEMM_KR_Z BLIS_DEFAULT_KR_Z - -#define BLIS_GEMM_MR_S BLIS_DEFAULT_MR_S -#define BLIS_GEMM_MR_D BLIS_DEFAULT_MR_D -#define BLIS_GEMM_MR_C BLIS_DEFAULT_MR_C -#define BLIS_GEMM_MR_Z BLIS_DEFAULT_MR_Z - -#define BLIS_GEMM_NR_S BLIS_DEFAULT_NR_S -#define BLIS_GEMM_NR_D BLIS_DEFAULT_NR_D -#define BLIS_GEMM_NR_C BLIS_DEFAULT_NR_C -#define BLIS_GEMM_NR_Z BLIS_DEFAULT_NR_Z - -// Incremental pack blocking - -#define BLIS_GEMM_NI_S BLIS_DEFAULT_NI_S -#define BLIS_GEMM_NI_D BLIS_DEFAULT_NI_D -#define BLIS_GEMM_NI_C BLIS_DEFAULT_NI_C -#define BLIS_GEMM_NI_Z BLIS_DEFAULT_NI_Z - void bli_gemm_cntl_init() { // Create blocksize objects for each dimension. 
- gemm_mc = bli_blksz_obj_create( BLIS_GEMM_MC_S, - BLIS_GEMM_MC_D, - BLIS_GEMM_MC_C, - BLIS_GEMM_MC_Z ); + gemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, + BLIS_DEFAULT_MC_D, + BLIS_DEFAULT_MC_C, + BLIS_DEFAULT_MC_Z ); - gemm_nc = bli_blksz_obj_create( BLIS_GEMM_NC_S, - BLIS_GEMM_NC_D, - BLIS_GEMM_NC_C, - BLIS_GEMM_NC_Z ); + gemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, + BLIS_DEFAULT_NC_D, + BLIS_DEFAULT_NC_C, + BLIS_DEFAULT_NC_Z ); - gemm_kc = bli_blksz_obj_create( BLIS_GEMM_KC_S, - BLIS_GEMM_KC_D, - BLIS_GEMM_KC_C, - BLIS_GEMM_KC_Z ); + gemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, + BLIS_DEFAULT_KC_D, + BLIS_DEFAULT_KC_C, + BLIS_DEFAULT_KC_Z ); - gemm_mr = bli_blksz_obj_create( BLIS_GEMM_MR_S, - BLIS_GEMM_MR_D, - BLIS_GEMM_MR_C, - BLIS_GEMM_MR_Z ); + gemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, + BLIS_DEFAULT_MR_D, + BLIS_DEFAULT_MR_C, + BLIS_DEFAULT_MR_Z ); - gemm_nr = bli_blksz_obj_create( BLIS_GEMM_NR_S, - BLIS_GEMM_NR_D, - BLIS_GEMM_NR_C, - BLIS_GEMM_NR_Z ); + gemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, + BLIS_DEFAULT_NR_D, + BLIS_DEFAULT_NR_C, + BLIS_DEFAULT_NR_Z ); - gemm_kr = bli_blksz_obj_create( BLIS_GEMM_KR_S, - BLIS_GEMM_KR_D, - BLIS_GEMM_KR_C, - BLIS_GEMM_KR_Z ); + gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, + BLIS_DEFAULT_KR_D, + BLIS_DEFAULT_KR_C, + BLIS_DEFAULT_KR_Z ); - gemm_ni = bli_blksz_obj_create( BLIS_GEMM_NI_S, - BLIS_GEMM_NI_D, - BLIS_GEMM_NI_C, - BLIS_GEMM_NI_Z ); + gemm_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S, + BLIS_EXTEND_MR_D, + BLIS_EXTEND_MR_C, + BLIS_EXTEND_MR_Z ); + + gemm_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S, + BLIS_EXTEND_NR_D, + BLIS_EXTEND_NR_C, + BLIS_EXTEND_NR_Z ); + + gemm_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S, + BLIS_EXTEND_KR_D, + BLIS_EXTEND_KR_C, + BLIS_EXTEND_KR_Z ); + + gemm_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S, + BLIS_DEFAULT_NI_D, + BLIS_DEFAULT_NI_C, + BLIS_DEFAULT_NI_Z ); // Create control tree objects for packm operations on a, b, and c. 
@@ -142,8 +119,8 @@ void bli_gemm_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - gemm_mr, - gemm_kr, + gemm_mr, gemm_extmr, + gemm_kr, gemm_extkr, FALSE, // do NOT scale by alpha FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal @@ -156,8 +133,8 @@ void bli_gemm_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - gemm_kr, - gemm_nr, + gemm_kr, gemm_extkr, + gemm_nr, gemm_extnr, FALSE, // do NOT scale by alpha FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal @@ -170,8 +147,8 @@ void bli_gemm_cntl_init() = bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, - gemm_mr, - gemm_nr, + gemm_mr, gemm_extmr, + gemm_nr, gemm_extnr, FALSE, // do NOT scale by beta FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 96727e9fd..fdb7d03e4 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -144,7 +144,7 @@ void PASTEMAC(ch,varname)( \ ) \ { \ /* Temporary buffer for duplicating elements of B. */ \ - ctype bd[ PASTEMAC(ch,kc) * \ + ctype bd[ PASTEMAC(ch,maxkc) * \ PASTEMAC(ch,nr) * \ PASTEMAC(ch,ndup) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ @@ -239,10 +239,15 @@ void PASTEMAC(ch,varname)( \ columns of B to a local buffer with each value duplicated. */ \ if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \ else bp = b1; \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ +/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ +\ /* Invoke the gemm micro-kernel. 
*/ \ PASTEMAC(ch,ukrname)( k, \ alpha_cast, \ diff --git a/frame/3/gemm/ukernels/bli_gemm_ref_mxn.c b/frame/3/gemm/ukernels/bli_gemm_ref_mxn.c index d0545bf4f..d7213278c 100644 --- a/frame/3/gemm/ukernels/bli_gemm_ref_mxn.c +++ b/frame/3/gemm/ukernels/bli_gemm_ref_mxn.c @@ -47,22 +47,20 @@ void PASTEMAC(ch,varname)( \ ctype* restrict c, inc_t rs_c, inc_t cs_c \ ) \ { \ - const dim_t MR = PASTEMAC(ch,mr); \ - const dim_t NR = PASTEMAC(ch,nr); \ + const dim_t m = PASTEMAC(ch,mr); \ + const dim_t n = PASTEMAC(ch,nr); \ \ - const dim_t m = MR; \ - const dim_t n = NR; \ + const inc_t cs_a = PASTEMAC(ch,packmr); \ \ - const inc_t cs_a = MR; \ -\ - const inc_t rs_b = NR; \ + const inc_t rs_b = PASTEMAC(ch,packnr); \ \ const inc_t rs_ab = 1; \ - const inc_t cs_ab = MR; \ + const inc_t cs_ab = PASTEMAC(ch,mr); \ \ dim_t k0, j0, i0; \ \ - ctype ab[ MR * NR ]; \ + ctype ab[ PASTEMAC(ch,mr) * \ + PASTEMAC(ch,nr) ]; \ ctype* restrict ab00; \ ctype a0; \ ctype b0; \ diff --git a/frame/3/hemm/bli_hemm_cntl.c b/frame/3/hemm/bli_hemm_cntl.c index 48b11fc96..2a1767649 100644 --- a/frame/3/hemm/bli_hemm_cntl.c +++ b/frame/3/hemm/bli_hemm_cntl.c @@ -54,87 +54,64 @@ blksz_t* hemm_kc; blksz_t* hemm_mr; blksz_t* hemm_nr; blksz_t* hemm_kr; +blksz_t* hemm_extmr; +blksz_t* hemm_extnr; +blksz_t* hemm_extkr; blksz_t* hemm_ni; -// Cache blocksizes. 
- -#define BLIS_HEMM_KC_S BLIS_DEFAULT_KC_S -#define BLIS_HEMM_KC_D BLIS_DEFAULT_KC_D -#define BLIS_HEMM_KC_C BLIS_DEFAULT_KC_C -#define BLIS_HEMM_KC_Z BLIS_DEFAULT_KC_Z - -#define BLIS_HEMM_MC_S BLIS_DEFAULT_MC_S -#define BLIS_HEMM_MC_D BLIS_DEFAULT_MC_D -#define BLIS_HEMM_MC_C BLIS_DEFAULT_MC_C -#define BLIS_HEMM_MC_Z BLIS_DEFAULT_MC_Z - -#define BLIS_HEMM_NC_S BLIS_DEFAULT_NC_S -#define BLIS_HEMM_NC_D BLIS_DEFAULT_NC_D -#define BLIS_HEMM_NC_C BLIS_DEFAULT_NC_C -#define BLIS_HEMM_NC_Z BLIS_DEFAULT_NC_Z - -// Register blocking - -#define BLIS_HEMM_KR_S BLIS_DEFAULT_KR_S -#define BLIS_HEMM_KR_D BLIS_DEFAULT_KR_D -#define BLIS_HEMM_KR_C BLIS_DEFAULT_KR_C -#define BLIS_HEMM_KR_Z BLIS_DEFAULT_KR_Z - -#define BLIS_HEMM_MR_S BLIS_DEFAULT_MR_S -#define BLIS_HEMM_MR_D BLIS_DEFAULT_MR_D -#define BLIS_HEMM_MR_C BLIS_DEFAULT_MR_C -#define BLIS_HEMM_MR_Z BLIS_DEFAULT_MR_Z - -#define BLIS_HEMM_NR_S BLIS_DEFAULT_NR_S -#define BLIS_HEMM_NR_D BLIS_DEFAULT_NR_D -#define BLIS_HEMM_NR_C BLIS_DEFAULT_NR_C -#define BLIS_HEMM_NR_Z BLIS_DEFAULT_NR_Z - -// Incremental pack blocking - -#define BLIS_HEMM_NI_S BLIS_DEFAULT_NI_S -#define BLIS_HEMM_NI_D BLIS_DEFAULT_NI_D -#define BLIS_HEMM_NI_C BLIS_DEFAULT_NI_C -#define BLIS_HEMM_NI_Z BLIS_DEFAULT_NI_Z - void bli_hemm_cntl_init() { // Create blocksize objects for each dimension. 
- hemm_mc = bli_blksz_obj_create( BLIS_HEMM_MC_S, - BLIS_HEMM_MC_D, - BLIS_HEMM_MC_C, - BLIS_HEMM_MC_Z ); + hemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, + BLIS_DEFAULT_MC_D, + BLIS_DEFAULT_MC_C, + BLIS_DEFAULT_MC_Z ); - hemm_nc = bli_blksz_obj_create( BLIS_HEMM_NC_S, - BLIS_HEMM_NC_D, - BLIS_HEMM_NC_C, - BLIS_HEMM_NC_Z ); + hemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, + BLIS_DEFAULT_NC_D, + BLIS_DEFAULT_NC_C, + BLIS_DEFAULT_NC_Z ); - hemm_kc = bli_blksz_obj_create( BLIS_HEMM_KC_S, - BLIS_HEMM_KC_D, - BLIS_HEMM_KC_C, - BLIS_HEMM_KC_Z ); + hemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, + BLIS_DEFAULT_KC_D, + BLIS_DEFAULT_KC_C, + BLIS_DEFAULT_KC_Z ); - hemm_mr = bli_blksz_obj_create( BLIS_HEMM_MR_S, - BLIS_HEMM_MR_D, - BLIS_HEMM_MR_C, - BLIS_HEMM_MR_Z ); + hemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, + BLIS_DEFAULT_MR_D, + BLIS_DEFAULT_MR_C, + BLIS_DEFAULT_MR_Z ); - hemm_nr = bli_blksz_obj_create( BLIS_HEMM_NR_S, - BLIS_HEMM_NR_D, - BLIS_HEMM_NR_C, - BLIS_HEMM_NR_Z ); + hemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, + BLIS_DEFAULT_NR_D, + BLIS_DEFAULT_NR_C, + BLIS_DEFAULT_NR_Z ); - hemm_kr = bli_blksz_obj_create( BLIS_HEMM_KR_S, - BLIS_HEMM_KR_D, - BLIS_HEMM_KR_C, - BLIS_HEMM_KR_Z ); + hemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, + BLIS_DEFAULT_KR_D, + BLIS_DEFAULT_KR_C, + BLIS_DEFAULT_KR_Z ); - hemm_ni = bli_blksz_obj_create( BLIS_HEMM_NI_S, - BLIS_HEMM_NI_D, - BLIS_HEMM_NI_C, - BLIS_HEMM_NI_Z ); + hemm_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S, + BLIS_EXTEND_MR_D, + BLIS_EXTEND_MR_C, + BLIS_EXTEND_MR_Z ); + + hemm_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S, + BLIS_EXTEND_NR_D, + BLIS_EXTEND_NR_C, + BLIS_EXTEND_NR_Z ); + + hemm_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S, + BLIS_EXTEND_KR_D, + BLIS_EXTEND_KR_C, + BLIS_EXTEND_KR_Z ); + + hemm_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S, + BLIS_DEFAULT_NI_D, + BLIS_DEFAULT_NI_C, + BLIS_DEFAULT_NI_Z ); // Create control tree objects for packm operations on a, b, and c. 
@@ -142,8 +119,8 @@ void bli_hemm_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - hemm_mr, - hemm_kr, + hemm_mr, hemm_extmr, + hemm_kr, hemm_extkr, FALSE, // do NOT scale by alpha TRUE, // densify FALSE, // do NOT invert diagonal @@ -156,8 +133,8 @@ void bli_hemm_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - hemm_kr, - hemm_nr, + hemm_kr, hemm_extkr, + hemm_nr, hemm_extnr, FALSE, // do NOT scale by alpha FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal @@ -170,8 +147,8 @@ void bli_hemm_cntl_init() = bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, - hemm_mr, - hemm_nr, + hemm_mr, hemm_extmr, + hemm_nr, hemm_extnr, FALSE, // do NOT scale by beta FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal diff --git a/frame/3/her2k/bli_her2k_cntl.c b/frame/3/her2k/bli_her2k_cntl.c index c45e09fb5..186eebf4a 100644 --- a/frame/3/her2k/bli_her2k_cntl.c +++ b/frame/3/her2k/bli_her2k_cntl.c @@ -55,87 +55,64 @@ blksz_t* her2k_kc; blksz_t* her2k_mr; blksz_t* her2k_nr; blksz_t* her2k_kr; +blksz_t* her2k_extmr; +blksz_t* her2k_extnr; +blksz_t* her2k_extkr; blksz_t* her2k_ni; -// Cache blocksizes. 
- -#define BLIS_HER2K_KC_S BLIS_DEFAULT_KC_S -#define BLIS_HER2K_KC_D BLIS_DEFAULT_KC_D -#define BLIS_HER2K_KC_C BLIS_DEFAULT_KC_C -#define BLIS_HER2K_KC_Z BLIS_DEFAULT_KC_Z - -#define BLIS_HER2K_MC_S BLIS_DEFAULT_MC_S -#define BLIS_HER2K_MC_D BLIS_DEFAULT_MC_D -#define BLIS_HER2K_MC_C BLIS_DEFAULT_MC_C -#define BLIS_HER2K_MC_Z BLIS_DEFAULT_MC_Z - -#define BLIS_HER2K_NC_S BLIS_DEFAULT_NC_S -#define BLIS_HER2K_NC_D BLIS_DEFAULT_NC_D -#define BLIS_HER2K_NC_C BLIS_DEFAULT_NC_C -#define BLIS_HER2K_NC_Z BLIS_DEFAULT_NC_Z - -// Register blocking - -#define BLIS_HER2K_KR_S BLIS_DEFAULT_KR_S -#define BLIS_HER2K_KR_D BLIS_DEFAULT_KR_D -#define BLIS_HER2K_KR_C BLIS_DEFAULT_KR_C -#define BLIS_HER2K_KR_Z BLIS_DEFAULT_KR_Z - -#define BLIS_HER2K_MR_S BLIS_DEFAULT_MR_S -#define BLIS_HER2K_MR_D BLIS_DEFAULT_MR_D -#define BLIS_HER2K_MR_C BLIS_DEFAULT_MR_C -#define BLIS_HER2K_MR_Z BLIS_DEFAULT_MR_Z - -#define BLIS_HER2K_NR_S BLIS_DEFAULT_NR_S -#define BLIS_HER2K_NR_D BLIS_DEFAULT_NR_D -#define BLIS_HER2K_NR_C BLIS_DEFAULT_NR_C -#define BLIS_HER2K_NR_Z BLIS_DEFAULT_NR_Z - -// Incremental pack blocking - -#define BLIS_HER2K_NI_S BLIS_DEFAULT_NI_S -#define BLIS_HER2K_NI_D BLIS_DEFAULT_NI_D -#define BLIS_HER2K_NI_C BLIS_DEFAULT_NI_C -#define BLIS_HER2K_NI_Z BLIS_DEFAULT_NI_Z - void bli_her2k_cntl_init() { // Create blocksize objects for each dimension. 
- her2k_mc = bli_blksz_obj_create( BLIS_HER2K_MC_S, - BLIS_HER2K_MC_D, - BLIS_HER2K_MC_C, - BLIS_HER2K_MC_Z ); + her2k_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, + BLIS_DEFAULT_MC_D, + BLIS_DEFAULT_MC_C, + BLIS_DEFAULT_MC_Z ); - her2k_nc = bli_blksz_obj_create( BLIS_HER2K_NC_S, - BLIS_HER2K_NC_D, - BLIS_HER2K_NC_C, - BLIS_HER2K_NC_Z ); + her2k_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, + BLIS_DEFAULT_NC_D, + BLIS_DEFAULT_NC_C, + BLIS_DEFAULT_NC_Z ); - her2k_kc = bli_blksz_obj_create( BLIS_HER2K_KC_S, - BLIS_HER2K_KC_D, - BLIS_HER2K_KC_C, - BLIS_HER2K_KC_Z ); + her2k_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, + BLIS_DEFAULT_KC_D, + BLIS_DEFAULT_KC_C, + BLIS_DEFAULT_KC_Z ); - her2k_mr = bli_blksz_obj_create( BLIS_HER2K_MR_S, - BLIS_HER2K_MR_D, - BLIS_HER2K_MR_C, - BLIS_HER2K_MR_Z ); + her2k_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, + BLIS_DEFAULT_MR_D, + BLIS_DEFAULT_MR_C, + BLIS_DEFAULT_MR_Z ); - her2k_nr = bli_blksz_obj_create( BLIS_HER2K_NR_S, - BLIS_HER2K_NR_D, - BLIS_HER2K_NR_C, - BLIS_HER2K_NR_Z ); + her2k_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, + BLIS_DEFAULT_NR_D, + BLIS_DEFAULT_NR_C, + BLIS_DEFAULT_NR_Z ); - her2k_kr = bli_blksz_obj_create( BLIS_HER2K_KR_S, - BLIS_HER2K_KR_D, - BLIS_HER2K_KR_C, - BLIS_HER2K_KR_Z ); + her2k_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, + BLIS_DEFAULT_KR_D, + BLIS_DEFAULT_KR_C, + BLIS_DEFAULT_KR_Z ); - her2k_ni = bli_blksz_obj_create( BLIS_HER2K_NI_S, - BLIS_HER2K_NI_D, - BLIS_HER2K_NI_C, - BLIS_HER2K_NI_Z ); + her2k_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S, + BLIS_EXTEND_MR_D, + BLIS_EXTEND_MR_C, + BLIS_EXTEND_MR_Z ); + + her2k_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S, + BLIS_EXTEND_NR_D, + BLIS_EXTEND_NR_C, + BLIS_EXTEND_NR_Z ); + + her2k_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S, + BLIS_EXTEND_KR_D, + BLIS_EXTEND_KR_C, + BLIS_EXTEND_KR_Z ); + + her2k_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S, + BLIS_DEFAULT_NI_D, + BLIS_DEFAULT_NI_C, + BLIS_DEFAULT_NI_Z ); // Create control tree 
objects for packm operations on a, b, and c. @@ -143,8 +120,8 @@ void bli_her2k_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - her2k_mr, - her2k_kr, + her2k_mr, her2k_extmr, + her2k_kr, her2k_extkr, FALSE, // do NOT scale by alpha FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal @@ -157,8 +134,8 @@ void bli_her2k_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - her2k_kr, - her2k_nr, + her2k_kr, her2k_extkr, + her2k_nr, her2k_extnr, FALSE, // do NOT scale by alpha FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal @@ -171,8 +148,8 @@ void bli_her2k_cntl_init() = bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, - her2k_mr, - her2k_nr, + her2k_mr, her2k_extmr, + her2k_nr, her2k_extnr, FALSE, // do NOT scale by beta FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal diff --git a/frame/3/herk/bli_herk_cntl.c b/frame/3/herk/bli_herk_cntl.c index bbf9014aa..3ff2ae63c 100644 --- a/frame/3/herk/bli_herk_cntl.c +++ b/frame/3/herk/bli_herk_cntl.c @@ -54,87 +54,64 @@ blksz_t* herk_kc; blksz_t* herk_mr; blksz_t* herk_nr; blksz_t* herk_kr; +blksz_t* herk_extmr; +blksz_t* herk_extnr; +blksz_t* herk_extkr; blksz_t* herk_ni; -// Cache blocksizes. 
- -#define BLIS_HERK_KC_S BLIS_DEFAULT_KC_S -#define BLIS_HERK_KC_D BLIS_DEFAULT_KC_D -#define BLIS_HERK_KC_C BLIS_DEFAULT_KC_C -#define BLIS_HERK_KC_Z BLIS_DEFAULT_KC_Z - -#define BLIS_HERK_MC_S BLIS_DEFAULT_MC_S -#define BLIS_HERK_MC_D BLIS_DEFAULT_MC_D -#define BLIS_HERK_MC_C BLIS_DEFAULT_MC_C -#define BLIS_HERK_MC_Z BLIS_DEFAULT_MC_Z - -#define BLIS_HERK_NC_S BLIS_DEFAULT_NC_S -#define BLIS_HERK_NC_D BLIS_DEFAULT_NC_D -#define BLIS_HERK_NC_C BLIS_DEFAULT_NC_C -#define BLIS_HERK_NC_Z BLIS_DEFAULT_NC_Z - -// Register blocking - -#define BLIS_HERK_KR_S BLIS_DEFAULT_KR_S -#define BLIS_HERK_KR_D BLIS_DEFAULT_KR_D -#define BLIS_HERK_KR_C BLIS_DEFAULT_KR_C -#define BLIS_HERK_KR_Z BLIS_DEFAULT_KR_Z - -#define BLIS_HERK_MR_S BLIS_DEFAULT_MR_S -#define BLIS_HERK_MR_D BLIS_DEFAULT_MR_D -#define BLIS_HERK_MR_C BLIS_DEFAULT_MR_C -#define BLIS_HERK_MR_Z BLIS_DEFAULT_MR_Z - -#define BLIS_HERK_NR_S BLIS_DEFAULT_NR_S -#define BLIS_HERK_NR_D BLIS_DEFAULT_NR_D -#define BLIS_HERK_NR_C BLIS_DEFAULT_NR_C -#define BLIS_HERK_NR_Z BLIS_DEFAULT_NR_Z - -// Incremental pack blocking - -#define BLIS_HERK_NI_S BLIS_DEFAULT_NI_S -#define BLIS_HERK_NI_D BLIS_DEFAULT_NI_D -#define BLIS_HERK_NI_C BLIS_DEFAULT_NI_C -#define BLIS_HERK_NI_Z BLIS_DEFAULT_NI_Z - void bli_herk_cntl_init() { // Create blocksize objects for each dimension. 
- herk_mc = bli_blksz_obj_create( BLIS_HERK_MC_S, - BLIS_HERK_MC_D, - BLIS_HERK_MC_C, - BLIS_HERK_MC_Z ); + herk_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, + BLIS_DEFAULT_MC_D, + BLIS_DEFAULT_MC_C, + BLIS_DEFAULT_MC_Z ); - herk_nc = bli_blksz_obj_create( BLIS_HERK_NC_S, - BLIS_HERK_NC_D, - BLIS_HERK_NC_C, - BLIS_HERK_NC_Z ); + herk_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, + BLIS_DEFAULT_NC_D, + BLIS_DEFAULT_NC_C, + BLIS_DEFAULT_NC_Z ); - herk_kc = bli_blksz_obj_create( BLIS_HERK_KC_S, - BLIS_HERK_KC_D, - BLIS_HERK_KC_C, - BLIS_HERK_KC_Z ); + herk_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, + BLIS_DEFAULT_KC_D, + BLIS_DEFAULT_KC_C, + BLIS_DEFAULT_KC_Z ); - herk_mr = bli_blksz_obj_create( BLIS_HERK_MR_S, - BLIS_HERK_MR_D, - BLIS_HERK_MR_C, - BLIS_HERK_MR_Z ); + herk_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, + BLIS_DEFAULT_MR_D, + BLIS_DEFAULT_MR_C, + BLIS_DEFAULT_MR_Z ); - herk_nr = bli_blksz_obj_create( BLIS_HERK_NR_S, - BLIS_HERK_NR_D, - BLIS_HERK_NR_C, - BLIS_HERK_NR_Z ); + herk_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, + BLIS_DEFAULT_NR_D, + BLIS_DEFAULT_NR_C, + BLIS_DEFAULT_NR_Z ); - herk_kr = bli_blksz_obj_create( BLIS_HERK_KR_S, - BLIS_HERK_KR_D, - BLIS_HERK_KR_C, - BLIS_HERK_KR_Z ); + herk_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, + BLIS_DEFAULT_KR_D, + BLIS_DEFAULT_KR_C, + BLIS_DEFAULT_KR_Z ); - herk_ni = bli_blksz_obj_create( BLIS_HERK_NI_S, - BLIS_HERK_NI_D, - BLIS_HERK_NI_C, - BLIS_HERK_NI_Z ); + herk_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S, + BLIS_EXTEND_MR_D, + BLIS_EXTEND_MR_C, + BLIS_EXTEND_MR_Z ); + + herk_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S, + BLIS_EXTEND_NR_D, + BLIS_EXTEND_NR_C, + BLIS_EXTEND_NR_Z ); + + herk_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S, + BLIS_EXTEND_KR_D, + BLIS_EXTEND_KR_C, + BLIS_EXTEND_KR_Z ); + + herk_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S, + BLIS_DEFAULT_NI_D, + BLIS_DEFAULT_NI_C, + BLIS_DEFAULT_NI_Z ); // Create control tree objects for packm operations on a, b, and c. 
@@ -142,8 +119,8 @@ void bli_herk_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - herk_mr, - herk_kr, + herk_mr, herk_extmr, + herk_kr, herk_extkr, FALSE, // do NOT scale by alpha FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal @@ -156,8 +133,8 @@ void bli_herk_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - herk_kr, - herk_nr, + herk_kr, herk_extkr, + herk_nr, herk_extnr, FALSE, // do NOT scale by alpha FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal @@ -170,8 +147,8 @@ void bli_herk_cntl_init() = bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, - herk_mr, - herk_nr, + herk_mr, herk_extmr, + herk_nr, herk_extnr, FALSE, // do NOT scale by beta FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index 4b47aecc8..ac187e076 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -148,7 +148,7 @@ void PASTEMAC(ch,varname)( \ ) \ { \ /* Temporary buffer for duplicating elements of B. */ \ - ctype bd[ PASTEMAC(ch,kc) * \ + ctype bd[ PASTEMAC(ch,maxkc) * \ PASTEMAC(ch,nr) * \ PASTEMAC(ch,ndup) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 093eebbb4..26bb1d904 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -148,7 +148,7 @@ void PASTEMAC(ch,varname)( \ ) \ { \ /* Temporary buffer for duplicating elements of B. 
*/ \ - ctype bd[ PASTEMAC(ch,kc) * \ + ctype bd[ PASTEMAC(ch,maxkc) * \ PASTEMAC(ch,nr) * \ PASTEMAC(ch,ndup) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ diff --git a/frame/3/trmm/bli_trmm_cntl.c b/frame/3/trmm/bli_trmm_cntl.c index d5e419839..002df204f 100644 --- a/frame/3/trmm/bli_trmm_cntl.c +++ b/frame/3/trmm/bli_trmm_cntl.c @@ -55,87 +55,64 @@ blksz_t* trmm_kc; blksz_t* trmm_mr; blksz_t* trmm_nr; blksz_t* trmm_kr; +blksz_t* trmm_extmr; +blksz_t* trmm_extnr; +blksz_t* trmm_extkr; blksz_t* trmm_ni; -// Cache blocksizes. - -#define BLIS_TRMM_KC_S BLIS_DEFAULT_KC_S -#define BLIS_TRMM_KC_D BLIS_DEFAULT_KC_D -#define BLIS_TRMM_KC_C BLIS_DEFAULT_KC_C -#define BLIS_TRMM_KC_Z BLIS_DEFAULT_KC_Z - -#define BLIS_TRMM_MC_S BLIS_DEFAULT_MC_S -#define BLIS_TRMM_MC_D BLIS_DEFAULT_MC_D -#define BLIS_TRMM_MC_C BLIS_DEFAULT_MC_C -#define BLIS_TRMM_MC_Z BLIS_DEFAULT_MC_Z - -#define BLIS_TRMM_NC_S BLIS_DEFAULT_NC_S -#define BLIS_TRMM_NC_D BLIS_DEFAULT_NC_D -#define BLIS_TRMM_NC_C BLIS_DEFAULT_NC_C -#define BLIS_TRMM_NC_Z BLIS_DEFAULT_NC_Z - -// Register blocking - -#define BLIS_TRMM_KR_S BLIS_DEFAULT_KR_S -#define BLIS_TRMM_KR_D BLIS_DEFAULT_KR_D -#define BLIS_TRMM_KR_C BLIS_DEFAULT_KR_C -#define BLIS_TRMM_KR_Z BLIS_DEFAULT_KR_Z - -#define BLIS_TRMM_MR_S BLIS_DEFAULT_MR_S -#define BLIS_TRMM_MR_D BLIS_DEFAULT_MR_D -#define BLIS_TRMM_MR_C BLIS_DEFAULT_MR_C -#define BLIS_TRMM_MR_Z BLIS_DEFAULT_MR_Z - -#define BLIS_TRMM_NR_S BLIS_DEFAULT_NR_S -#define BLIS_TRMM_NR_D BLIS_DEFAULT_NR_D -#define BLIS_TRMM_NR_C BLIS_DEFAULT_NR_C -#define BLIS_TRMM_NR_Z BLIS_DEFAULT_NR_Z - -// Incremental pack blocking - -#define BLIS_TRMM_NI_S BLIS_DEFAULT_NI_S -#define BLIS_TRMM_NI_D BLIS_DEFAULT_NI_D -#define BLIS_TRMM_NI_C BLIS_DEFAULT_NI_C -#define BLIS_TRMM_NI_Z BLIS_DEFAULT_NI_Z - void bli_trmm_cntl_init() { // Create blocksize objects for each dimension. 
- trmm_mc = bli_blksz_obj_create( BLIS_TRMM_MC_S, - BLIS_TRMM_MC_D, - BLIS_TRMM_MC_C, - BLIS_TRMM_MC_Z ); + trmm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, + BLIS_DEFAULT_MC_D, + BLIS_DEFAULT_MC_C, + BLIS_DEFAULT_MC_Z ); - trmm_nc = bli_blksz_obj_create( BLIS_TRMM_NC_S, - BLIS_TRMM_NC_D, - BLIS_TRMM_NC_C, - BLIS_TRMM_NC_Z ); + trmm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, + BLIS_DEFAULT_NC_D, + BLIS_DEFAULT_NC_C, + BLIS_DEFAULT_NC_Z ); - trmm_kc = bli_blksz_obj_create( BLIS_TRMM_KC_S, - BLIS_TRMM_KC_D, - BLIS_TRMM_KC_C, - BLIS_TRMM_KC_Z ); + trmm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, + BLIS_DEFAULT_KC_D, + BLIS_DEFAULT_KC_C, + BLIS_DEFAULT_KC_Z ); - trmm_mr = bli_blksz_obj_create( BLIS_TRMM_MR_S, - BLIS_TRMM_MR_D, - BLIS_TRMM_MR_C, - BLIS_TRMM_MR_Z ); + trmm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, + BLIS_DEFAULT_MR_D, + BLIS_DEFAULT_MR_C, + BLIS_DEFAULT_MR_Z ); - trmm_nr = bli_blksz_obj_create( BLIS_TRMM_NR_S, - BLIS_TRMM_NR_D, - BLIS_TRMM_NR_C, - BLIS_TRMM_NR_Z ); + trmm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, + BLIS_DEFAULT_NR_D, + BLIS_DEFAULT_NR_C, + BLIS_DEFAULT_NR_Z ); - trmm_kr = bli_blksz_obj_create( BLIS_TRMM_KR_S, - BLIS_TRMM_KR_D, - BLIS_TRMM_KR_C, - BLIS_TRMM_KR_Z ); + trmm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, + BLIS_DEFAULT_KR_D, + BLIS_DEFAULT_KR_C, + BLIS_DEFAULT_KR_Z ); - trmm_ni = bli_blksz_obj_create( BLIS_TRMM_NI_S, - BLIS_TRMM_NI_D, - BLIS_TRMM_NI_C, - BLIS_TRMM_NI_Z ); + trmm_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S, + BLIS_EXTEND_MR_D, + BLIS_EXTEND_MR_C, + BLIS_EXTEND_MR_Z ); + + trmm_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S, + BLIS_EXTEND_NR_D, + BLIS_EXTEND_NR_C, + BLIS_EXTEND_NR_Z ); + + trmm_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S, + BLIS_EXTEND_KR_D, + BLIS_EXTEND_KR_C, + BLIS_EXTEND_KR_Z ); + + trmm_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S, + BLIS_DEFAULT_NI_D, + BLIS_DEFAULT_NI_C, + BLIS_DEFAULT_NI_Z ); // Create control tree objects for packm operations on a, b, and c. 
@@ -143,8 +120,10 @@ void bli_trmm_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, // pack panels of A compactly - trmm_mr, // IMPORTANT: for consistency with trsm, "k" dim - trmm_mr, // multiple is set to mr. + // IMPORTANT: for consistency with trsm, "k" dim + // multiple is set to mr. + trmm_mr, trmm_extmr, + trmm_mr, trmm_extmr, FALSE, // do NOT scale by alpha TRUE, // densify FALSE, // do NOT invert diagonal @@ -157,8 +136,10 @@ void bli_trmm_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - trmm_mr, // IMPORTANT: m dim multiple here must be mr - trmm_nr, // since "k" dim multiple is set to mr above. + // IMPORTANT: m dim multiple here must be mr + // since "k" dim multiple is set to mr above. + trmm_mr, trmm_extmr, + trmm_nr, trmm_extnr, FALSE, // do NOT scale by alpha FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal @@ -171,8 +152,8 @@ void bli_trmm_cntl_init() = bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, - trmm_mr, - trmm_nr, + trmm_mr, trmm_extmr, + trmm_nr, trmm_extnr, FALSE, // do NOT scale by beta FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal diff --git a/frame/3/trmm/bli_trmm_l_ker_var2.c b/frame/3/trmm/bli_trmm_l_ker_var2.c index bb5e6d26b..49877f600 100644 --- a/frame/3/trmm/bli_trmm_l_ker_var2.c +++ b/frame/3/trmm/bli_trmm_l_ker_var2.c @@ -149,7 +149,7 @@ void PASTEMAC(ch,varname)( \ ) \ { \ /* Temporary buffer for duplicating elements of B. */ \ - ctype bd[ PASTEMAC(ch,kc) * \ + ctype bd[ PASTEMAC(ch,maxkc) * \ PASTEMAC(ch,nr) * \ PASTEMAC(ch,ndup) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ @@ -165,6 +165,7 @@ void PASTEMAC(ch,varname)( \ /* Alias some constants to shorter names. 
*/ \ const dim_t MR = PASTEMAC(ch,mr); \ const dim_t NR = PASTEMAC(ch,nr); \ + const dim_t PACKMR = PASTEMAC(ch,packmr); \ const dim_t NDUP = PASTEMAC(ch,ndup); \ const bool_t DUPB = NDUP != 1; \ \ @@ -252,7 +253,7 @@ void PASTEMAC(ch,varname)( \ k_nr = k_a1011 * NR; \ \ /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = k * MR; \ + rstep_a = k * PACKMR; \ \ cstep_b = ps_b; \ \ @@ -334,7 +335,7 @@ void PASTEMAC(ch,varname)( \ c11, rs_c, cs_c ); \ } \ \ - a1 += k_a1011 * MR; \ + a1 += k_a1011 * PACKMR; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ diff --git a/frame/3/trmm/bli_trmm_u_ker_var2.c b/frame/3/trmm/bli_trmm_u_ker_var2.c index 506eee14e..c42b0ac88 100644 --- a/frame/3/trmm/bli_trmm_u_ker_var2.c +++ b/frame/3/trmm/bli_trmm_u_ker_var2.c @@ -149,7 +149,7 @@ void PASTEMAC(ch,varname)( \ ) \ { \ /* Temporary buffer for duplicating elements of B. */ \ - ctype bd[ PASTEMAC(ch,kc) * \ + ctype bd[ PASTEMAC(ch,maxkc) * \ PASTEMAC(ch,nr) * \ PASTEMAC(ch,ndup) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ @@ -165,6 +165,7 @@ void PASTEMAC(ch,varname)( \ /* Alias some constants to shorter names. */ \ const dim_t MR = PASTEMAC(ch,mr); \ const dim_t NR = PASTEMAC(ch,nr); \ + const dim_t PACKMR = PASTEMAC(ch,packmr); \ const dim_t NDUP = PASTEMAC(ch,ndup); \ const bool_t DUPB = NDUP != 1; \ \ @@ -252,7 +253,7 @@ void PASTEMAC(ch,varname)( \ k_nr = k_a1112 * NR; \ \ /* Determine some increments used to step through A, B, and C. 
*/ \ - rstep_a = k * MR; \ + rstep_a = k * PACKMR; \ \ cstep_b = ps_b; \ \ @@ -337,7 +338,7 @@ void PASTEMAC(ch,varname)( \ c11, rs_c, cs_c ); \ } \ \ - a1 += k_a1112 * MR; \ + a1 += k_a1112 * PACKMR; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ diff --git a/frame/3/trmm3/bli_trmm3_cntl.c b/frame/3/trmm3/bli_trmm3_cntl.c index af8c9067d..5b80c0f3d 100644 --- a/frame/3/trmm3/bli_trmm3_cntl.c +++ b/frame/3/trmm3/bli_trmm3_cntl.c @@ -55,87 +55,64 @@ blksz_t* trmm3_kc; blksz_t* trmm3_mr; blksz_t* trmm3_nr; blksz_t* trmm3_kr; +blksz_t* trmm3_extmr; +blksz_t* trmm3_extnr; +blksz_t* trmm3_extkr; blksz_t* trmm3_ni; -// Cache blocksizes. - -#define BLIS_TRMM3_KC_S BLIS_DEFAULT_KC_S -#define BLIS_TRMM3_KC_D BLIS_DEFAULT_KC_D -#define BLIS_TRMM3_KC_C BLIS_DEFAULT_KC_C -#define BLIS_TRMM3_KC_Z BLIS_DEFAULT_KC_Z - -#define BLIS_TRMM3_MC_S BLIS_DEFAULT_MC_S -#define BLIS_TRMM3_MC_D BLIS_DEFAULT_MC_D -#define BLIS_TRMM3_MC_C BLIS_DEFAULT_MC_C -#define BLIS_TRMM3_MC_Z BLIS_DEFAULT_MC_Z - -#define BLIS_TRMM3_NC_S BLIS_DEFAULT_NC_S -#define BLIS_TRMM3_NC_D BLIS_DEFAULT_NC_D -#define BLIS_TRMM3_NC_C BLIS_DEFAULT_NC_C -#define BLIS_TRMM3_NC_Z BLIS_DEFAULT_NC_Z - -// Register blocking - -#define BLIS_TRMM3_KR_S BLIS_DEFAULT_KR_S -#define BLIS_TRMM3_KR_D BLIS_DEFAULT_KR_D -#define BLIS_TRMM3_KR_C BLIS_DEFAULT_KR_C -#define BLIS_TRMM3_KR_Z BLIS_DEFAULT_KR_Z - -#define BLIS_TRMM3_MR_S BLIS_DEFAULT_MR_S -#define BLIS_TRMM3_MR_D BLIS_DEFAULT_MR_D -#define BLIS_TRMM3_MR_C BLIS_DEFAULT_MR_C -#define BLIS_TRMM3_MR_Z BLIS_DEFAULT_MR_Z - -#define BLIS_TRMM3_NR_S BLIS_DEFAULT_NR_S -#define BLIS_TRMM3_NR_D BLIS_DEFAULT_NR_D -#define BLIS_TRMM3_NR_C BLIS_DEFAULT_NR_C -#define BLIS_TRMM3_NR_Z BLIS_DEFAULT_NR_Z - -// Incremental pack blocking - -#define BLIS_TRMM3_NI_S BLIS_DEFAULT_NI_S -#define BLIS_TRMM3_NI_D BLIS_DEFAULT_NI_D -#define BLIS_TRMM3_NI_C BLIS_DEFAULT_NI_C -#define BLIS_TRMM3_NI_Z BLIS_DEFAULT_NI_Z - void bli_trmm3_cntl_init() { // Create blocksize objects 
for each dimension. - trmm3_mc = bli_blksz_obj_create( BLIS_TRMM3_MC_S, - BLIS_TRMM3_MC_D, - BLIS_TRMM3_MC_C, - BLIS_TRMM3_MC_Z ); + trmm3_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, + BLIS_DEFAULT_MC_D, + BLIS_DEFAULT_MC_C, + BLIS_DEFAULT_MC_Z ); - trmm3_nc = bli_blksz_obj_create( BLIS_TRMM3_NC_S, - BLIS_TRMM3_NC_D, - BLIS_TRMM3_NC_C, - BLIS_TRMM3_NC_Z ); + trmm3_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, + BLIS_DEFAULT_NC_D, + BLIS_DEFAULT_NC_C, + BLIS_DEFAULT_NC_Z ); - trmm3_kc = bli_blksz_obj_create( BLIS_TRMM3_KC_S, - BLIS_TRMM3_KC_D, - BLIS_TRMM3_KC_C, - BLIS_TRMM3_KC_Z ); + trmm3_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, + BLIS_DEFAULT_KC_D, + BLIS_DEFAULT_KC_C, + BLIS_DEFAULT_KC_Z ); - trmm3_mr = bli_blksz_obj_create( BLIS_TRMM3_MR_S, - BLIS_TRMM3_MR_D, - BLIS_TRMM3_MR_C, - BLIS_TRMM3_MR_Z ); + trmm3_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, + BLIS_DEFAULT_MR_D, + BLIS_DEFAULT_MR_C, + BLIS_DEFAULT_MR_Z ); - trmm3_nr = bli_blksz_obj_create( BLIS_TRMM3_NR_S, - BLIS_TRMM3_NR_D, - BLIS_TRMM3_NR_C, - BLIS_TRMM3_NR_Z ); + trmm3_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, + BLIS_DEFAULT_NR_D, + BLIS_DEFAULT_NR_C, + BLIS_DEFAULT_NR_Z ); - trmm3_kr = bli_blksz_obj_create( BLIS_TRMM3_KR_S, - BLIS_TRMM3_KR_D, - BLIS_TRMM3_KR_C, - BLIS_TRMM3_KR_Z ); + trmm3_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, + BLIS_DEFAULT_KR_D, + BLIS_DEFAULT_KR_C, + BLIS_DEFAULT_KR_Z ); - trmm3_ni = bli_blksz_obj_create( BLIS_TRMM3_NI_S, - BLIS_TRMM3_NI_D, - BLIS_TRMM3_NI_C, - BLIS_TRMM3_NI_Z ); + trmm3_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S, + BLIS_EXTEND_MR_D, + BLIS_EXTEND_MR_C, + BLIS_EXTEND_MR_Z ); + + trmm3_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S, + BLIS_EXTEND_NR_D, + BLIS_EXTEND_NR_C, + BLIS_EXTEND_NR_Z ); + + trmm3_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S, + BLIS_EXTEND_KR_D, + BLIS_EXTEND_KR_C, + BLIS_EXTEND_KR_Z ); + + trmm3_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S, + BLIS_DEFAULT_NI_D, + BLIS_DEFAULT_NI_C, + BLIS_DEFAULT_NI_Z ); // Create 
control tree objects for packm operations on a, b, and c. @@ -143,8 +120,10 @@ void bli_trmm3_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, // pack panels of A compactly - trmm3_mr, // IMPORTANT: for consistency with trsm, "k" dim - trmm3_mr, // multiple is set to mr. + // IMPORTANT: for consistency with trsm, "k" dim + // multiple is set to mr. + trmm3_mr, trmm3_extmr, + trmm3_mr, trmm3_extmr, FALSE, // do NOT scale by alpha TRUE, // densify FALSE, // do NOT invert diagonal @@ -157,8 +136,10 @@ void bli_trmm3_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - trmm3_mr, // IMPORTANT: m dim multiple here must be mr - trmm3_nr, // since "k" dim multiple is set to mr above. + // IMPORTANT: m dim multiple here must be mr + // since "k" dim multiple is set to mr above. + trmm3_mr, trmm3_extmr, + trmm3_nr, trmm3_extnr, FALSE, // do NOT scale by alpha FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal @@ -171,8 +152,8 @@ void bli_trmm3_cntl_init() = bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, - trmm3_mr, - trmm3_nr, + trmm3_mr, trmm3_extmr, + trmm3_nr, trmm3_extnr, FALSE, // do NOT scale by beta FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index efe0b1b59..0f7fe847c 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -55,87 +55,64 @@ blksz_t* trsm_kc; blksz_t* trsm_mr; blksz_t* trsm_nr; blksz_t* trsm_kr; +blksz_t* trsm_extmr; +blksz_t* trsm_extnr; +blksz_t* trsm_extkr; blksz_t* trsm_ni; -// Cache blocksizes. 
- -#define BLIS_TRSM_KC_S BLIS_DEFAULT_KC_S -#define BLIS_TRSM_KC_D BLIS_DEFAULT_KC_D -#define BLIS_TRSM_KC_C BLIS_DEFAULT_KC_C -#define BLIS_TRSM_KC_Z BLIS_DEFAULT_KC_Z - -#define BLIS_TRSM_MC_S BLIS_DEFAULT_MC_S -#define BLIS_TRSM_MC_D BLIS_DEFAULT_MC_D -#define BLIS_TRSM_MC_C BLIS_DEFAULT_MC_C -#define BLIS_TRSM_MC_Z BLIS_DEFAULT_MC_Z - -#define BLIS_TRSM_NC_S BLIS_DEFAULT_NC_S -#define BLIS_TRSM_NC_D BLIS_DEFAULT_NC_D -#define BLIS_TRSM_NC_C BLIS_DEFAULT_NC_C -#define BLIS_TRSM_NC_Z BLIS_DEFAULT_NC_Z - -// Register blocking - -#define BLIS_TRSM_KR_S BLIS_DEFAULT_KR_S -#define BLIS_TRSM_KR_D BLIS_DEFAULT_KR_D -#define BLIS_TRSM_KR_C BLIS_DEFAULT_KR_C -#define BLIS_TRSM_KR_Z BLIS_DEFAULT_KR_Z - -#define BLIS_TRSM_MR_S BLIS_DEFAULT_MR_S -#define BLIS_TRSM_MR_D BLIS_DEFAULT_MR_D -#define BLIS_TRSM_MR_C BLIS_DEFAULT_MR_C -#define BLIS_TRSM_MR_Z BLIS_DEFAULT_MR_Z - -#define BLIS_TRSM_NR_S BLIS_DEFAULT_NR_S -#define BLIS_TRSM_NR_D BLIS_DEFAULT_NR_D -#define BLIS_TRSM_NR_C BLIS_DEFAULT_NR_C -#define BLIS_TRSM_NR_Z BLIS_DEFAULT_NR_Z - -// Incremental pack blocking - -#define BLIS_TRSM_NI_S BLIS_DEFAULT_NI_S -#define BLIS_TRSM_NI_D BLIS_DEFAULT_NI_D -#define BLIS_TRSM_NI_C BLIS_DEFAULT_NI_C -#define BLIS_TRSM_NI_Z BLIS_DEFAULT_NI_Z - void bli_trsm_cntl_init() { // Create blocksize objects for each dimension. 
- trsm_mc = bli_blksz_obj_create( BLIS_TRSM_MC_S, - BLIS_TRSM_MC_D, - BLIS_TRSM_MC_C, - BLIS_TRSM_MC_Z ); + trsm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S, + BLIS_DEFAULT_MC_D, + BLIS_DEFAULT_MC_C, + BLIS_DEFAULT_MC_Z ); - trsm_nc = bli_blksz_obj_create( BLIS_TRSM_NC_S, - BLIS_TRSM_NC_D, - BLIS_TRSM_NC_C, - BLIS_TRSM_NC_Z ); + trsm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S, + BLIS_DEFAULT_NC_D, + BLIS_DEFAULT_NC_C, + BLIS_DEFAULT_NC_Z ); - trsm_kc = bli_blksz_obj_create( BLIS_TRSM_KC_S, - BLIS_TRSM_KC_D, - BLIS_TRSM_KC_C, - BLIS_TRSM_KC_Z ); + trsm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S, + BLIS_DEFAULT_KC_D, + BLIS_DEFAULT_KC_C, + BLIS_DEFAULT_KC_Z ); - trsm_mr = bli_blksz_obj_create( BLIS_TRSM_MR_S, - BLIS_TRSM_MR_D, - BLIS_TRSM_MR_C, - BLIS_TRSM_MR_Z ); + trsm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S, + BLIS_DEFAULT_MR_D, + BLIS_DEFAULT_MR_C, + BLIS_DEFAULT_MR_Z ); - trsm_nr = bli_blksz_obj_create( BLIS_TRSM_NR_S, - BLIS_TRSM_NR_D, - BLIS_TRSM_NR_C, - BLIS_TRSM_NR_Z ); + trsm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S, + BLIS_DEFAULT_NR_D, + BLIS_DEFAULT_NR_C, + BLIS_DEFAULT_NR_Z ); - trsm_kr = bli_blksz_obj_create( BLIS_TRSM_KR_S, - BLIS_TRSM_KR_D, - BLIS_TRSM_KR_C, - BLIS_TRSM_KR_Z ); + trsm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, + BLIS_DEFAULT_KR_D, + BLIS_DEFAULT_KR_C, + BLIS_DEFAULT_KR_Z ); - trsm_ni = bli_blksz_obj_create( BLIS_TRSM_NI_S, - BLIS_TRSM_NI_D, - BLIS_TRSM_NI_C, - BLIS_TRSM_NI_Z ); + trsm_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S, + BLIS_EXTEND_MR_D, + BLIS_EXTEND_MR_C, + BLIS_EXTEND_MR_Z ); + + trsm_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S, + BLIS_EXTEND_NR_D, + BLIS_EXTEND_NR_C, + BLIS_EXTEND_NR_Z ); + + trsm_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S, + BLIS_EXTEND_KR_D, + BLIS_EXTEND_KR_C, + BLIS_EXTEND_KR_Z ); + + trsm_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S, + BLIS_DEFAULT_NI_D, + BLIS_DEFAULT_NI_C, + BLIS_DEFAULT_NI_Z ); // Create control tree objects for packm operations on a, b, and c. 
@@ -143,8 +120,10 @@ void bli_trsm_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, // pack panels of A compactly - trsm_mr, // IMPORTANT: n dim multiple must be mr to - trsm_mr, // support right and bottom-right edge cases + // IMPORTANT: n dim multiple must be mr to + // support right and bottom-right edge cases + trsm_mr, trsm_extmr, + trsm_mr, trsm_extmr, FALSE, // do NOT scale by alpha TRUE, // densify TRUE, // invert diagonal @@ -157,8 +136,10 @@ void bli_trsm_cntl_init() = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - trsm_mr, // IMPORTANT: m dim multiple must be mr since - trsm_nr, // B_pack is updated (ie: serves as C) in trsm + // IMPORTANT: m dim multiple must be mr since + // B_pack is updated (ie: serves as C) in trsm + trsm_mr, trsm_extmr, + trsm_nr, trsm_extnr, FALSE, // do NOT scale by alpha FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal @@ -171,8 +152,8 @@ void bli_trsm_cntl_init() = bli_packm_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, - trsm_mr, - trsm_nr, + trsm_mr, trsm_extmr, + trsm_nr, trsm_extnr, FALSE, // do NOT scale by beta FALSE, // already dense; densify not necessary FALSE, // do NOT invert diagonal diff --git a/frame/3/trsm/bli_trsm_l_ker_var2.c b/frame/3/trsm/bli_trsm_l_ker_var2.c index c73473bf1..6edee51fb 100644 --- a/frame/3/trsm/bli_trsm_l_ker_var2.c +++ b/frame/3/trsm/bli_trsm_l_ker_var2.c @@ -138,7 +138,7 @@ void PASTEMAC(ch,varname)( \ ) \ { \ /* Temporary buffer for duplicating elements of B. */ \ - ctype bd[ PASTEMAC(ch,kc) * \ + ctype bd[ PASTEMAC(ch,maxkc) * \ PASTEMAC(ch,nr) * \ PASTEMAC(ch,ndup) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ @@ -154,6 +154,8 @@ void PASTEMAC(ch,varname)( \ /* Alias constants to shorter names. 
*/ \ const dim_t MR = PASTEMAC(ch,mr); \ const dim_t NR = PASTEMAC(ch,nr); \ + const dim_t PACKMR = PASTEMAC(ch,packmr); \ + const dim_t PACKNR = PASTEMAC(ch,packnr); \ const dim_t NDUP = PASTEMAC(ch,ndup); \ const bool_t DUPB = NDUP != 1; \ \ @@ -252,7 +254,7 @@ void PASTEMAC(ch,varname)( \ k_nr = k_a1011 * NR; \ \ /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = k * MR; \ + rstep_a = k * PACKMR; \ \ cstep_b = ps_b; \ \ @@ -305,14 +307,14 @@ void PASTEMAC(ch,varname)( \ k_a1011 = bli_min( k, diagoffa_i + MR ); \ k_a10 = k_a1011 - MR; \ \ - b11 = b1 + diagoffa_i * NR; \ + b11 = b1 + diagoffa_i * PACKNR; \ bp_i = bp + off_a1011 * NR * NDUP; \ \ /* Compute the addresses of the A10 panel and triangular block A11, and the corresponding panel Bd01 and block Bd11. */ \ a10 = a1; \ - a11 = a1 + k_a10 * MR; \ + a11 = a1 + k_a10 * PACKMR; \ bp01 = bp_i; \ bp11 = bp_i + k_a10 * NR * NDUP; \ \ @@ -354,7 +356,7 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR, c11, rs_c, cs_c ); \ } \ \ - a1 += k_a1011 * MR; \ + a1 += k_a1011 * PACKMR; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ diff --git a/frame/3/trsm/bli_trsm_u_ker_var2.c b/frame/3/trsm/bli_trsm_u_ker_var2.c index 849a32cfa..ecb0977b1 100644 --- a/frame/3/trsm/bli_trsm_u_ker_var2.c +++ b/frame/3/trsm/bli_trsm_u_ker_var2.c @@ -138,7 +138,7 @@ void PASTEMAC(ch,varname)( \ ) \ { \ /* Temporary buffer for duplicating elements of B. */ \ - ctype bd[ PASTEMAC(ch,kc) * \ + ctype bd[ PASTEMAC(ch,maxkc) * \ PASTEMAC(ch,nr) * \ PASTEMAC(ch,ndup) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ @@ -154,6 +154,8 @@ void PASTEMAC(ch,varname)( \ /* Alias constants to shorter names. 
*/ \ const dim_t MR = PASTEMAC(ch,mr); \ const dim_t NR = PASTEMAC(ch,nr); \ + const dim_t PACKMR = PASTEMAC(ch,packmr); \ + const dim_t PACKNR = PASTEMAC(ch,packnr); \ const dim_t NDUP = PASTEMAC(ch,ndup); \ const bool_t DUPB = NDUP != 1; \ \ @@ -252,7 +254,7 @@ void PASTEMAC(ch,varname)( \ k_nr = k_a1112 * NR; \ \ /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = k * MR; \ + rstep_a = k * PACKMR; \ \ cstep_b = ps_b; \ \ @@ -310,14 +312,14 @@ void PASTEMAC(ch,varname)( \ /* Index into b1 (if the diagonal offset is positive) to locate the MR x NR block of b1 that will be updated by the trsm subproblem. */ \ - b11 = b1 + off_a1112 * NR; \ + b11 = b1 + off_a1112 * PACKNR; \ bp_i = bp + off_a1112 * NR * NDUP; \ \ /* Compute the addresses of the A12 panel and triangular block A11, and the corresponding panel Bd21 and block Bd11. */ \ a11 = a1; \ - a12 = a1 + k_a11 * MR; \ + a12 = a1 + k_a11 * PACKMR; \ bp11 = bp_i; \ bp21 = bp_i + k_a11 * NR * NDUP; \ \ @@ -374,7 +376,7 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: ct after (diag)", m_cur, n_cur, c11, rs_c, cs_c ); \ } \ \ - a1 += k_a1112 * MR; \ + a1 += k_a1112 * PACKMR; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ diff --git a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c index 9a7aac804..2affe9336 100644 --- a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c +++ b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c @@ -49,9 +49,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict c, inc_t rs_c, inc_t cs_c \ ) \ { \ - const dim_t NR = PASTEMAC(ch,nr); \ -\ - const inc_t rs_b = NR; \ + const inc_t rs_b = PASTEMAC(ch,packnr); \ const inc_t cs_b = 1; \ \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ diff --git a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c index 04a49bed4..bbfbb829a 100644 --- a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c +++ 
b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c @@ -49,9 +49,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict c, inc_t rs_c, inc_t cs_c \ ) \ { \ - const dim_t NR = PASTEMAC(ch,nr); \ -\ - const inc_t rs_b = NR; \ + const inc_t rs_b = PASTEMAC(ch,packnr); \ const inc_t cs_b = 1; \ \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ diff --git a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c b/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c index 13687457e..adcc124c7 100644 --- a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c +++ b/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c @@ -45,16 +45,13 @@ void PASTEMAC(ch,varname)( \ ctype* restrict c, inc_t rs_c, inc_t cs_c \ ) \ { \ - const dim_t MR = PASTEMAC(ch,mr); \ - const dim_t NR = PASTEMAC(ch,nr); \ -\ - const dim_t m = MR; \ - const dim_t n = NR; \ + const dim_t m = PASTEMAC(ch,mr); \ + const dim_t n = PASTEMAC(ch,nr); \ \ const inc_t rs_a = 1; \ - const inc_t cs_a = MR; \ + const inc_t cs_a = PASTEMAC(ch,packmr); \ \ - const inc_t rs_b = NR; \ + const inc_t rs_b = PASTEMAC(ch,packnr); \ const inc_t cs_b = 1; \ \ dim_t iter, i, j, k; \ diff --git a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c b/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c index e88fcabf5..f67ded6f8 100644 --- a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c +++ b/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c @@ -45,16 +45,13 @@ void PASTEMAC(ch,varname)( \ ctype* restrict c, inc_t rs_c, inc_t cs_c \ ) \ { \ - const dim_t MR = PASTEMAC(ch,mr); \ - const dim_t NR = PASTEMAC(ch,nr); \ -\ - const dim_t m = MR; \ - const dim_t n = NR; \ + const dim_t m = PASTEMAC(ch,mr); \ + const dim_t n = PASTEMAC(ch,nr); \ \ const inc_t rs_a = 1; \ - const inc_t cs_a = MR; \ + const inc_t cs_a = PASTEMAC(ch,packmr); \ \ - const inc_t rs_b = NR; \ + const inc_t rs_b = PASTEMAC(ch,packnr); \ const inc_t cs_b = 1; \ \ dim_t iter, i, j, k; \ diff --git a/frame/base/bli_blocksize.c b/frame/base/bli_blocksize.c index d98866aa4..adae66e13 100644 --- a/frame/base/bli_blocksize.c +++ 
b/frame/base/bli_blocksize.c @@ -81,11 +81,19 @@ dim_t bli_blksz_for_obj( obj_t* obj, return b->v[ bli_obj_datatype( *obj ) ]; } +extern blksz_t* gemm_mc; +extern blksz_t* gemm_nc; +extern blksz_t* gemm_kc; +extern blksz_t* gemm_mr; +extern blksz_t* gemm_nr; +extern blksz_t* gemm_kr; + dim_t bli_determine_blocksize_f( dim_t i, dim_t dim, obj_t* obj, blksz_t* b ) { +#if 0 num_t dt; dim_t b_alg; @@ -103,7 +111,138 @@ dim_t bli_determine_blocksize_f( dim_t i, // smaller, in which case we return that remaining value. b_alg = bli_min( b_alg, dim - i ); +//printf( "bli_determine_blocksize0: returning %lu\n", b_alg ); + return b_alg; +#endif + +#if 0 + num_t dt; + dim_t b_alg, b_now; + dim_t mc, nc, kc; + dim_t mr, nr, kr; + dim_t dim_left_now; + + dt = bli_obj_execution_datatype( *obj ); + b_alg = bli_blksz_for_type( dt, b ); + + mc = bli_blksz_for_type( dt, gemm_mc ); + nc = bli_blksz_for_type( dt, gemm_nc ); + kc = bli_blksz_for_type( dt, gemm_kc ); + + mr = bli_blksz_for_type( dt, gemm_mr ); + nr = bli_blksz_for_type( dt, gemm_nr ); + kr = bli_blksz_for_type( dt, gemm_kr ); + + dim_left_now = dim - i; + + if ( dim_left_now <= b_alg ) + { + b_now = dim_left_now; + } + else if ( dim_left_now <= b_alg + (b_alg/4) ) + { + b_now = dim_left_now / 2; + + // This actually won't work when, for example, mc == kc but mr != kr. 
+ if ( b_alg == mc ) b_now = bli_align_dim_to_mult( b_now, mr ); + else if ( b_alg == nc ) b_now = bli_align_dim_to_mult( b_now, nr ); + else if ( b_alg == kc ) b_now = bli_align_dim_to_mult( b_now, kr ); + } + else + { + b_now = b_alg; + } + +//printf( "bli_determine_blocksize1: returning %lu\n", b_now ); + + return b_now; +#endif + +#if 0 + num_t dt; + dim_t b_alg, b_now; + dim_t mc, nc, kc; + dim_t mr, nr, kr; + dim_t dim_left_now; + + dt = bli_obj_execution_datatype( *obj ); + b_alg = bli_blksz_for_type( dt, b ); + + mc = bli_blksz_for_type( dt, gemm_mc ); + nc = bli_blksz_for_type( dt, gemm_nc ); + kc = bli_blksz_for_type( dt, gemm_kc ); + + mr = bli_blksz_for_type( dt, gemm_mr ); + nr = bli_blksz_for_type( dt, gemm_nr ); + kr = bli_blksz_for_type( dt, gemm_kr ); + + dim_left_now = dim - i; + + if ( dim_left_now <= b_alg ) + { + b_now = dim_left_now; + } + else if ( dim_left_now <= 2 * b_alg ) + { + b_now = dim_left_now / 2; + + // This actually won't work when, for example, mc == kc but mr != kr. + if ( b_alg == mc ) b_now = bli_align_dim_to_mult( b_now, mr ); + else if ( b_alg == nc ) b_now = bli_align_dim_to_mult( b_now, nr ); + else if ( b_alg == kc ) b_now = bli_align_dim_to_mult( b_now, kr ); + } + else + { + b_now = b_alg; + } + +//printf( "bli_determine_blocksize2: returning %lu\n", b_now ); + + return b_now; +#endif + +#ifdef BLIS_EDGECASE_HACK + num_t dt; + dim_t b_alg, b_now; + dim_t dim_left_now; + + dt = bli_obj_execution_datatype( *obj ); + b_alg = bli_blksz_for_type( dt, b ); + + dim_left_now = dim - i; + + if ( dim_left_now <= b_alg + b_alg/4 ) + { + b_now = dim_left_now; + } + else + { + b_now = b_alg; + } + + return b_now; +#else + num_t dt; + dim_t b_alg; + + // We assume that this function is being called from an algorithm that + // is moving "forward" (ie: top to bottom, left to right, top-left + // to bottom-right). + + // Extract the execution datatype and use it to query the corresponding + // blocksize value from the blksz_t object. 
+ dt = bli_obj_execution_datatype( *obj ); + b_alg = bli_blksz_for_type( dt, b ); + + // If we are moving "forward" (ie: top to bottom, left to right, or + // top-left to bottom-right), then return b_alg, unless dim - i is + // smaller, in which case we return that remaining value. + b_alg = bli_min( b_alg, dim - i ); + + + return b_alg; +#endif } dim_t bli_determine_blocksize_b( dim_t i, diff --git a/frame/base/bli_mem.c b/frame/base/bli_mem.c index ef72b264c..cd4731de9 100644 --- a/frame/base/bli_mem.c +++ b/frame/base/bli_mem.c @@ -37,9 +37,9 @@ // Define the size of pool blocks. These may be adjusted so that they can // handle inflated blocksizes at edge cases. -#define BLIS_POOL_MC_D BLIS_DEFAULT_MC_D -#define BLIS_POOL_KC_D BLIS_DEFAULT_KC_D -#define BLIS_POOL_NC_D BLIS_DEFAULT_NC_D +#define BLIS_POOL_MC_D ( ( BLIS_MAXIMUM_MC_D * BLIS_PACKDIM_MR_D ) / BLIS_DEFAULT_MR_D ) +#define BLIS_POOL_KC_D ( ( BLIS_MAXIMUM_KC_D * BLIS_PACKDIM_KR_D ) / BLIS_DEFAULT_KR_D ) +#define BLIS_POOL_NC_D ( ( BLIS_MAXIMUM_NC_D * BLIS_PACKDIM_NR_D ) / BLIS_DEFAULT_NR_D ) // Define each pool's block size. 
// NOTE: Here we assume the "worst" case of the register blocking diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index 240f0d957..4e7187a95 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -510,8 +510,8 @@ void bli_obj_print( char* label, obj_t* obj ) fprintf( file, " - buf %p\n", bli_mem_buffer( pack_mem ) ); fprintf( file, " - buf_type %u\n", bli_mem_buf_type( pack_mem ) ); fprintf( file, " - size %lu\n", bli_mem_size( pack_mem ) ); - fprintf( file, " m_packed %lu\n", bli_obj_packed_length( *obj ) ); - fprintf( file, " n_packed %lu\n", bli_obj_packed_width( *obj ) ); + fprintf( file, " m_padded %lu\n", bli_obj_padded_length( *obj ) ); + fprintf( file, " n_padded %lu\n", bli_obj_padded_width( *obj ) ); fprintf( file, " ps %lu\n", bli_obj_panel_stride( *obj ) ); fprintf( file, "\n" ); diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index bc1f02549..e3955a9b9 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -35,52 +35,214 @@ #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H +#define SIZEOF_S 4 +#define SIZEOF_D 8 +#define SIZEOF_C 8 +#define SIZEOF_Z 16 -// Redefine kernel blocksizes, defined in bli_kernel.h, to shorter -// names that can be derived via PASTEMAC macro. -// Cache blocksizes +// -- Kernel macro checks ------------------------------------------------------ -#define bli_smc BLIS_DEFAULT_MC_S -#define bli_snc BLIS_DEFAULT_NC_S -#define bli_skc BLIS_DEFAULT_KC_S +// Verify that cache blocksizes are whole multiples of register blocksizes. +// Specifically, verify that: +// - MC is a whole multiple of MR. +// - NC is a whole multiple of NR. +// - KC is a whole multiple of KR. +// These constraints are enforced because it makes it easier to handle diagonals +// in the macro-kernel implementations. 
+#if ( \ + ( BLIS_DEFAULT_MC_S % BLIS_DEFAULT_MR_S != 0 ) || \ + ( BLIS_DEFAULT_MC_D % BLIS_DEFAULT_MR_D != 0 ) || \ + ( BLIS_DEFAULT_MC_C % BLIS_DEFAULT_MR_C != 0 ) || \ + ( BLIS_DEFAULT_MC_Z % BLIS_DEFAULT_MR_Z != 0 ) \ + ) + #error MC must be multiple of MR for all datatypes. +#endif -#define bli_dmc BLIS_DEFAULT_MC_D -#define bli_dnc BLIS_DEFAULT_NC_D -#define bli_dkc BLIS_DEFAULT_KC_D +#if ( \ + ( BLIS_DEFAULT_NC_S % BLIS_DEFAULT_NR_S != 0 ) || \ + ( BLIS_DEFAULT_NC_D % BLIS_DEFAULT_NR_D != 0 ) || \ + ( BLIS_DEFAULT_NC_C % BLIS_DEFAULT_NR_C != 0 ) || \ + ( BLIS_DEFAULT_NC_Z % BLIS_DEFAULT_NR_Z != 0 ) \ + ) + #error NC must be multiple of NR for all datatypes. +#endif -#define bli_cmc BLIS_DEFAULT_MC_C -#define bli_cnc BLIS_DEFAULT_NC_C -#define bli_ckc BLIS_DEFAULT_KC_C +#if ( \ + ( BLIS_DEFAULT_KC_S % BLIS_DEFAULT_KR_S != 0 ) || \ + ( BLIS_DEFAULT_KC_D % BLIS_DEFAULT_KR_D != 0 ) || \ + ( BLIS_DEFAULT_KC_C % BLIS_DEFAULT_KR_C != 0 ) || \ + ( BLIS_DEFAULT_KC_Z % BLIS_DEFAULT_KR_Z != 0 ) \ + ) + #error KC must be multiple of KR for all datatypes. +#endif -#define bli_zmc BLIS_DEFAULT_MC_Z -#define bli_znc BLIS_DEFAULT_NC_Z -#define bli_zkc BLIS_DEFAULT_KC_Z +// Verify that cache blocksizes indicate consistent storage. +// Specifically, verify that: +// - MC_D * KC_D >= MC_? * KC_?. +// - KC_D * NC_D >= KC_? * NC_?. +// - MC_D * NC_D >= MC_? * NC_?. +// These constraints are enforced because static memory is allocated for the +// contiguous memory allocator using the double-precision real values of MC, +// NC, and KC. +#if ( \ + ( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_KC_D * SIZEOF_D ) < \ + ( BLIS_DEFAULT_MC_S * BLIS_DEFAULT_KC_S * SIZEOF_S ) ) || \ + ( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_KC_D * SIZEOF_D ) < \ + ( BLIS_DEFAULT_MC_C * BLIS_DEFAULT_KC_C * SIZEOF_C ) ) || \ + ( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_KC_D * SIZEOF_D ) < \ + ( BLIS_DEFAULT_MC_Z * BLIS_DEFAULT_KC_Z * SIZEOF_Z ) ) \ + ) + #error MC_D*KC_D must be >= that of MC*KC for all other datatypes. 
+#endif + +#if ( \ + ( ( BLIS_DEFAULT_KC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \ + ( BLIS_DEFAULT_KC_S * BLIS_DEFAULT_NC_S * SIZEOF_S ) ) || \ + ( ( BLIS_DEFAULT_KC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \ + ( BLIS_DEFAULT_KC_C * BLIS_DEFAULT_NC_C * SIZEOF_C ) ) || \ + ( ( BLIS_DEFAULT_KC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \ + ( BLIS_DEFAULT_KC_Z * BLIS_DEFAULT_NC_Z * SIZEOF_Z ) ) \ + ) + #error KC_D*NC_D must be >= that of KC*NC for all other datatypes. +#endif + +/* +#if ( \ + ( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \ + ( BLIS_DEFAULT_MC_S * BLIS_DEFAULT_NC_S * SIZEOF_S ) ) || \ + ( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \ + ( BLIS_DEFAULT_MC_C * BLIS_DEFAULT_NC_C * SIZEOF_C ) ) || \ + ( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \ + ( BLIS_DEFAULT_MC_Z * BLIS_DEFAULT_NC_Z * SIZEOF_Z ) ) \ + ) + #error MC_D*NC_D must be >= that of MC*NC for all other datatypes. +#endif +*/ + + +// -- Compute maximum cache blocksizes ----------------------------------------- + +#define BLIS_MAXIMUM_MC_S ( BLIS_DEFAULT_MC_S + BLIS_EXTEND_MC_S ) +#define BLIS_MAXIMUM_KC_S ( BLIS_DEFAULT_KC_S + BLIS_EXTEND_KC_S ) +#define BLIS_MAXIMUM_NC_S ( BLIS_DEFAULT_NC_S + BLIS_EXTEND_NC_S ) + +#define BLIS_MAXIMUM_MC_D ( BLIS_DEFAULT_MC_D + BLIS_EXTEND_MC_D ) +#define BLIS_MAXIMUM_KC_D ( BLIS_DEFAULT_KC_D + BLIS_EXTEND_KC_D ) +#define BLIS_MAXIMUM_NC_D ( BLIS_DEFAULT_NC_D + BLIS_EXTEND_NC_D ) + +#define BLIS_MAXIMUM_MC_C ( BLIS_DEFAULT_MC_C + BLIS_EXTEND_MC_C ) +#define BLIS_MAXIMUM_KC_C ( BLIS_DEFAULT_KC_C + BLIS_EXTEND_KC_C ) +#define BLIS_MAXIMUM_NC_C ( BLIS_DEFAULT_NC_C + BLIS_EXTEND_NC_C ) + +#define BLIS_MAXIMUM_MC_Z ( BLIS_DEFAULT_MC_Z + BLIS_EXTEND_MC_Z ) +#define BLIS_MAXIMUM_KC_Z ( BLIS_DEFAULT_KC_Z + BLIS_EXTEND_KC_Z ) +#define BLIS_MAXIMUM_NC_Z ( BLIS_DEFAULT_NC_Z + BLIS_EXTEND_NC_Z ) + + +// -- Compute leading dim blocksizes used for packing -------------------------- + +#define BLIS_PACKDIM_MR_S ( BLIS_DEFAULT_MR_S + BLIS_EXTEND_MR_S ) 
+#define BLIS_PACKDIM_KR_S ( BLIS_DEFAULT_KR_S + BLIS_EXTEND_KR_S ) +#define BLIS_PACKDIM_NR_S ( BLIS_DEFAULT_NR_S + BLIS_EXTEND_NR_S ) + +#define BLIS_PACKDIM_MR_D ( BLIS_DEFAULT_MR_D + BLIS_EXTEND_MR_D ) +#define BLIS_PACKDIM_KR_D ( BLIS_DEFAULT_KR_D + BLIS_EXTEND_KR_D ) +#define BLIS_PACKDIM_NR_D ( BLIS_DEFAULT_NR_D + BLIS_EXTEND_NR_D ) + +#define BLIS_PACKDIM_MR_C ( BLIS_DEFAULT_MR_C + BLIS_EXTEND_MR_C ) +#define BLIS_PACKDIM_KR_C ( BLIS_DEFAULT_KR_C + BLIS_EXTEND_KR_C ) +#define BLIS_PACKDIM_NR_C ( BLIS_DEFAULT_NR_C + BLIS_EXTEND_NR_C ) + +#define BLIS_PACKDIM_MR_Z ( BLIS_DEFAULT_MR_Z + BLIS_EXTEND_MR_Z ) +#define BLIS_PACKDIM_KR_Z ( BLIS_DEFAULT_KR_Z + BLIS_EXTEND_KR_Z ) +#define BLIS_PACKDIM_NR_Z ( BLIS_DEFAULT_NR_Z + BLIS_EXTEND_NR_Z ) + + +// -- Abbreviated kernel blocksize macros ------------------------------------- + +// Here, we shorten the blocksizes defined in bli_kernel.h so that they can be +// derived via the PASTEMAC macro. + +// Default cache blocksizes + +#define bli_smc BLIS_DEFAULT_MC_S +#define bli_skc BLIS_DEFAULT_KC_S +#define bli_snc BLIS_DEFAULT_NC_S + +#define bli_dmc BLIS_DEFAULT_MC_D +#define bli_dkc BLIS_DEFAULT_KC_D +#define bli_dnc BLIS_DEFAULT_NC_D + +#define bli_cmc BLIS_DEFAULT_MC_C +#define bli_ckc BLIS_DEFAULT_KC_C +#define bli_cnc BLIS_DEFAULT_NC_C + +#define bli_zmc BLIS_DEFAULT_MC_Z +#define bli_zkc BLIS_DEFAULT_KC_Z +#define bli_znc BLIS_DEFAULT_NC_Z + +// Maximum cache blocksizes + +#define bli_smaxmc BLIS_MAXIMUM_MC_S +#define bli_smaxkc BLIS_MAXIMUM_KC_S +#define bli_smaxnc BLIS_MAXIMUM_NC_S + +#define bli_dmaxmc BLIS_MAXIMUM_MC_D +#define bli_dmaxkc BLIS_MAXIMUM_KC_D +#define bli_dmaxnc BLIS_MAXIMUM_NC_D + +#define bli_cmaxmc BLIS_MAXIMUM_MC_C +#define bli_cmaxkc BLIS_MAXIMUM_KC_C +#define bli_cmaxnc BLIS_MAXIMUM_NC_C + +#define bli_zmaxmc BLIS_MAXIMUM_MC_Z +#define bli_zmaxkc BLIS_MAXIMUM_KC_Z +#define bli_zmaxnc BLIS_MAXIMUM_NC_Z // Register blocksizes -#define bli_smr BLIS_DEFAULT_MR_S -#define bli_snr 
BLIS_DEFAULT_NR_S -#define bli_skr BLIS_DEFAULT_KR_S +#define bli_smr BLIS_DEFAULT_MR_S +#define bli_skr BLIS_DEFAULT_KR_S +#define bli_snr BLIS_DEFAULT_NR_S -#define bli_dmr BLIS_DEFAULT_MR_D -#define bli_dnr BLIS_DEFAULT_NR_D -#define bli_dkr BLIS_DEFAULT_KR_D +#define bli_dmr BLIS_DEFAULT_MR_D +#define bli_dkr BLIS_DEFAULT_KR_D +#define bli_dnr BLIS_DEFAULT_NR_D -#define bli_cmr BLIS_DEFAULT_MR_C -#define bli_cnr BLIS_DEFAULT_NR_C -#define bli_ckr BLIS_DEFAULT_KR_C +#define bli_cmr BLIS_DEFAULT_MR_C +#define bli_ckr BLIS_DEFAULT_KR_C +#define bli_cnr BLIS_DEFAULT_NR_C -#define bli_zmr BLIS_DEFAULT_MR_Z -#define bli_znr BLIS_DEFAULT_NR_Z -#define bli_zkr BLIS_DEFAULT_KR_Z +#define bli_zmr BLIS_DEFAULT_MR_Z +#define bli_zkr BLIS_DEFAULT_KR_Z +#define bli_znr BLIS_DEFAULT_NR_Z -// Duplication +// Micro-panel packing register blocksizes + +#define bli_spackmr BLIS_PACKDIM_MR_S +#define bli_spackkr BLIS_PACKDIM_KR_S +#define bli_spacknr BLIS_PACKDIM_NR_S + +#define bli_dpackmr BLIS_PACKDIM_MR_D +#define bli_dpackkr BLIS_PACKDIM_KR_D +#define bli_dpacknr BLIS_PACKDIM_NR_D + +#define bli_cpackmr BLIS_PACKDIM_MR_C +#define bli_cpackkr BLIS_PACKDIM_KR_C +#define bli_cpacknr BLIS_PACKDIM_NR_C + +#define bli_zpackmr BLIS_PACKDIM_MR_Z +#define bli_zpackkr BLIS_PACKDIM_KR_Z +#define bli_zpacknr BLIS_PACKDIM_NR_Z + +// Duplication factors + +#define bli_sndup BLIS_DEFAULT_NUM_DUPL_S +#define bli_dndup BLIS_DEFAULT_NUM_DUPL_D +#define bli_cndup BLIS_DEFAULT_NUM_DUPL_C +#define bli_zndup BLIS_DEFAULT_NUM_DUPL_Z -#define bli_sndup BLIS_DEFAULT_NUM_DUPL_S -#define bli_dndup BLIS_DEFAULT_NUM_DUPL_D -#define bli_cndup BLIS_DEFAULT_NUM_DUPL_C -#define bli_zndup BLIS_DEFAULT_NUM_DUPL_Z #endif diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index b242abe31..77c47f85f 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -710,30 +710,44 @@ bli_obj_width_stored( obj ) // Packed dimensions query -#define 
bli_obj_packed_length( obj ) \ +#define bli_obj_padded_length( obj ) \ \ - ( (obj).m_packed ) + ( (obj).m_padded ) -#define bli_obj_packed_width( obj ) \ +#define bli_obj_padded_width( obj ) \ \ - ( (obj).n_packed ) + ( (obj).n_padded ) // Packed dimensions modification -#define bli_obj_set_packed_length( m0, obj ) \ +#define bli_obj_set_padded_length( m0, obj ) \ { \ - (obj).m_packed = m0; \ + (obj).m_padded = m0; \ } -#define bli_obj_set_packed_width( n0, obj ) \ +#define bli_obj_set_padded_width( n0, obj ) \ { \ - (obj).n_packed = n0; \ + (obj).n_padded = n0; \ } -#define bli_obj_set_packed_dims( m0, n0, obj ) \ +#define bli_obj_set_padded_dims( m0, n0, obj ) \ { \ - bli_obj_set_packed_length( m0, obj ); \ - bli_obj_set_packed_width( n0, obj ); \ + bli_obj_set_padded_length( m0, obj ); \ + bli_obj_set_padded_width( n0, obj ); \ +} + + +// Packed panel dimension query + +#define bli_obj_panel_dim( obj ) \ +\ + ((obj).pd) + +// Packed panel dimension modification + +#define bli_obj_set_panel_dim( panel_dim, obj ) \ +{ \ + (obj).pd = panel_dim; \ } diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index a9d609abb..7eafe595d 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -388,9 +388,11 @@ typedef struct obj_s // Pack-related fields mem_t pack_mem; // cached memory region for packing - dim_t m_packed; - dim_t n_packed; + dim_t m_padded; // m dimension of matrix, including any padding + dim_t n_padded; // n dimension of matrix, including any padding inc_t ps; // panel stride (distance to next panel) + inc_t pd; // panel dimension (the "width" of a panel: + // usually MR or NR) //mem_t cast_mem; // cached memory region for casting @@ -445,8 +447,9 @@ typedef struct obj_s those situations, we want the subpartition to inherit the pack_mem field, and the corresponding packed dimensions, of its parent. 
*/ \ (b).pack_mem = (a).pack_mem; \ - (b).m_packed = (a).m_packed; \ - (b).n_packed = (a).n_packed; \ + (b).m_padded = (a).m_padded; \ + (b).n_padded = (a).n_padded; \ + (b).pd = (a).pd; \ (b).ps = (a).ps; \ \ /*(b).cast_mem = (a).cast_mem;*/ \ diff --git a/test/test_blis2.c b/test/test_blis2.c index c697876d3..3f38258b8 100644 --- a/test/test_blis2.c +++ b/test/test_blis2.c @@ -240,8 +240,8 @@ int main( int argc, char** argv ) packm_cntl_a = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - mr, - kr, + mr, NULL, + kr, NULL, TRUE, // scale? TRUE, // densify? FALSE, // invert diagonal? @@ -252,8 +252,8 @@ int main( int argc, char** argv ) packm_cntl_b = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - kr, - nr, + kr, NULL, + nr, NULL, FALSE, // scale? FALSE, // densify? FALSE, // invert diagonal? diff --git a/test/test_gemm.c b/test/test_gemm.c index 696418bc0..79c620a07 100644 --- a/test/test_gemm.c +++ b/test/test_gemm.c @@ -38,12 +38,11 @@ // transa transb m n k alpha a lda b ldb beta c ldc void dgemm_( char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* ); -//#define PRINT +#define PRINT int main( int argc, char** argv ) { obj_t a, b, c; - obj_t a_pack, b_pack; obj_t c_save; obj_t alpha, beta; dim_t m, n, k; @@ -54,6 +53,9 @@ int main( int argc, char** argv ) num_t dt_alpha, dt_beta; int r, n_repeats; +#if 0 + obj_t a_pack, b_pack; + blksz_t* mr; blksz_t* nr; blksz_t* kr; @@ -70,6 +72,7 @@ int main( int argc, char** argv ) gemm_t* gemm_cntl_op_bp; gemm_t* gemm_cntl_mm_op; gemm_t* gemm_cntl_vl_mm; +#endif double dtime; double dtime_save; @@ -132,6 +135,7 @@ int main( int argc, char** argv ) bli_setsc( (2.0/1.0), 0.0, &alpha ); bli_setsc( -(1.0/1.0), 0.0, &beta ); +#if 0 mr = bli_blksz_obj_create( 2, 4, 2, 2 ); kr = bli_blksz_obj_create( 1, 1, 1, 1 ); nr = bli_blksz_obj_create( 1, 4, 1, 1 ); @@ -215,7 +219,7 @@ int main( int argc, char** argv ) bli_obj_init_pack( &a_pack ); bli_obj_init_pack( &b_pack ); - 
+#endif bli_copym( &c, &c_save ); @@ -291,6 +295,7 @@ int main( int argc, char** argv ) printf( "( %2ld, 1:5 ) = [ %4lu %4lu %4lu %10.3e %6.3f ];\n", (p - p_begin + 1)/p_inc + 1, m, k, n, dtime_save, gflops ); +#if 0 bli_obj_release_pack( &a_pack ); bli_obj_release_pack( &b_pack ); @@ -309,6 +314,7 @@ int main( int argc, char** argv ) bli_cntl_obj_free( gemm_cntl_op_bp ); bli_cntl_obj_free( gemm_cntl_mm_op ); bli_cntl_obj_free( gemm_cntl_vl_mm ); +#endif bli_obj_free( &alpha ); bli_obj_free( &beta ); diff --git a/test/test_hemm.c b/test/test_hemm.c index 84777693e..6af82b227 100644 --- a/test/test_hemm.c +++ b/test/test_hemm.c @@ -149,8 +149,8 @@ int main( int argc, char** argv ) packm_cntl_a = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - mr, - kr, + mr, NULL, + kr, NULL, FALSE, // scale? TRUE, // densify? FALSE, // invert diagonal? @@ -162,8 +162,8 @@ int main( int argc, char** argv ) packm_cntl_b = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - kr, - nr, + kr, NULL, + nr, NULL, FALSE, // scale? FALSE, // densify? FALSE, // invert diagonal? diff --git a/test/test_her2k.c b/test/test_her2k.c index 8a4a23f45..2fab5d98e 100644 --- a/test/test_her2k.c +++ b/test/test_her2k.c @@ -146,8 +146,8 @@ int main( int argc, char** argv ) packm_cntl_a = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - mr, - kr, + mr, NULL, + kr, NULL, FALSE, // scale? FALSE, // densify? FALSE, // invert diagonal? @@ -159,8 +159,8 @@ int main( int argc, char** argv ) packm_cntl_b = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - kr, - nr, + kr, NULL, + nr, NULL, FALSE, // scale? FALSE, // densify? FALSE, // invert diagonal? diff --git a/test/test_herk.c b/test/test_herk.c index dc774e8a6..71063ae06 100644 --- a/test/test_herk.c +++ b/test/test_herk.c @@ -142,8 +142,8 @@ int main( int argc, char** argv ) packm_cntl_a = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - mr, - kr, + mr, NULL, + kr, NULL, FALSE, // scale? FALSE, // densify? 
FALSE, // invert diagonal? @@ -155,8 +155,8 @@ int main( int argc, char** argv ) packm_cntl_b = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - kr, - nr, + kr, NULL, + nr, NULL, FALSE, // scale? FALSE, // densify? FALSE, // invert diagonal? diff --git a/test/test_trmm.c b/test/test_trmm.c index ffc50caec..d063790ab 100644 --- a/test/test_trmm.c +++ b/test/test_trmm.c @@ -162,8 +162,8 @@ int main( int argc, char** argv ) packm_cntl_a = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, - mr, // IMPORTANT: for consistency with trsm, "k" dim - mr, // multiple is set to mr. + mr, NULL, // IMPORTANT: for consistency with trsm, "k" dim + mr, NULL, // multiple is set to mr. FALSE, // scale? TRUE, // densify? FALSE, // invert diagonal? @@ -175,8 +175,8 @@ int main( int argc, char** argv ) packm_cntl_b = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - mr, // IMPORTANT: m dim multiple here must be mr - nr, // since "k" dim multiple is set to mr above. + mr, NULL, // IMPORTANT: m dim multiple here must be mr + nr, NULL, // since "k" dim multiple is set to mr above. FALSE, // scale? FALSE, // densify? FALSE, // invert diagonal? diff --git a/test/test_trsm.c b/test/test_trsm.c index 193e5f358..0fd8e8703 100644 --- a/test/test_trsm.c +++ b/test/test_trsm.c @@ -151,8 +151,8 @@ int main( int argc, char** argv ) packm_cntl_a = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, - mr, // IMPORTANT: "k" dim multiple must be mr to - mr, // support using ukernel for right/bottom-right + mr, NULL, // IMPORTANT: "k" dim multiple must be mr to + mr, NULL, // support using ukernel for right/bottom-right // edge cases (see macro-kernel for comments). FALSE, // scale? TRUE, // densify? @@ -165,8 +165,8 @@ int main( int argc, char** argv ) packm_cntl_b = bli_packm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, - mr, // IMPORTANT: m dim multiple here must be mr - nr, // since "k" dim multiple is set to mr above. 
+ mr, NULL, // IMPORTANT: m dim multiple here must be mr + nr, NULL, // since "k" dim multiple is set to mr above. TRUE, // scale? FALSE, // densify? FALSE, // invert diagonal? diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 45dec07b7..d035bce62 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -490,6 +490,23 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) BLIS_DEFAULT_NC_C, BLIS_DEFAULT_NC_Z ); libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "level-3 cache blksz exts s d c z \n" ); + libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n", + BLIS_EXTEND_MC_S, + BLIS_EXTEND_MC_D, + BLIS_EXTEND_MC_C, + BLIS_EXTEND_MC_Z ); + libblis_test_fprintf_c( os, " k dimension %5u %5u %5u %5u\n", + BLIS_EXTEND_KC_S, + BLIS_EXTEND_KC_D, + BLIS_EXTEND_KC_C, + BLIS_EXTEND_KC_Z ); + libblis_test_fprintf_c( os, " n dimension %5u %5u %5u %5u\n", + BLIS_EXTEND_NC_S, + BLIS_EXTEND_NC_D, + BLIS_EXTEND_NC_C, + BLIS_EXTEND_NC_Z ); + libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "level-3 register blocksizes \n" ); libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n", BLIS_DEFAULT_MR_S, @@ -502,6 +519,18 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) BLIS_DEFAULT_NR_C, BLIS_DEFAULT_NR_Z ); libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "level-3 register blksz exts s d c z \n" ); + libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n", + BLIS_EXTEND_MR_S, + BLIS_EXTEND_MR_D, + BLIS_EXTEND_MR_C, + BLIS_EXTEND_MR_Z ); + libblis_test_fprintf_c( os, " n dimension %5u %5u %5u %5u\n", + BLIS_EXTEND_NR_S, + BLIS_EXTEND_NR_D, + BLIS_EXTEND_NR_C, + BLIS_EXTEND_NR_Z ); + libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "level-3 packing duplication \n" ); libblis_test_fprintf_c( os, " dupl. factors for B %5u %5u %5u %5u\n", BLIS_DEFAULT_NUM_DUPL_S,