mirror of
https://github.com/amd/blis.git
synced 2026-06-06 04:34:02 +00:00
Allow ldim of packed micro-panels != MR, NR.
Details:
- Made substantial changes throughout the framework to decouple the leading
dimension (row or column stride) used within each packed micro-panel from
the corresponding register blocksize. It appears advantageous on some
systems to use, for example, packed micro-panels of A where the column
stride is greater than MR (whereas previously it was always equal to MR).
- Changes include:
- Added BLIS_EXTEND_[MNK]R_? macros, which specify how much extra padding
to use when packing micro-panels of A and B.
- Adjusted all packing routines and macro-kernels to use PACKMR and PACKNR
where appropriate, instead of MR and NR.
- Added pd field (panel dimension) to obj_t.
- New interface to bli_packm_cntl_obj_create().
- Renamed bli_obj_packed_length()/_width() macros to
bli_obj_padded_length()/_width().
- Removed local #defines for cache/register blocksizes in level-3 *_cntl.c.
- Print out new cache and register blocksize extensions in test suite.
- Also added new BLIS_EXTEND_[MNK]C_? macros for future use in using a larger
blocksize for edge cases, which can improve performance at the margins.
This commit is contained in:
@@ -77,7 +77,29 @@
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 8192
|
||||
|
||||
// -- Default register blocksizes for inner kernel --
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above.
|
||||
|
||||
// NOTE: These values are not yet used.
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
@@ -104,6 +126,31 @@
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
// NOTE: These register blocksize "extensions" determine whether the
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
@@ -162,7 +209,7 @@
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
|
||||
137
config/flame/bli_config.h
Normal file
137
config/flame/bli_config.h
Normal file
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_CONFIG_H
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
|
||||
// -- OPERATING SYSTEM ---------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
|
||||
|
||||
#define BLIS_NUM_FP_TYPES 4
|
||||
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
|
||||
|
||||
|
||||
|
||||
// -- MULTITHREADING -----------------------------------------------------------
|
||||
|
||||
// The maximum number of BLIS threads that will run concurrently.
|
||||
#define BLIS_MAX_NUM_THREADS 24
|
||||
|
||||
|
||||
|
||||
// -- MEMORY ALLOCATION --------------------------------------------------------
|
||||
|
||||
// -- Contiguous (static) memory allocator --
|
||||
|
||||
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
|
||||
// contiguous memory pools.
|
||||
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_KC_X_NC_BLOCKS 1
|
||||
#define BLIS_NUM_MC_X_NC_BLOCKS 0
|
||||
|
||||
// The maximum preload byte offset is used to pad the end of the contiguous
|
||||
// memory pools so that the micro-kernel, when computing with the end of the
|
||||
// last block, can exceed the bounds of the usable portion of the memory
|
||||
// region without causing a segmentation fault.
|
||||
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
|
||||
|
||||
// -- Memory alignment --
|
||||
|
||||
// It is sometimes useful to define the various memory alignments in terms
|
||||
// of some other characteristics of the system, such as the cache line size
|
||||
// and the page size.
|
||||
#define BLIS_CACHE_LINE_SIZE 64
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE 16
|
||||
|
||||
// Alignment size used when allocating memory dynamically from the operating
|
||||
// system (eg: posix_memalign()). To disable heap alignment and just use
|
||||
// malloc() instead, set this to 1.
|
||||
#define BLIS_HEAP_ADDR_ALIGN_SIZE 16
|
||||
|
||||
// Alignment size used when sizing leading dimensions of dynamically
|
||||
// allocated memory.
|
||||
#define BLIS_HEAP_STRIDE_ALIGN_SIZE 16
|
||||
|
||||
// Alignment size used when allocating entire blocks of contiguous memory
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE 16
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 16
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
// Basic (homogeneous) datatype support always enabled.
|
||||
|
||||
// Enable mixed domain operations?
|
||||
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
|
||||
// Enable extra mixed precision operations?
|
||||
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
|
||||
|
||||
|
||||
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
|
||||
|
||||
// Stay initialized after auto-initialization, unless and until the user
|
||||
// explicitly calls bli_finalize().
|
||||
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
|
||||
|
||||
|
||||
|
||||
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
|
||||
|
||||
// Enable the BLAS compatibility layer?
|
||||
#define BLIS_ENABLE_BLAS2BLIS
|
||||
|
||||
// Fortran-77 name-mangling macros.
|
||||
#define PASTEF77(ch1,name) ch1 ## name ## _
|
||||
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
335
config/flame/bli_kernel.h
Normal file
335
config/flame/bli_kernel.h
Normal file
@@ -0,0 +1,335 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_KERNEL_H
|
||||
#define BLIS_KERNEL_H
|
||||
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
//
|
||||
// (1) MC must be a multiple of:
|
||||
// (a) MR (for zero-padding purposes)
|
||||
// (2) NC must be a multiple of
|
||||
// (a) NR (for zero-padding purposes)
|
||||
// (3) KC must be a multiple of
|
||||
// (a) MR and
|
||||
// (b) NR
|
||||
// for triangular operations such as trmm and trsm.
|
||||
//
|
||||
// NOTE: For BLIS libraries built on block-panel macro-kernels, constraint (3b)
|
||||
// is relaxed. In this case, (3a) is needed for operations where matrix A is
|
||||
// triangular (trmm, trsm), because we want the diagonal offset of any packed
|
||||
// panel of matrix A to be a multiple of MR. If, instead, the library were to
|
||||
// be built on panel-block macro-kernels, the matrix with structure would be
|
||||
// on the right, rather than the left, and thus it would be constraint (3b)
|
||||
// that would be needed instead of (3a).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 256
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 8192
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 128
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 128
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
//#define BLIS_EDGECASE_HACK 1
|
||||
|
||||
// -- Default register blocksizes for inner kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 4
|
||||
#define BLIS_DEFAULT_NR_D 2
|
||||
|
||||
#define BLIS_DEFAULT_MR_C 8
|
||||
#define BLIS_DEFAULT_NR_C 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 8
|
||||
#define BLIS_DEFAULT_NR_Z 4
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
// of duplication needed when configuring level-3 macro-kernels that
|
||||
// copy and duplicate elements of B to a temporary duplication buffer
|
||||
// (so that element-wise vector multiplication and addition instructions
|
||||
// can be used).
|
||||
|
||||
#define BLIS_NUM_ELEM_PER_REG_S 4
|
||||
#define BLIS_NUM_ELEM_PER_REG_D 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_C 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_Z 1
|
||||
|
||||
// -- Default switch for duplication of B --
|
||||
|
||||
// NOTE: Setting these values to 1 disables duplication. Any value
|
||||
// d > 1 results in d-1 duplicates created within a special macro-kernel
|
||||
// buffer of dimension k x NR*d.
|
||||
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
|
||||
#define BLIS_DEFAULT_NUM_DUPL_S 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_D 2
|
||||
#define BLIS_DEFAULT_NUM_DUPL_C 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_Z 1
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
// used by certain blocked variants. But when they *are* used, they MUST
|
||||
// be an integer multiple of NR!
|
||||
|
||||
#define BLIS_DEFAULT_NI_FAC 16
|
||||
#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
|
||||
#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
|
||||
#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
|
||||
#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
|
||||
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
#include "bli_gemm_opt_d4x2.h"
|
||||
|
||||
// -- dupl --
|
||||
|
||||
#define DUPL_KERNEL dupl_unb_var1
|
||||
|
||||
// -- gemm --
|
||||
|
||||
//#define GEMM_UKERNEL gemm_ref_4x4
|
||||
#define GEMM_UKERNEL gemm_opt_d4x2
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
//#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_4x4
|
||||
//#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_4x4
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
//#define TRSM_L_UKERNEL trsm_l_ref_4x4
|
||||
//#define TRSM_U_UKERNEL trsm_u_ref_4x4
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copynzv --
|
||||
|
||||
#define COPYNZV_KERNEL copynzv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
1
config/flame/kernels
Symbolic link
1
config/flame/kernels
Symbolic link
@@ -0,0 +1 @@
|
||||
../../kernels/x86/3/
|
||||
104
config/flame/make_defs.mk
Normal file
104
config/flame/make_defs.mk
Normal file
@@ -0,0 +1,104 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2013, The University of Texas
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name of The University of Texas nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
# Only include this block of code once.
|
||||
ifndef MAKE_DEFS_MK_INCLUDED
|
||||
MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
|
||||
# --- Determine the C compiler and related flags ---
|
||||
CC := gcc
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 # -fopenmp -pg
|
||||
CDBGFLAGS := -g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O2 -malign-double -funroll-loops
|
||||
CVECFLAGS := -msse3 -march=native # -mfpmath=sse
|
||||
|
||||
# Aggregate all of the flags into two groups: one for optimizable code, and
|
||||
# one for code that should not be optimized.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
LDFLAGS :=
|
||||
|
||||
|
||||
|
||||
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
|
||||
endif
|
||||
@@ -77,7 +77,29 @@
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Default register blocksizes for inner kernel --
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above.
|
||||
|
||||
// NOTE: These values are not yet used.
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
@@ -104,6 +126,31 @@
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
// NOTE: These register blocksize "extensions" determine whether the
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
|
||||
@@ -175,7 +175,7 @@ void bli_packv_init_pack( pack_t pack_schema,
|
||||
}
|
||||
|
||||
// Save the padded (packed) dimensions into the packed object.
|
||||
bli_obj_set_packed_dims( m_p_pad, 1, *p );
|
||||
bli_obj_set_padded_dims( m_p_pad, 1, *p );
|
||||
|
||||
// Grab the buffer address from the mem_t object and copy it to the
|
||||
// main object buffer field. (Sometimes this buffer address will be
|
||||
@@ -193,7 +193,7 @@ void bli_packv_init_pack( pack_t pack_schema,
|
||||
// how much space beyond the vector would need to be zero-padded, if
|
||||
// zero-padding was needed.
|
||||
rs_p = 1;
|
||||
cs_p = bli_obj_packed_length( *p );
|
||||
cs_p = bli_obj_padded_length( *p );
|
||||
|
||||
bli_obj_set_incs( rs_p, cs_p, *p );
|
||||
}
|
||||
|
||||
@@ -48,7 +48,8 @@ typedef void (*FUNCPTR_T)(
|
||||
dim_t n_max,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
dim_t pd_p, inc_t ps_p
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var2);
|
||||
@@ -68,8 +69,8 @@ void bli_packm_blk_var2( obj_t* beta,
|
||||
|
||||
dim_t m_p = bli_obj_length( *p );
|
||||
dim_t n_p = bli_obj_width( *p );
|
||||
dim_t m_max_p = bli_obj_packed_length( *p );
|
||||
dim_t n_max_p = bli_obj_packed_width( *p );
|
||||
dim_t m_max_p = bli_obj_padded_length( *p );
|
||||
dim_t n_max_p = bli_obj_padded_width( *p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
@@ -78,6 +79,7 @@ void bli_packm_blk_var2( obj_t* beta,
|
||||
void* buf_p = bli_obj_buffer_at_off( *p );
|
||||
inc_t rs_p = bli_obj_row_stride( *p );
|
||||
inc_t cs_p = bli_obj_col_stride( *p );
|
||||
dim_t pd_p = bli_obj_panel_dim( *p );
|
||||
inc_t ps_p = bli_obj_panel_stride( *p );
|
||||
|
||||
void* buf_beta = bli_obj_scalar_buffer( dt_cp, *beta );
|
||||
@@ -100,7 +102,8 @@ void bli_packm_blk_var2( obj_t* beta,
|
||||
n_max_p,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p, ps_p );
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p );
|
||||
}
|
||||
|
||||
|
||||
@@ -119,7 +122,8 @@ void PASTEMAC(ch,varname )( \
|
||||
dim_t n_max, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
@@ -190,12 +194,12 @@ void PASTEMAC(ch,varname )( \
|
||||
/* Prepare to pack to column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len = m; \
|
||||
panel_dim = rs_p; \
|
||||
panel_dim = pd_p; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t)panel_dim; \
|
||||
ldp = panel_dim; \
|
||||
ldp = rs_p; \
|
||||
m_panel = &m; \
|
||||
n_panel = &panel_dim_i; \
|
||||
m_panel_max = m_max; \
|
||||
@@ -206,12 +210,12 @@ void PASTEMAC(ch,varname )( \
|
||||
/* Prepare to pack to row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len = n; \
|
||||
panel_dim = cs_p; \
|
||||
panel_dim = pd_p; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
vs_c = rs_c; \
|
||||
diagoffc_inc = ( doff_t )panel_dim; \
|
||||
ldp = panel_dim; \
|
||||
ldp = cs_p; \
|
||||
m_panel = &panel_dim_i; \
|
||||
n_panel = &n; \
|
||||
m_panel_max = panel_dim; \
|
||||
@@ -433,7 +437,7 @@ void PASTEMAC(ch,varname )( \
|
||||
/*
|
||||
if ( rs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: a copied", m_panel_max, n_panel_max, \
|
||||
p_begin, 1, panel_dim, "%4.1f", "" ); \
|
||||
p_begin, 1, cs_p, "%4.1f", "" ); \
|
||||
if ( cs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: b copied", m_panel_max, n_panel_max, \
|
||||
p_begin, panel_dim, 1, "%6.3f", "" ); \
|
||||
|
||||
@@ -52,7 +52,8 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_max, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_blk_var2 )
|
||||
|
||||
@@ -51,7 +51,8 @@ typedef void (*FUNCPTR_T)(
|
||||
dim_t n_max,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
dim_t pd_p, inc_t ps_p
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3);
|
||||
@@ -74,8 +75,8 @@ void bli_packm_blk_var3( obj_t* beta,
|
||||
|
||||
dim_t m_p = bli_obj_length( *p );
|
||||
dim_t n_p = bli_obj_width( *p );
|
||||
dim_t m_max_p = bli_obj_packed_length( *p );
|
||||
dim_t n_max_p = bli_obj_packed_width( *p );
|
||||
dim_t m_max_p = bli_obj_padded_length( *p );
|
||||
dim_t n_max_p = bli_obj_padded_width( *p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
@@ -84,6 +85,7 @@ void bli_packm_blk_var3( obj_t* beta,
|
||||
void* buf_p = bli_obj_buffer_at_off( *p );
|
||||
inc_t rs_p = bli_obj_row_stride( *p );
|
||||
inc_t cs_p = bli_obj_col_stride( *p );
|
||||
dim_t pd_p = bli_obj_panel_dim( *p );
|
||||
inc_t ps_p = bli_obj_panel_stride( *p );
|
||||
|
||||
void* buf_beta = bli_obj_scalar_buffer( dt_cp, *beta );
|
||||
@@ -109,7 +111,8 @@ void bli_packm_blk_var3( obj_t* beta,
|
||||
n_max_p,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p, ps_p );
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p );
|
||||
}
|
||||
|
||||
|
||||
@@ -131,7 +134,8 @@ void PASTEMAC(ch,varname )( \
|
||||
dim_t n_max, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
@@ -159,7 +163,7 @@ void PASTEMAC(ch,varname )( \
|
||||
dim_t panel_off_i; \
|
||||
inc_t vs_c; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t p_inc; \
|
||||
inc_t ldp, p_inc; \
|
||||
dim_t* m_panel; \
|
||||
dim_t* n_panel; \
|
||||
dim_t m_panel_use; \
|
||||
@@ -199,11 +203,12 @@ void PASTEMAC(ch,varname )( \
|
||||
iter_dim = n; \
|
||||
panel_len = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim = rs_p; \
|
||||
panel_dim = pd_p; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t)panel_dim; \
|
||||
ldp = rs_p; \
|
||||
m_panel = &m; \
|
||||
n_panel = &panel_dim_i; \
|
||||
} \
|
||||
@@ -213,11 +218,12 @@ void PASTEMAC(ch,varname )( \
|
||||
iter_dim = m; \
|
||||
panel_len = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim = cs_p; \
|
||||
panel_dim = pd_p; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
vs_c = rs_c; \
|
||||
diagoffc_inc = ( doff_t )panel_dim; \
|
||||
ldp = cs_p; \
|
||||
m_panel = &panel_dim_i; \
|
||||
n_panel = &n; \
|
||||
} \
|
||||
@@ -303,7 +309,7 @@ void PASTEMAC(ch,varname )( \
|
||||
panel_len_i, \
|
||||
beta_cast, \
|
||||
c_use, incc, ldc, \
|
||||
p_use, panel_dim ); \
|
||||
p_use, ldp ); \
|
||||
\
|
||||
/* If the diagonal of C is implicitly unit, set the diagonal of
|
||||
the packed panel to unit. */ \
|
||||
@@ -351,7 +357,7 @@ void PASTEMAC(ch,varname )( \
|
||||
p_use, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
p_inc = panel_dim * panel_len_max_i; \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
@@ -369,9 +375,9 @@ void PASTEMAC(ch,varname )( \
|
||||
panel_len_i, \
|
||||
beta_cast, \
|
||||
c_use, incc, ldc, \
|
||||
p_use, panel_dim ); \
|
||||
p_use, ldp ); \
|
||||
\
|
||||
p_inc = panel_dim * panel_len_max_i; \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* If necessary, zero-pad at the edge of the panel dimension (ie: the
|
||||
@@ -382,7 +388,7 @@ void PASTEMAC(ch,varname )( \
|
||||
dim_t m_edge = panel_dim - i; \
|
||||
dim_t n_edge = panel_len_max_i; \
|
||||
inc_t rs_pe = 1; \
|
||||
inc_t cs_pe = panel_dim; \
|
||||
inc_t cs_pe = ldp; \
|
||||
ctype* p_edge = p_begin + (i )*rs_pe; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
@@ -402,7 +408,7 @@ void PASTEMAC(ch,varname )( \
|
||||
dim_t m_edge = panel_dim; \
|
||||
dim_t n_edge = panel_len_max_i - j; \
|
||||
inc_t rs_pe = 1; \
|
||||
inc_t cs_pe = panel_dim; \
|
||||
inc_t cs_pe = ldp; \
|
||||
ctype* p_edge = p_begin + (j )*cs_pe; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
@@ -427,7 +433,7 @@ void PASTEMAC(ch,varname )( \
|
||||
dim_t m_br = panel_dim - i; \
|
||||
dim_t n_br = panel_len_max_i - j; \
|
||||
inc_t rs_pe = 1; \
|
||||
inc_t cs_pe = panel_dim; \
|
||||
inc_t cs_pe = ldp; \
|
||||
ctype* p_edge = p_begin + (i )*rs_pe + (j )*cs_pe; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setd_unb_var1)( 0, \
|
||||
|
||||
@@ -55,7 +55,8 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_max, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_blk_var3 )
|
||||
|
||||
@@ -50,6 +50,9 @@ packm_t* packm_cntl_scale;
|
||||
blksz_t* packm_mult_ldim;
|
||||
blksz_t* packm_mult_nvec;
|
||||
|
||||
blksz_t* packm_mult_mext;
|
||||
blksz_t* packm_mult_next;
|
||||
|
||||
void bli_packm_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for m and n register blocking. We will attach
|
||||
@@ -70,6 +73,11 @@ void bli_packm_cntl_init()
|
||||
BLIS_DEFAULT_NR_C,
|
||||
BLIS_DEFAULT_NR_Z );
|
||||
|
||||
// Create blocksize extensions that simply contain zero, as these
|
||||
// fields are not used except by level-3 operations.
|
||||
packm_mult_mext = bli_blksz_obj_create( 0, 0, 0, 0 );
|
||||
packm_mult_next = bli_blksz_obj_create( 0, 0, 0, 0 );
|
||||
|
||||
// Generally speaking, the BLIS_PACKED_ROWS and BLIS_PACKED_COLUMNS
|
||||
// are used by the level-2 operations, and thus densification is not
|
||||
// necessary. These schemas amount to simple copies to row or column
|
||||
@@ -89,7 +97,9 @@ void bli_packm_cntl_init()
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1, // When packing to rows:
|
||||
packm_mult_nvec, // - nvec multiple is used for m dimension
|
||||
packm_mult_mext, // - m extension is zero / unused
|
||||
packm_mult_ldim, // - ldim multiple is used for n dimension
|
||||
packm_mult_next, // - n extension is zero / unused
|
||||
FALSE, // do NOT scale
|
||||
FALSE, // do NOT densify structure
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -102,7 +112,9 @@ void bli_packm_cntl_init()
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1, // When packing to rows:
|
||||
packm_mult_nvec, // - nvec multiple is used for m dimension
|
||||
packm_mult_mext, // - m extension is zero / unused
|
||||
packm_mult_ldim, // - ldim multiple is used for n dimension
|
||||
packm_mult_next, // - n extension is zero / unused
|
||||
TRUE, // do scale
|
||||
FALSE, // do NOT densify structure
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -118,7 +130,9 @@ void bli_packm_cntl_init()
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1, // When packing to columns:
|
||||
packm_mult_ldim, // - ldim multiple is used for m dimension
|
||||
packm_mult_mext, // - m extension is zero / unused
|
||||
packm_mult_nvec, // - nvec multiple is used for n dimension
|
||||
packm_mult_next, // - n extension is zero / unused
|
||||
FALSE, // do NOT scale
|
||||
FALSE, // do NOT densify structure
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -131,7 +145,9 @@ void bli_packm_cntl_init()
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1, // When packing to columns:
|
||||
packm_mult_ldim, // - ldim multiple is used for m dimension
|
||||
packm_mult_mext, // - m extension is zero / unused
|
||||
packm_mult_nvec, // - nvec multiple is used for n dimension
|
||||
packm_mult_next, // - n extension is zero / unused
|
||||
TRUE, // do scale
|
||||
FALSE, // do NOT densify structure
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -141,64 +157,6 @@ void bli_packm_cntl_init()
|
||||
BLIS_BUFFER_FOR_GEN_USE );
|
||||
|
||||
|
||||
// Create control trees to pack by row panels (with and without scaling).
|
||||
packm_cntl_rpn_noscale
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1, // When packing to row panels:
|
||||
packm_mult_nvec, // - nvec multiple is used for panel length
|
||||
packm_mult_ldim, // - ldim multiple is used for panel width
|
||||
FALSE, // do NOT scale
|
||||
TRUE, // densify structure
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // do NOT iterate backwards if upper
|
||||
FALSE, // do NOT iterate backwards if lower
|
||||
BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_BUFFER_FOR_GEN_USE );
|
||||
packm_cntl_rpn_scale
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1, // When packing to row panels:
|
||||
packm_mult_nvec, // - nvec multiple is used for panel length
|
||||
packm_mult_ldim, // - ldim multiple is used for panel width
|
||||
TRUE, // do scale
|
||||
TRUE, // densify structure
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // do NOT iterate backwards if upper
|
||||
FALSE, // do NOT iterate backwards if lower
|
||||
BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_BUFFER_FOR_GEN_USE );
|
||||
|
||||
|
||||
// Create control trees to pack by column panels (with and without scaling).
|
||||
packm_cntl_cpn_noscale
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1, // When packing to column panels:
|
||||
packm_mult_ldim, // - ldim multiple is used for panel length
|
||||
packm_mult_nvec, // - nvec multiple is used for panel width
|
||||
FALSE, // do NOT scale
|
||||
TRUE, // densify structure
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // do NOT iterate backwards if upper
|
||||
FALSE, // do NOT iterate backwards if lower
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
BLIS_BUFFER_FOR_GEN_USE );
|
||||
packm_cntl_cpn_scale
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1, // When packing to column panels:
|
||||
packm_mult_ldim, // - ldim multiple is used for panel length
|
||||
packm_mult_nvec, // - nvec multiple is used for panel width
|
||||
TRUE, // do scale
|
||||
TRUE, // densify structure
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // do NOT iterate backwards if upper
|
||||
FALSE, // do NOT iterate backwards if lower
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
BLIS_BUFFER_FOR_GEN_USE );
|
||||
|
||||
|
||||
// Set defaults when we don't care whether the packing is by rows or
|
||||
// by columns.
|
||||
packm_cntl_noscale = packm_cntl_col_noscale;
|
||||
@@ -212,19 +170,16 @@ void bli_packm_cntl_finalize()
|
||||
bli_cntl_obj_free( packm_cntl_col_noscale );
|
||||
bli_cntl_obj_free( packm_cntl_col_scale );
|
||||
|
||||
bli_cntl_obj_free( packm_cntl_rpn_noscale );
|
||||
bli_cntl_obj_free( packm_cntl_rpn_scale );
|
||||
bli_cntl_obj_free( packm_cntl_cpn_noscale );
|
||||
bli_cntl_obj_free( packm_cntl_cpn_scale );
|
||||
|
||||
bli_blksz_obj_free( packm_mult_ldim );
|
||||
bli_blksz_obj_free( packm_mult_nvec );
|
||||
}
|
||||
|
||||
packm_t* bli_packm_cntl_obj_create( impl_t impl_type,
|
||||
varnum_t var_num,
|
||||
blksz_t* mult_m,
|
||||
blksz_t* mult_n,
|
||||
blksz_t* mr_def,
|
||||
blksz_t* mr_ext,
|
||||
blksz_t* nr_def,
|
||||
blksz_t* nr_ext,
|
||||
bool_t does_scale,
|
||||
bool_t does_densify,
|
||||
bool_t does_invert_diag,
|
||||
@@ -239,8 +194,10 @@ packm_t* bli_packm_cntl_obj_create( impl_t impl_type,
|
||||
|
||||
cntl->impl_type = impl_type;
|
||||
cntl->var_num = var_num;
|
||||
cntl->mult_m = mult_m;
|
||||
cntl->mult_n = mult_n;
|
||||
cntl->mr_def = mr_def;
|
||||
cntl->mr_ext = mr_ext;
|
||||
cntl->nr_def = nr_def;
|
||||
cntl->nr_ext = nr_ext;
|
||||
cntl->does_scale = does_scale;
|
||||
cntl->does_densify = does_densify;
|
||||
cntl->does_invert_diag = does_invert_diag;
|
||||
@@ -255,8 +212,10 @@ packm_t* bli_packm_cntl_obj_create( impl_t impl_type,
|
||||
void bli_packm_cntl_obj_init( packm_t* cntl,
|
||||
impl_t impl_type,
|
||||
varnum_t var_num,
|
||||
blksz_t* mult_m,
|
||||
blksz_t* mult_n,
|
||||
blksz_t* mr_def,
|
||||
blksz_t* mr_ext,
|
||||
blksz_t* nr_def,
|
||||
blksz_t* nr_ext,
|
||||
bool_t does_scale,
|
||||
bool_t does_densify,
|
||||
bool_t does_invert_diag,
|
||||
@@ -267,8 +226,10 @@ void bli_packm_cntl_obj_init( packm_t* cntl,
|
||||
{
|
||||
cntl->impl_type = impl_type;
|
||||
cntl->var_num = var_num;
|
||||
cntl->mult_m = mult_m;
|
||||
cntl->mult_n = mult_n;
|
||||
cntl->mr_def = mr_def;
|
||||
cntl->mr_ext = mr_ext;
|
||||
cntl->nr_def = nr_def;
|
||||
cntl->nr_ext = nr_ext;
|
||||
cntl->does_scale = does_scale;
|
||||
cntl->does_densify = does_densify;
|
||||
cntl->does_invert_diag = does_invert_diag;
|
||||
|
||||
@@ -36,8 +36,10 @@ struct packm_s
|
||||
{
|
||||
impl_t impl_type;
|
||||
varnum_t var_num;
|
||||
blksz_t* mult_m;
|
||||
blksz_t* mult_n;
|
||||
blksz_t* mr_def;
|
||||
blksz_t* mr_ext;
|
||||
blksz_t* nr_def;
|
||||
blksz_t* nr_ext;
|
||||
bool_t does_scale;
|
||||
bool_t does_densify;
|
||||
bool_t does_invert_diag;
|
||||
@@ -48,8 +50,10 @@ struct packm_s
|
||||
};
|
||||
typedef struct packm_s packm_t;
|
||||
|
||||
#define cntl_mult_m( cntl ) cntl->mult_m
|
||||
#define cntl_mult_n( cntl ) cntl->mult_n
|
||||
#define cntl_mr_def( cntl ) cntl->mr_def
|
||||
#define cntl_mr_ext( cntl ) cntl->mr_ext
|
||||
#define cntl_nr_def( cntl ) cntl->nr_def
|
||||
#define cntl_nr_ext( cntl ) cntl->nr_ext
|
||||
|
||||
#define cntl_does_scale( cntl ) cntl->does_scale
|
||||
#define cntl_does_densify( cntl ) cntl->does_densify
|
||||
@@ -71,8 +75,10 @@ void bli_packm_cntl_init( void );
|
||||
void bli_packm_cntl_finalize( void );
|
||||
packm_t* bli_packm_cntl_obj_create( impl_t impl_type,
|
||||
varnum_t var_num,
|
||||
blksz_t* mult_m,
|
||||
blksz_t* mult_n,
|
||||
blksz_t* mr_def,
|
||||
blksz_t* mr_ext,
|
||||
blksz_t* nr_def,
|
||||
blksz_t* nr_ext,
|
||||
bool_t does_scale,
|
||||
bool_t does_densify,
|
||||
bool_t does_invert_diag,
|
||||
@@ -83,8 +89,10 @@ packm_t* bli_packm_cntl_obj_create( impl_t impl_type,
|
||||
void bli_packm_cntl_obj_init( packm_t* cntl,
|
||||
impl_t impl_type,
|
||||
varnum_t var_num,
|
||||
blksz_t* mult_m,
|
||||
blksz_t* mult_n,
|
||||
blksz_t* mr_def,
|
||||
blksz_t* mr_ext,
|
||||
blksz_t* nr_def,
|
||||
blksz_t* nr_ext,
|
||||
bool_t does_scale,
|
||||
bool_t does_densify,
|
||||
bool_t does_invert_diag,
|
||||
|
||||
@@ -50,8 +50,10 @@ void bli_packm_init( obj_t* a,
|
||||
packord_t pack_ord_if_up;
|
||||
packord_t pack_ord_if_lo;
|
||||
packbuf_t pack_buf_type;
|
||||
blksz_t* mult_m;
|
||||
blksz_t* mult_n;
|
||||
blksz_t* mr_def;
|
||||
blksz_t* mr_ext;
|
||||
blksz_t* nr_def;
|
||||
blksz_t* nr_ext;
|
||||
obj_t c;
|
||||
|
||||
// Check parameters.
|
||||
@@ -126,8 +128,10 @@ void bli_packm_init( obj_t* a,
|
||||
needs_densify = cntl_does_densify( cntl );
|
||||
pack_schema = cntl_pack_schema( cntl );
|
||||
pack_buf_type = cntl_pack_buf_type( cntl );
|
||||
mult_m = cntl_mult_m( cntl );
|
||||
mult_n = cntl_mult_n( cntl );
|
||||
mr_def = cntl_mr_def( cntl );
|
||||
mr_ext = cntl_mr_ext( cntl );
|
||||
nr_def = cntl_nr_def( cntl );
|
||||
nr_ext = cntl_nr_ext( cntl );
|
||||
|
||||
if ( cntl_does_invert_diag( cntl ) ) invert_diag = BLIS_INVERT_DIAG;
|
||||
else invert_diag = BLIS_NO_INVERT_DIAG;
|
||||
@@ -145,8 +149,8 @@ void bli_packm_init( obj_t* a,
|
||||
pack_ord_if_up,
|
||||
pack_ord_if_lo,
|
||||
pack_buf_type,
|
||||
mult_m,
|
||||
mult_n,
|
||||
mr_def, mr_ext,
|
||||
nr_def, nr_ext,
|
||||
&c,
|
||||
p );
|
||||
|
||||
@@ -160,8 +164,10 @@ void bli_packm_init_pack( bool_t densify,
|
||||
packord_t pack_ord_if_up,
|
||||
packord_t pack_ord_if_lo,
|
||||
packbuf_t pack_buf_type,
|
||||
blksz_t* mult_m,
|
||||
blksz_t* mult_n,
|
||||
blksz_t* mr_def,
|
||||
blksz_t* mr_ext,
|
||||
blksz_t* nr_def,
|
||||
blksz_t* nr_ext,
|
||||
obj_t* c,
|
||||
obj_t* p )
|
||||
{
|
||||
@@ -169,8 +175,13 @@ void bli_packm_init_pack( bool_t densify,
|
||||
trans_t transc = bli_obj_trans_status( *c );
|
||||
dim_t m_c = bli_obj_length( *c );
|
||||
dim_t n_c = bli_obj_width( *c );
|
||||
dim_t mult_m_dim = bli_blksz_for_type( datatype, mult_m );
|
||||
dim_t mult_n_dim = bli_blksz_for_type( datatype, mult_n );
|
||||
dim_t mr_def_dim = bli_blksz_for_type( datatype, mr_def );
|
||||
dim_t mr_ext_dim = bli_blksz_for_type( datatype, mr_ext );
|
||||
dim_t nr_def_dim = bli_blksz_for_type( datatype, nr_def );
|
||||
dim_t nr_ext_dim = bli_blksz_for_type( datatype, nr_ext );
|
||||
|
||||
dim_t mr_pack_dim = mr_def_dim + mr_ext_dim;
|
||||
dim_t nr_pack_dim = nr_def_dim + nr_ext_dim;
|
||||
|
||||
mem_t* mem_p;
|
||||
dim_t m_p_pad, n_p_pad;
|
||||
@@ -227,13 +238,13 @@ void bli_packm_init_pack( bool_t densify,
|
||||
// in p) and aligning them to the dimension multiples (typically equal
|
||||
// to register blocksizes). This does waste a little bit of space for
|
||||
// level-2 operations, but that's okay with us.
|
||||
m_p_pad = bli_align_dim_to_mult( bli_obj_length( *p ), mult_m_dim );
|
||||
n_p_pad = bli_align_dim_to_mult( bli_obj_width( *p ), mult_n_dim );
|
||||
m_p_pad = bli_align_dim_to_mult( bli_obj_length( *p ), mr_def_dim );
|
||||
n_p_pad = bli_align_dim_to_mult( bli_obj_width( *p ), nr_def_dim );
|
||||
|
||||
// Save the padded dimensions into the packed object. It is important
|
||||
// to save these dimensions since they represent the actual dimensions
|
||||
// of the zero-padded matrix.
|
||||
bli_obj_set_packed_dims( m_p_pad, n_p_pad, *p );
|
||||
bli_obj_set_padded_dims( m_p_pad, n_p_pad, *p );
|
||||
|
||||
// Now we prepare to compute strides, align them, and compute the
|
||||
// total number of bytes needed for the packed buffer. After that,
|
||||
@@ -294,13 +305,13 @@ void bli_packm_init_pack( bool_t densify,
|
||||
dim_t ps_p;
|
||||
|
||||
// The maximum panel length (for each datatype) should be equal to
|
||||
// the m dimension multiple.
|
||||
m_panel = mult_m_dim;
|
||||
// the register blocksize in the m dimension.
|
||||
m_panel = mr_def_dim;
|
||||
|
||||
// The "column stride" of a row panel packed object is interpreted as
|
||||
// the column stride WITHIN a panel. Thus, this is equal to the panel
|
||||
// length.
|
||||
cs_p = m_panel;
|
||||
// dimension plus an extension (which may be zero).
|
||||
cs_p = mr_pack_dim;
|
||||
|
||||
// The "row stride" of a row panel packed object is interpreted
|
||||
// as the row stride WITHIN a panel. Thus, it is unit.
|
||||
@@ -319,8 +330,9 @@ void bli_packm_init_pack( bool_t densify,
|
||||
ps_p = bli_align_dim_to_size( ps_p, elem_size_p,
|
||||
BLIS_CONTIG_STRIDE_ALIGN_SIZE );
|
||||
|
||||
// Store the strides in p.
|
||||
// Store the strides and panel dimension in p.
|
||||
bli_obj_set_incs( rs_p, cs_p, *p );
|
||||
bli_obj_set_panel_dim( m_panel, *p );
|
||||
bli_obj_set_panel_stride( ps_p, *p );
|
||||
|
||||
// Compute the size of the packed buffer.
|
||||
@@ -332,13 +344,13 @@ void bli_packm_init_pack( bool_t densify,
|
||||
dim_t ps_p;
|
||||
|
||||
// The maximum panel width (for each datatype) should be equal to
|
||||
// the n dimension multiple.
|
||||
n_panel = mult_n_dim;
|
||||
// the register blocksize in the n dimension.
|
||||
n_panel = nr_def_dim;
|
||||
|
||||
// The "row stride" of a column panel packed object is interpreted as
|
||||
// the row stride WITHIN a panel. Thus, it is equal to the panel
|
||||
// width.
|
||||
rs_p = n_panel;
|
||||
// the row stride WITHIN a panel. Thus, this is equal to the panel
|
||||
// dimension plus an extension (which may be zero).
|
||||
rs_p = nr_pack_dim;
|
||||
|
||||
// The "column stride" of a column panel packed object is interpreted
|
||||
// as the column stride WITHIN a panel. Thus, it is unit.
|
||||
@@ -357,8 +369,9 @@ void bli_packm_init_pack( bool_t densify,
|
||||
ps_p = bli_align_dim_to_size( ps_p, elem_size_p,
|
||||
BLIS_CONTIG_STRIDE_ALIGN_SIZE );
|
||||
|
||||
// Store the strides in p.
|
||||
// Store the strides and panel dimension in p.
|
||||
bli_obj_set_incs( rs_p, cs_p, *p );
|
||||
bli_obj_set_panel_dim( n_panel, *p );
|
||||
bli_obj_set_panel_stride( ps_p, *p );
|
||||
|
||||
// Compute the size of the packed buffer.
|
||||
|
||||
@@ -42,8 +42,10 @@ void bli_packm_init_pack( bool_t densify,
|
||||
packord_t pack_ord_if_up,
|
||||
packord_t pack_ord_if_lo,
|
||||
packbuf_t pack_buf_type,
|
||||
blksz_t* mult_m,
|
||||
blksz_t* mult_n,
|
||||
blksz_t* mr_def,
|
||||
blksz_t* mr_ext,
|
||||
blksz_t* nr_def,
|
||||
blksz_t* nr_ext,
|
||||
obj_t* c,
|
||||
obj_t* p );
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@ void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
|
||||
// Modify offsets and dimensions of requested partition.
|
||||
bli_obj_set_dims( b, n, *sub_obj );
|
||||
|
||||
// Tweak the packed length of the subpartition to trick the underlying
|
||||
// Tweak the padded length of the subpartition to trick the underlying
|
||||
// implementation into only zero-padding for the narrow submatrix of
|
||||
// interest. Usually, the value we want is b (for non-edge cases), but
|
||||
// at the edges, we want the remainder of the mem_t region in the m
|
||||
@@ -86,13 +86,13 @@ void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
|
||||
// b for the edge iteration). In these cases, we arrive at the new
|
||||
// padded length by simply subtracting off i.
|
||||
{
|
||||
dim_t m_pack_max = bli_obj_packed_length( *sub_obj );
|
||||
dim_t m_pack_max = bli_obj_padded_length( *sub_obj );
|
||||
dim_t m_pack_cur;
|
||||
|
||||
if ( i + b == m ) m_pack_cur = m_pack_max - i;
|
||||
else m_pack_cur = b;
|
||||
|
||||
bli_obj_set_packed_length( m_pack_cur, *sub_obj );
|
||||
bli_obj_set_padded_length( m_pack_cur, *sub_obj );
|
||||
}
|
||||
|
||||
// Translate the desired offsets to a panel offset and adjust the
|
||||
@@ -152,7 +152,7 @@ void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
|
||||
// Modify offsets and dimensions of requested partition.
|
||||
bli_obj_set_dims( m, b, *sub_obj );
|
||||
|
||||
// Tweak the packed width of the subpartition to trick the underlying
|
||||
// Tweak the padded width of the subpartition to trick the underlying
|
||||
// implementation into only zero-padding for the narrow submatrix of
|
||||
// interest. Usually, the value we want is b (for non-edge cases), but
|
||||
// at the edges, we want the remainder of the mem_t region in the n
|
||||
@@ -162,13 +162,13 @@ void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
|
||||
// b for the edge iteration). In these cases, we arrive at the new
|
||||
// padded width by simply subtracting off j.
|
||||
{
|
||||
dim_t n_pack_max = bli_obj_packed_width( *sub_obj );
|
||||
dim_t n_pack_max = bli_obj_padded_width( *sub_obj );
|
||||
dim_t n_pack_cur;
|
||||
|
||||
if ( j + b == n ) n_pack_cur = n_pack_max - j;
|
||||
else n_pack_cur = b;
|
||||
|
||||
bli_obj_set_packed_width( n_pack_cur, *sub_obj );
|
||||
bli_obj_set_padded_width( n_pack_cur, *sub_obj );
|
||||
}
|
||||
|
||||
// Translate the desired offsets to a panel offset and adjust the
|
||||
|
||||
@@ -70,8 +70,8 @@ void bli_packm_unb_var1( obj_t* beta,
|
||||
|
||||
dim_t m_p = bli_obj_length( *p );
|
||||
dim_t n_p = bli_obj_width( *p );
|
||||
dim_t m_max_p = bli_obj_packed_length( *p );
|
||||
dim_t n_max_p = bli_obj_packed_width( *p );
|
||||
dim_t m_max_p = bli_obj_padded_length( *p );
|
||||
dim_t n_max_p = bli_obj_padded_width( *p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
|
||||
@@ -71,8 +71,8 @@ void bli_packm_blk_var1( obj_t* beta,
|
||||
|
||||
dim_t m_p = bli_obj_length( *p );
|
||||
dim_t n_p = bli_obj_width( *p );
|
||||
dim_t m_max_p = bli_obj_packed_length( *p );
|
||||
dim_t n_max_p = bli_obj_packed_width( *p );
|
||||
dim_t m_max_p = bli_obj_padded_length( *p );
|
||||
dim_t n_max_p = bli_obj_padded_width( *p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
|
||||
@@ -54,87 +54,64 @@ blksz_t* gemm_kc;
|
||||
blksz_t* gemm_mr;
|
||||
blksz_t* gemm_nr;
|
||||
blksz_t* gemm_kr;
|
||||
blksz_t* gemm_extmr;
|
||||
blksz_t* gemm_extnr;
|
||||
blksz_t* gemm_extkr;
|
||||
blksz_t* gemm_ni;
|
||||
|
||||
// Cache blocksizes.
|
||||
|
||||
#define BLIS_GEMM_KC_S BLIS_DEFAULT_KC_S
|
||||
#define BLIS_GEMM_KC_D BLIS_DEFAULT_KC_D
|
||||
#define BLIS_GEMM_KC_C BLIS_DEFAULT_KC_C
|
||||
#define BLIS_GEMM_KC_Z BLIS_DEFAULT_KC_Z
|
||||
|
||||
#define BLIS_GEMM_MC_S BLIS_DEFAULT_MC_S
|
||||
#define BLIS_GEMM_MC_D BLIS_DEFAULT_MC_D
|
||||
#define BLIS_GEMM_MC_C BLIS_DEFAULT_MC_C
|
||||
#define BLIS_GEMM_MC_Z BLIS_DEFAULT_MC_Z
|
||||
|
||||
#define BLIS_GEMM_NC_S BLIS_DEFAULT_NC_S
|
||||
#define BLIS_GEMM_NC_D BLIS_DEFAULT_NC_D
|
||||
#define BLIS_GEMM_NC_C BLIS_DEFAULT_NC_C
|
||||
#define BLIS_GEMM_NC_Z BLIS_DEFAULT_NC_Z
|
||||
|
||||
// Register blocking
|
||||
|
||||
#define BLIS_GEMM_KR_S BLIS_DEFAULT_KR_S
|
||||
#define BLIS_GEMM_KR_D BLIS_DEFAULT_KR_D
|
||||
#define BLIS_GEMM_KR_C BLIS_DEFAULT_KR_C
|
||||
#define BLIS_GEMM_KR_Z BLIS_DEFAULT_KR_Z
|
||||
|
||||
#define BLIS_GEMM_MR_S BLIS_DEFAULT_MR_S
|
||||
#define BLIS_GEMM_MR_D BLIS_DEFAULT_MR_D
|
||||
#define BLIS_GEMM_MR_C BLIS_DEFAULT_MR_C
|
||||
#define BLIS_GEMM_MR_Z BLIS_DEFAULT_MR_Z
|
||||
|
||||
#define BLIS_GEMM_NR_S BLIS_DEFAULT_NR_S
|
||||
#define BLIS_GEMM_NR_D BLIS_DEFAULT_NR_D
|
||||
#define BLIS_GEMM_NR_C BLIS_DEFAULT_NR_C
|
||||
#define BLIS_GEMM_NR_Z BLIS_DEFAULT_NR_Z
|
||||
|
||||
// Incremental pack blocking
|
||||
|
||||
#define BLIS_GEMM_NI_S BLIS_DEFAULT_NI_S
|
||||
#define BLIS_GEMM_NI_D BLIS_DEFAULT_NI_D
|
||||
#define BLIS_GEMM_NI_C BLIS_DEFAULT_NI_C
|
||||
#define BLIS_GEMM_NI_Z BLIS_DEFAULT_NI_Z
|
||||
|
||||
|
||||
void bli_gemm_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for each dimension.
|
||||
gemm_mc = bli_blksz_obj_create( BLIS_GEMM_MC_S,
|
||||
BLIS_GEMM_MC_D,
|
||||
BLIS_GEMM_MC_C,
|
||||
BLIS_GEMM_MC_Z );
|
||||
gemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
|
||||
BLIS_DEFAULT_MC_D,
|
||||
BLIS_DEFAULT_MC_C,
|
||||
BLIS_DEFAULT_MC_Z );
|
||||
|
||||
gemm_nc = bli_blksz_obj_create( BLIS_GEMM_NC_S,
|
||||
BLIS_GEMM_NC_D,
|
||||
BLIS_GEMM_NC_C,
|
||||
BLIS_GEMM_NC_Z );
|
||||
gemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
|
||||
BLIS_DEFAULT_NC_D,
|
||||
BLIS_DEFAULT_NC_C,
|
||||
BLIS_DEFAULT_NC_Z );
|
||||
|
||||
gemm_kc = bli_blksz_obj_create( BLIS_GEMM_KC_S,
|
||||
BLIS_GEMM_KC_D,
|
||||
BLIS_GEMM_KC_C,
|
||||
BLIS_GEMM_KC_Z );
|
||||
gemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
|
||||
BLIS_DEFAULT_KC_D,
|
||||
BLIS_DEFAULT_KC_C,
|
||||
BLIS_DEFAULT_KC_Z );
|
||||
|
||||
gemm_mr = bli_blksz_obj_create( BLIS_GEMM_MR_S,
|
||||
BLIS_GEMM_MR_D,
|
||||
BLIS_GEMM_MR_C,
|
||||
BLIS_GEMM_MR_Z );
|
||||
gemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
|
||||
BLIS_DEFAULT_MR_D,
|
||||
BLIS_DEFAULT_MR_C,
|
||||
BLIS_DEFAULT_MR_Z );
|
||||
|
||||
gemm_nr = bli_blksz_obj_create( BLIS_GEMM_NR_S,
|
||||
BLIS_GEMM_NR_D,
|
||||
BLIS_GEMM_NR_C,
|
||||
BLIS_GEMM_NR_Z );
|
||||
gemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
|
||||
BLIS_DEFAULT_NR_D,
|
||||
BLIS_DEFAULT_NR_C,
|
||||
BLIS_DEFAULT_NR_Z );
|
||||
|
||||
gemm_kr = bli_blksz_obj_create( BLIS_GEMM_KR_S,
|
||||
BLIS_GEMM_KR_D,
|
||||
BLIS_GEMM_KR_C,
|
||||
BLIS_GEMM_KR_Z );
|
||||
gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
|
||||
BLIS_DEFAULT_KR_D,
|
||||
BLIS_DEFAULT_KR_C,
|
||||
BLIS_DEFAULT_KR_Z );
|
||||
|
||||
gemm_ni = bli_blksz_obj_create( BLIS_GEMM_NI_S,
|
||||
BLIS_GEMM_NI_D,
|
||||
BLIS_GEMM_NI_C,
|
||||
BLIS_GEMM_NI_Z );
|
||||
gemm_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
|
||||
BLIS_EXTEND_MR_D,
|
||||
BLIS_EXTEND_MR_C,
|
||||
BLIS_EXTEND_MR_Z );
|
||||
|
||||
gemm_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
|
||||
BLIS_EXTEND_NR_D,
|
||||
BLIS_EXTEND_NR_C,
|
||||
BLIS_EXTEND_NR_Z );
|
||||
|
||||
gemm_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
|
||||
BLIS_EXTEND_KR_D,
|
||||
BLIS_EXTEND_KR_C,
|
||||
BLIS_EXTEND_KR_Z );
|
||||
|
||||
gemm_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
|
||||
BLIS_DEFAULT_NI_D,
|
||||
BLIS_DEFAULT_NI_C,
|
||||
BLIS_DEFAULT_NI_Z );
|
||||
|
||||
|
||||
// Create control tree objects for packm operations on a, b, and c.
|
||||
@@ -142,8 +119,8 @@ void bli_gemm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
gemm_mr,
|
||||
gemm_kr,
|
||||
gemm_mr, gemm_extmr,
|
||||
gemm_kr, gemm_extkr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -156,8 +133,8 @@ void bli_gemm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
gemm_kr,
|
||||
gemm_nr,
|
||||
gemm_kr, gemm_extkr,
|
||||
gemm_nr, gemm_extnr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -170,8 +147,8 @@ void bli_gemm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
gemm_mr,
|
||||
gemm_nr,
|
||||
gemm_mr, gemm_extmr,
|
||||
gemm_nr, gemm_extnr,
|
||||
FALSE, // do NOT scale by beta
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -144,7 +144,7 @@ void PASTEMAC(ch,varname)( \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,kc) * \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,nr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
@@ -239,10 +239,15 @@ void PASTEMAC(ch,varname)( \
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
|
||||
@@ -47,22 +47,20 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t m = PASTEMAC(ch,mr); \
|
||||
const dim_t n = PASTEMAC(ch,nr); \
|
||||
\
|
||||
const dim_t m = MR; \
|
||||
const dim_t n = NR; \
|
||||
const inc_t cs_a = PASTEMAC(ch,packmr); \
|
||||
\
|
||||
const inc_t cs_a = MR; \
|
||||
\
|
||||
const inc_t rs_b = NR; \
|
||||
const inc_t rs_b = PASTEMAC(ch,packnr); \
|
||||
\
|
||||
const inc_t rs_ab = 1; \
|
||||
const inc_t cs_ab = MR; \
|
||||
const inc_t cs_ab = PASTEMAC(ch,mr); \
|
||||
\
|
||||
dim_t k0, j0, i0; \
|
||||
\
|
||||
ctype ab[ MR * NR ]; \
|
||||
ctype ab[ PASTEMAC(ch,mr) * \
|
||||
PASTEMAC(ch,nr) ]; \
|
||||
ctype* restrict ab00; \
|
||||
ctype a0; \
|
||||
ctype b0; \
|
||||
|
||||
@@ -54,87 +54,64 @@ blksz_t* hemm_kc;
|
||||
blksz_t* hemm_mr;
|
||||
blksz_t* hemm_nr;
|
||||
blksz_t* hemm_kr;
|
||||
blksz_t* hemm_extmr;
|
||||
blksz_t* hemm_extnr;
|
||||
blksz_t* hemm_extkr;
|
||||
blksz_t* hemm_ni;
|
||||
|
||||
// Cache blocksizes.
|
||||
|
||||
#define BLIS_HEMM_KC_S BLIS_DEFAULT_KC_S
|
||||
#define BLIS_HEMM_KC_D BLIS_DEFAULT_KC_D
|
||||
#define BLIS_HEMM_KC_C BLIS_DEFAULT_KC_C
|
||||
#define BLIS_HEMM_KC_Z BLIS_DEFAULT_KC_Z
|
||||
|
||||
#define BLIS_HEMM_MC_S BLIS_DEFAULT_MC_S
|
||||
#define BLIS_HEMM_MC_D BLIS_DEFAULT_MC_D
|
||||
#define BLIS_HEMM_MC_C BLIS_DEFAULT_MC_C
|
||||
#define BLIS_HEMM_MC_Z BLIS_DEFAULT_MC_Z
|
||||
|
||||
#define BLIS_HEMM_NC_S BLIS_DEFAULT_NC_S
|
||||
#define BLIS_HEMM_NC_D BLIS_DEFAULT_NC_D
|
||||
#define BLIS_HEMM_NC_C BLIS_DEFAULT_NC_C
|
||||
#define BLIS_HEMM_NC_Z BLIS_DEFAULT_NC_Z
|
||||
|
||||
// Register blocking
|
||||
|
||||
#define BLIS_HEMM_KR_S BLIS_DEFAULT_KR_S
|
||||
#define BLIS_HEMM_KR_D BLIS_DEFAULT_KR_D
|
||||
#define BLIS_HEMM_KR_C BLIS_DEFAULT_KR_C
|
||||
#define BLIS_HEMM_KR_Z BLIS_DEFAULT_KR_Z
|
||||
|
||||
#define BLIS_HEMM_MR_S BLIS_DEFAULT_MR_S
|
||||
#define BLIS_HEMM_MR_D BLIS_DEFAULT_MR_D
|
||||
#define BLIS_HEMM_MR_C BLIS_DEFAULT_MR_C
|
||||
#define BLIS_HEMM_MR_Z BLIS_DEFAULT_MR_Z
|
||||
|
||||
#define BLIS_HEMM_NR_S BLIS_DEFAULT_NR_S
|
||||
#define BLIS_HEMM_NR_D BLIS_DEFAULT_NR_D
|
||||
#define BLIS_HEMM_NR_C BLIS_DEFAULT_NR_C
|
||||
#define BLIS_HEMM_NR_Z BLIS_DEFAULT_NR_Z
|
||||
|
||||
// Incremental pack blocking
|
||||
|
||||
#define BLIS_HEMM_NI_S BLIS_DEFAULT_NI_S
|
||||
#define BLIS_HEMM_NI_D BLIS_DEFAULT_NI_D
|
||||
#define BLIS_HEMM_NI_C BLIS_DEFAULT_NI_C
|
||||
#define BLIS_HEMM_NI_Z BLIS_DEFAULT_NI_Z
|
||||
|
||||
|
||||
void bli_hemm_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for each dimension.
|
||||
hemm_mc = bli_blksz_obj_create( BLIS_HEMM_MC_S,
|
||||
BLIS_HEMM_MC_D,
|
||||
BLIS_HEMM_MC_C,
|
||||
BLIS_HEMM_MC_Z );
|
||||
hemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
|
||||
BLIS_DEFAULT_MC_D,
|
||||
BLIS_DEFAULT_MC_C,
|
||||
BLIS_DEFAULT_MC_Z );
|
||||
|
||||
hemm_nc = bli_blksz_obj_create( BLIS_HEMM_NC_S,
|
||||
BLIS_HEMM_NC_D,
|
||||
BLIS_HEMM_NC_C,
|
||||
BLIS_HEMM_NC_Z );
|
||||
hemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
|
||||
BLIS_DEFAULT_NC_D,
|
||||
BLIS_DEFAULT_NC_C,
|
||||
BLIS_DEFAULT_NC_Z );
|
||||
|
||||
hemm_kc = bli_blksz_obj_create( BLIS_HEMM_KC_S,
|
||||
BLIS_HEMM_KC_D,
|
||||
BLIS_HEMM_KC_C,
|
||||
BLIS_HEMM_KC_Z );
|
||||
hemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
|
||||
BLIS_DEFAULT_KC_D,
|
||||
BLIS_DEFAULT_KC_C,
|
||||
BLIS_DEFAULT_KC_Z );
|
||||
|
||||
hemm_mr = bli_blksz_obj_create( BLIS_HEMM_MR_S,
|
||||
BLIS_HEMM_MR_D,
|
||||
BLIS_HEMM_MR_C,
|
||||
BLIS_HEMM_MR_Z );
|
||||
hemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
|
||||
BLIS_DEFAULT_MR_D,
|
||||
BLIS_DEFAULT_MR_C,
|
||||
BLIS_DEFAULT_MR_Z );
|
||||
|
||||
hemm_nr = bli_blksz_obj_create( BLIS_HEMM_NR_S,
|
||||
BLIS_HEMM_NR_D,
|
||||
BLIS_HEMM_NR_C,
|
||||
BLIS_HEMM_NR_Z );
|
||||
hemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
|
||||
BLIS_DEFAULT_NR_D,
|
||||
BLIS_DEFAULT_NR_C,
|
||||
BLIS_DEFAULT_NR_Z );
|
||||
|
||||
hemm_kr = bli_blksz_obj_create( BLIS_HEMM_KR_S,
|
||||
BLIS_HEMM_KR_D,
|
||||
BLIS_HEMM_KR_C,
|
||||
BLIS_HEMM_KR_Z );
|
||||
hemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
|
||||
BLIS_DEFAULT_KR_D,
|
||||
BLIS_DEFAULT_KR_C,
|
||||
BLIS_DEFAULT_KR_Z );
|
||||
|
||||
hemm_ni = bli_blksz_obj_create( BLIS_HEMM_NI_S,
|
||||
BLIS_HEMM_NI_D,
|
||||
BLIS_HEMM_NI_C,
|
||||
BLIS_HEMM_NI_Z );
|
||||
hemm_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
|
||||
BLIS_EXTEND_MR_D,
|
||||
BLIS_EXTEND_MR_C,
|
||||
BLIS_EXTEND_MR_Z );
|
||||
|
||||
hemm_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
|
||||
BLIS_EXTEND_NR_D,
|
||||
BLIS_EXTEND_NR_C,
|
||||
BLIS_EXTEND_NR_Z );
|
||||
|
||||
hemm_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
|
||||
BLIS_EXTEND_KR_D,
|
||||
BLIS_EXTEND_KR_C,
|
||||
BLIS_EXTEND_KR_Z );
|
||||
|
||||
hemm_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
|
||||
BLIS_DEFAULT_NI_D,
|
||||
BLIS_DEFAULT_NI_C,
|
||||
BLIS_DEFAULT_NI_Z );
|
||||
|
||||
|
||||
// Create control tree objects for packm operations on a, b, and c.
|
||||
@@ -142,8 +119,8 @@ void bli_hemm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
hemm_mr,
|
||||
hemm_kr,
|
||||
hemm_mr, hemm_extmr,
|
||||
hemm_kr, hemm_extkr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
TRUE, // densify
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -156,8 +133,8 @@ void bli_hemm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
hemm_kr,
|
||||
hemm_nr,
|
||||
hemm_kr, hemm_extkr,
|
||||
hemm_nr, hemm_extnr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -170,8 +147,8 @@ void bli_hemm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
hemm_mr,
|
||||
hemm_nr,
|
||||
hemm_mr, hemm_extmr,
|
||||
hemm_nr, hemm_extnr,
|
||||
FALSE, // do NOT scale by beta
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -55,87 +55,64 @@ blksz_t* her2k_kc;
|
||||
blksz_t* her2k_mr;
|
||||
blksz_t* her2k_nr;
|
||||
blksz_t* her2k_kr;
|
||||
blksz_t* her2k_extmr;
|
||||
blksz_t* her2k_extnr;
|
||||
blksz_t* her2k_extkr;
|
||||
blksz_t* her2k_ni;
|
||||
|
||||
// Cache blocksizes.
|
||||
|
||||
#define BLIS_HER2K_KC_S BLIS_DEFAULT_KC_S
|
||||
#define BLIS_HER2K_KC_D BLIS_DEFAULT_KC_D
|
||||
#define BLIS_HER2K_KC_C BLIS_DEFAULT_KC_C
|
||||
#define BLIS_HER2K_KC_Z BLIS_DEFAULT_KC_Z
|
||||
|
||||
#define BLIS_HER2K_MC_S BLIS_DEFAULT_MC_S
|
||||
#define BLIS_HER2K_MC_D BLIS_DEFAULT_MC_D
|
||||
#define BLIS_HER2K_MC_C BLIS_DEFAULT_MC_C
|
||||
#define BLIS_HER2K_MC_Z BLIS_DEFAULT_MC_Z
|
||||
|
||||
#define BLIS_HER2K_NC_S BLIS_DEFAULT_NC_S
|
||||
#define BLIS_HER2K_NC_D BLIS_DEFAULT_NC_D
|
||||
#define BLIS_HER2K_NC_C BLIS_DEFAULT_NC_C
|
||||
#define BLIS_HER2K_NC_Z BLIS_DEFAULT_NC_Z
|
||||
|
||||
// Register blocking
|
||||
|
||||
#define BLIS_HER2K_KR_S BLIS_DEFAULT_KR_S
|
||||
#define BLIS_HER2K_KR_D BLIS_DEFAULT_KR_D
|
||||
#define BLIS_HER2K_KR_C BLIS_DEFAULT_KR_C
|
||||
#define BLIS_HER2K_KR_Z BLIS_DEFAULT_KR_Z
|
||||
|
||||
#define BLIS_HER2K_MR_S BLIS_DEFAULT_MR_S
|
||||
#define BLIS_HER2K_MR_D BLIS_DEFAULT_MR_D
|
||||
#define BLIS_HER2K_MR_C BLIS_DEFAULT_MR_C
|
||||
#define BLIS_HER2K_MR_Z BLIS_DEFAULT_MR_Z
|
||||
|
||||
#define BLIS_HER2K_NR_S BLIS_DEFAULT_NR_S
|
||||
#define BLIS_HER2K_NR_D BLIS_DEFAULT_NR_D
|
||||
#define BLIS_HER2K_NR_C BLIS_DEFAULT_NR_C
|
||||
#define BLIS_HER2K_NR_Z BLIS_DEFAULT_NR_Z
|
||||
|
||||
// Incremental pack blocking
|
||||
|
||||
#define BLIS_HER2K_NI_S BLIS_DEFAULT_NI_S
|
||||
#define BLIS_HER2K_NI_D BLIS_DEFAULT_NI_D
|
||||
#define BLIS_HER2K_NI_C BLIS_DEFAULT_NI_C
|
||||
#define BLIS_HER2K_NI_Z BLIS_DEFAULT_NI_Z
|
||||
|
||||
|
||||
void bli_her2k_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for each dimension.
|
||||
her2k_mc = bli_blksz_obj_create( BLIS_HER2K_MC_S,
|
||||
BLIS_HER2K_MC_D,
|
||||
BLIS_HER2K_MC_C,
|
||||
BLIS_HER2K_MC_Z );
|
||||
her2k_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
|
||||
BLIS_DEFAULT_MC_D,
|
||||
BLIS_DEFAULT_MC_C,
|
||||
BLIS_DEFAULT_MC_Z );
|
||||
|
||||
her2k_nc = bli_blksz_obj_create( BLIS_HER2K_NC_S,
|
||||
BLIS_HER2K_NC_D,
|
||||
BLIS_HER2K_NC_C,
|
||||
BLIS_HER2K_NC_Z );
|
||||
her2k_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
|
||||
BLIS_DEFAULT_NC_D,
|
||||
BLIS_DEFAULT_NC_C,
|
||||
BLIS_DEFAULT_NC_Z );
|
||||
|
||||
her2k_kc = bli_blksz_obj_create( BLIS_HER2K_KC_S,
|
||||
BLIS_HER2K_KC_D,
|
||||
BLIS_HER2K_KC_C,
|
||||
BLIS_HER2K_KC_Z );
|
||||
her2k_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
|
||||
BLIS_DEFAULT_KC_D,
|
||||
BLIS_DEFAULT_KC_C,
|
||||
BLIS_DEFAULT_KC_Z );
|
||||
|
||||
her2k_mr = bli_blksz_obj_create( BLIS_HER2K_MR_S,
|
||||
BLIS_HER2K_MR_D,
|
||||
BLIS_HER2K_MR_C,
|
||||
BLIS_HER2K_MR_Z );
|
||||
her2k_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
|
||||
BLIS_DEFAULT_MR_D,
|
||||
BLIS_DEFAULT_MR_C,
|
||||
BLIS_DEFAULT_MR_Z );
|
||||
|
||||
her2k_nr = bli_blksz_obj_create( BLIS_HER2K_NR_S,
|
||||
BLIS_HER2K_NR_D,
|
||||
BLIS_HER2K_NR_C,
|
||||
BLIS_HER2K_NR_Z );
|
||||
her2k_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
|
||||
BLIS_DEFAULT_NR_D,
|
||||
BLIS_DEFAULT_NR_C,
|
||||
BLIS_DEFAULT_NR_Z );
|
||||
|
||||
her2k_kr = bli_blksz_obj_create( BLIS_HER2K_KR_S,
|
||||
BLIS_HER2K_KR_D,
|
||||
BLIS_HER2K_KR_C,
|
||||
BLIS_HER2K_KR_Z );
|
||||
her2k_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
|
||||
BLIS_DEFAULT_KR_D,
|
||||
BLIS_DEFAULT_KR_C,
|
||||
BLIS_DEFAULT_KR_Z );
|
||||
|
||||
her2k_ni = bli_blksz_obj_create( BLIS_HER2K_NI_S,
|
||||
BLIS_HER2K_NI_D,
|
||||
BLIS_HER2K_NI_C,
|
||||
BLIS_HER2K_NI_Z );
|
||||
her2k_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
|
||||
BLIS_EXTEND_MR_D,
|
||||
BLIS_EXTEND_MR_C,
|
||||
BLIS_EXTEND_MR_Z );
|
||||
|
||||
her2k_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
|
||||
BLIS_EXTEND_NR_D,
|
||||
BLIS_EXTEND_NR_C,
|
||||
BLIS_EXTEND_NR_Z );
|
||||
|
||||
her2k_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
|
||||
BLIS_EXTEND_KR_D,
|
||||
BLIS_EXTEND_KR_C,
|
||||
BLIS_EXTEND_KR_Z );
|
||||
|
||||
her2k_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
|
||||
BLIS_DEFAULT_NI_D,
|
||||
BLIS_DEFAULT_NI_C,
|
||||
BLIS_DEFAULT_NI_Z );
|
||||
|
||||
|
||||
// Create control tree objects for packm operations on a, b, and c.
|
||||
@@ -143,8 +120,8 @@ void bli_her2k_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
her2k_mr,
|
||||
her2k_kr,
|
||||
her2k_mr, her2k_extmr,
|
||||
her2k_kr, her2k_extkr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -157,8 +134,8 @@ void bli_her2k_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
her2k_kr,
|
||||
her2k_nr,
|
||||
her2k_kr, her2k_extkr,
|
||||
her2k_nr, her2k_extnr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -171,8 +148,8 @@ void bli_her2k_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
her2k_mr,
|
||||
her2k_nr,
|
||||
her2k_mr, her2k_extmr,
|
||||
her2k_nr, her2k_extnr,
|
||||
FALSE, // do NOT scale by beta
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -54,87 +54,64 @@ blksz_t* herk_kc;
|
||||
blksz_t* herk_mr;
|
||||
blksz_t* herk_nr;
|
||||
blksz_t* herk_kr;
|
||||
blksz_t* herk_extmr;
|
||||
blksz_t* herk_extnr;
|
||||
blksz_t* herk_extkr;
|
||||
blksz_t* herk_ni;
|
||||
|
||||
// Cache blocksizes.
|
||||
|
||||
#define BLIS_HERK_KC_S BLIS_DEFAULT_KC_S
|
||||
#define BLIS_HERK_KC_D BLIS_DEFAULT_KC_D
|
||||
#define BLIS_HERK_KC_C BLIS_DEFAULT_KC_C
|
||||
#define BLIS_HERK_KC_Z BLIS_DEFAULT_KC_Z
|
||||
|
||||
#define BLIS_HERK_MC_S BLIS_DEFAULT_MC_S
|
||||
#define BLIS_HERK_MC_D BLIS_DEFAULT_MC_D
|
||||
#define BLIS_HERK_MC_C BLIS_DEFAULT_MC_C
|
||||
#define BLIS_HERK_MC_Z BLIS_DEFAULT_MC_Z
|
||||
|
||||
#define BLIS_HERK_NC_S BLIS_DEFAULT_NC_S
|
||||
#define BLIS_HERK_NC_D BLIS_DEFAULT_NC_D
|
||||
#define BLIS_HERK_NC_C BLIS_DEFAULT_NC_C
|
||||
#define BLIS_HERK_NC_Z BLIS_DEFAULT_NC_Z
|
||||
|
||||
// Register blocking
|
||||
|
||||
#define BLIS_HERK_KR_S BLIS_DEFAULT_KR_S
|
||||
#define BLIS_HERK_KR_D BLIS_DEFAULT_KR_D
|
||||
#define BLIS_HERK_KR_C BLIS_DEFAULT_KR_C
|
||||
#define BLIS_HERK_KR_Z BLIS_DEFAULT_KR_Z
|
||||
|
||||
#define BLIS_HERK_MR_S BLIS_DEFAULT_MR_S
|
||||
#define BLIS_HERK_MR_D BLIS_DEFAULT_MR_D
|
||||
#define BLIS_HERK_MR_C BLIS_DEFAULT_MR_C
|
||||
#define BLIS_HERK_MR_Z BLIS_DEFAULT_MR_Z
|
||||
|
||||
#define BLIS_HERK_NR_S BLIS_DEFAULT_NR_S
|
||||
#define BLIS_HERK_NR_D BLIS_DEFAULT_NR_D
|
||||
#define BLIS_HERK_NR_C BLIS_DEFAULT_NR_C
|
||||
#define BLIS_HERK_NR_Z BLIS_DEFAULT_NR_Z
|
||||
|
||||
// Incremental pack blocking
|
||||
|
||||
#define BLIS_HERK_NI_S BLIS_DEFAULT_NI_S
|
||||
#define BLIS_HERK_NI_D BLIS_DEFAULT_NI_D
|
||||
#define BLIS_HERK_NI_C BLIS_DEFAULT_NI_C
|
||||
#define BLIS_HERK_NI_Z BLIS_DEFAULT_NI_Z
|
||||
|
||||
|
||||
void bli_herk_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for each dimension.
|
||||
herk_mc = bli_blksz_obj_create( BLIS_HERK_MC_S,
|
||||
BLIS_HERK_MC_D,
|
||||
BLIS_HERK_MC_C,
|
||||
BLIS_HERK_MC_Z );
|
||||
herk_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
|
||||
BLIS_DEFAULT_MC_D,
|
||||
BLIS_DEFAULT_MC_C,
|
||||
BLIS_DEFAULT_MC_Z );
|
||||
|
||||
herk_nc = bli_blksz_obj_create( BLIS_HERK_NC_S,
|
||||
BLIS_HERK_NC_D,
|
||||
BLIS_HERK_NC_C,
|
||||
BLIS_HERK_NC_Z );
|
||||
herk_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
|
||||
BLIS_DEFAULT_NC_D,
|
||||
BLIS_DEFAULT_NC_C,
|
||||
BLIS_DEFAULT_NC_Z );
|
||||
|
||||
herk_kc = bli_blksz_obj_create( BLIS_HERK_KC_S,
|
||||
BLIS_HERK_KC_D,
|
||||
BLIS_HERK_KC_C,
|
||||
BLIS_HERK_KC_Z );
|
||||
herk_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
|
||||
BLIS_DEFAULT_KC_D,
|
||||
BLIS_DEFAULT_KC_C,
|
||||
BLIS_DEFAULT_KC_Z );
|
||||
|
||||
herk_mr = bli_blksz_obj_create( BLIS_HERK_MR_S,
|
||||
BLIS_HERK_MR_D,
|
||||
BLIS_HERK_MR_C,
|
||||
BLIS_HERK_MR_Z );
|
||||
herk_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
|
||||
BLIS_DEFAULT_MR_D,
|
||||
BLIS_DEFAULT_MR_C,
|
||||
BLIS_DEFAULT_MR_Z );
|
||||
|
||||
herk_nr = bli_blksz_obj_create( BLIS_HERK_NR_S,
|
||||
BLIS_HERK_NR_D,
|
||||
BLIS_HERK_NR_C,
|
||||
BLIS_HERK_NR_Z );
|
||||
herk_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
|
||||
BLIS_DEFAULT_NR_D,
|
||||
BLIS_DEFAULT_NR_C,
|
||||
BLIS_DEFAULT_NR_Z );
|
||||
|
||||
herk_kr = bli_blksz_obj_create( BLIS_HERK_KR_S,
|
||||
BLIS_HERK_KR_D,
|
||||
BLIS_HERK_KR_C,
|
||||
BLIS_HERK_KR_Z );
|
||||
herk_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
|
||||
BLIS_DEFAULT_KR_D,
|
||||
BLIS_DEFAULT_KR_C,
|
||||
BLIS_DEFAULT_KR_Z );
|
||||
|
||||
herk_ni = bli_blksz_obj_create( BLIS_HERK_NI_S,
|
||||
BLIS_HERK_NI_D,
|
||||
BLIS_HERK_NI_C,
|
||||
BLIS_HERK_NI_Z );
|
||||
herk_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
|
||||
BLIS_EXTEND_MR_D,
|
||||
BLIS_EXTEND_MR_C,
|
||||
BLIS_EXTEND_MR_Z );
|
||||
|
||||
herk_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
|
||||
BLIS_EXTEND_NR_D,
|
||||
BLIS_EXTEND_NR_C,
|
||||
BLIS_EXTEND_NR_Z );
|
||||
|
||||
herk_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
|
||||
BLIS_EXTEND_KR_D,
|
||||
BLIS_EXTEND_KR_C,
|
||||
BLIS_EXTEND_KR_Z );
|
||||
|
||||
herk_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
|
||||
BLIS_DEFAULT_NI_D,
|
||||
BLIS_DEFAULT_NI_C,
|
||||
BLIS_DEFAULT_NI_Z );
|
||||
|
||||
|
||||
// Create control tree objects for packm operations on a, b, and c.
|
||||
@@ -142,8 +119,8 @@ void bli_herk_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
herk_mr,
|
||||
herk_kr,
|
||||
herk_mr, herk_extmr,
|
||||
herk_kr, herk_extkr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -156,8 +133,8 @@ void bli_herk_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
herk_kr,
|
||||
herk_nr,
|
||||
herk_kr, herk_extkr,
|
||||
herk_nr, herk_extnr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -170,8 +147,8 @@ void bli_herk_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
herk_mr,
|
||||
herk_nr,
|
||||
herk_mr, herk_extmr,
|
||||
herk_nr, herk_extnr,
|
||||
FALSE, // do NOT scale by beta
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -148,7 +148,7 @@ void PASTEMAC(ch,varname)( \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,kc) * \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,nr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
|
||||
@@ -148,7 +148,7 @@ void PASTEMAC(ch,varname)( \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,kc) * \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,nr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
|
||||
@@ -55,87 +55,64 @@ blksz_t* trmm_kc;
|
||||
blksz_t* trmm_mr;
|
||||
blksz_t* trmm_nr;
|
||||
blksz_t* trmm_kr;
|
||||
blksz_t* trmm_extmr;
|
||||
blksz_t* trmm_extnr;
|
||||
blksz_t* trmm_extkr;
|
||||
blksz_t* trmm_ni;
|
||||
|
||||
// Cache blocksizes.
|
||||
|
||||
#define BLIS_TRMM_KC_S BLIS_DEFAULT_KC_S
|
||||
#define BLIS_TRMM_KC_D BLIS_DEFAULT_KC_D
|
||||
#define BLIS_TRMM_KC_C BLIS_DEFAULT_KC_C
|
||||
#define BLIS_TRMM_KC_Z BLIS_DEFAULT_KC_Z
|
||||
|
||||
#define BLIS_TRMM_MC_S BLIS_DEFAULT_MC_S
|
||||
#define BLIS_TRMM_MC_D BLIS_DEFAULT_MC_D
|
||||
#define BLIS_TRMM_MC_C BLIS_DEFAULT_MC_C
|
||||
#define BLIS_TRMM_MC_Z BLIS_DEFAULT_MC_Z
|
||||
|
||||
#define BLIS_TRMM_NC_S BLIS_DEFAULT_NC_S
|
||||
#define BLIS_TRMM_NC_D BLIS_DEFAULT_NC_D
|
||||
#define BLIS_TRMM_NC_C BLIS_DEFAULT_NC_C
|
||||
#define BLIS_TRMM_NC_Z BLIS_DEFAULT_NC_Z
|
||||
|
||||
// Register blocking
|
||||
|
||||
#define BLIS_TRMM_KR_S BLIS_DEFAULT_KR_S
|
||||
#define BLIS_TRMM_KR_D BLIS_DEFAULT_KR_D
|
||||
#define BLIS_TRMM_KR_C BLIS_DEFAULT_KR_C
|
||||
#define BLIS_TRMM_KR_Z BLIS_DEFAULT_KR_Z
|
||||
|
||||
#define BLIS_TRMM_MR_S BLIS_DEFAULT_MR_S
|
||||
#define BLIS_TRMM_MR_D BLIS_DEFAULT_MR_D
|
||||
#define BLIS_TRMM_MR_C BLIS_DEFAULT_MR_C
|
||||
#define BLIS_TRMM_MR_Z BLIS_DEFAULT_MR_Z
|
||||
|
||||
#define BLIS_TRMM_NR_S BLIS_DEFAULT_NR_S
|
||||
#define BLIS_TRMM_NR_D BLIS_DEFAULT_NR_D
|
||||
#define BLIS_TRMM_NR_C BLIS_DEFAULT_NR_C
|
||||
#define BLIS_TRMM_NR_Z BLIS_DEFAULT_NR_Z
|
||||
|
||||
// Incremental pack blocking
|
||||
|
||||
#define BLIS_TRMM_NI_S BLIS_DEFAULT_NI_S
|
||||
#define BLIS_TRMM_NI_D BLIS_DEFAULT_NI_D
|
||||
#define BLIS_TRMM_NI_C BLIS_DEFAULT_NI_C
|
||||
#define BLIS_TRMM_NI_Z BLIS_DEFAULT_NI_Z
|
||||
|
||||
|
||||
void bli_trmm_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for each dimension.
|
||||
trmm_mc = bli_blksz_obj_create( BLIS_TRMM_MC_S,
|
||||
BLIS_TRMM_MC_D,
|
||||
BLIS_TRMM_MC_C,
|
||||
BLIS_TRMM_MC_Z );
|
||||
trmm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
|
||||
BLIS_DEFAULT_MC_D,
|
||||
BLIS_DEFAULT_MC_C,
|
||||
BLIS_DEFAULT_MC_Z );
|
||||
|
||||
trmm_nc = bli_blksz_obj_create( BLIS_TRMM_NC_S,
|
||||
BLIS_TRMM_NC_D,
|
||||
BLIS_TRMM_NC_C,
|
||||
BLIS_TRMM_NC_Z );
|
||||
trmm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
|
||||
BLIS_DEFAULT_NC_D,
|
||||
BLIS_DEFAULT_NC_C,
|
||||
BLIS_DEFAULT_NC_Z );
|
||||
|
||||
trmm_kc = bli_blksz_obj_create( BLIS_TRMM_KC_S,
|
||||
BLIS_TRMM_KC_D,
|
||||
BLIS_TRMM_KC_C,
|
||||
BLIS_TRMM_KC_Z );
|
||||
trmm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
|
||||
BLIS_DEFAULT_KC_D,
|
||||
BLIS_DEFAULT_KC_C,
|
||||
BLIS_DEFAULT_KC_Z );
|
||||
|
||||
trmm_mr = bli_blksz_obj_create( BLIS_TRMM_MR_S,
|
||||
BLIS_TRMM_MR_D,
|
||||
BLIS_TRMM_MR_C,
|
||||
BLIS_TRMM_MR_Z );
|
||||
trmm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
|
||||
BLIS_DEFAULT_MR_D,
|
||||
BLIS_DEFAULT_MR_C,
|
||||
BLIS_DEFAULT_MR_Z );
|
||||
|
||||
trmm_nr = bli_blksz_obj_create( BLIS_TRMM_NR_S,
|
||||
BLIS_TRMM_NR_D,
|
||||
BLIS_TRMM_NR_C,
|
||||
BLIS_TRMM_NR_Z );
|
||||
trmm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
|
||||
BLIS_DEFAULT_NR_D,
|
||||
BLIS_DEFAULT_NR_C,
|
||||
BLIS_DEFAULT_NR_Z );
|
||||
|
||||
trmm_kr = bli_blksz_obj_create( BLIS_TRMM_KR_S,
|
||||
BLIS_TRMM_KR_D,
|
||||
BLIS_TRMM_KR_C,
|
||||
BLIS_TRMM_KR_Z );
|
||||
trmm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
|
||||
BLIS_DEFAULT_KR_D,
|
||||
BLIS_DEFAULT_KR_C,
|
||||
BLIS_DEFAULT_KR_Z );
|
||||
|
||||
trmm_ni = bli_blksz_obj_create( BLIS_TRMM_NI_S,
|
||||
BLIS_TRMM_NI_D,
|
||||
BLIS_TRMM_NI_C,
|
||||
BLIS_TRMM_NI_Z );
|
||||
trmm_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
|
||||
BLIS_EXTEND_MR_D,
|
||||
BLIS_EXTEND_MR_C,
|
||||
BLIS_EXTEND_MR_Z );
|
||||
|
||||
trmm_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
|
||||
BLIS_EXTEND_NR_D,
|
||||
BLIS_EXTEND_NR_C,
|
||||
BLIS_EXTEND_NR_Z );
|
||||
|
||||
trmm_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
|
||||
BLIS_EXTEND_KR_D,
|
||||
BLIS_EXTEND_KR_C,
|
||||
BLIS_EXTEND_KR_Z );
|
||||
|
||||
trmm_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
|
||||
BLIS_DEFAULT_NI_D,
|
||||
BLIS_DEFAULT_NI_C,
|
||||
BLIS_DEFAULT_NI_Z );
|
||||
|
||||
|
||||
// Create control tree objects for packm operations on a, b, and c.
|
||||
@@ -143,8 +120,10 @@ void bli_trmm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3, // pack panels of A compactly
|
||||
trmm_mr, // IMPORTANT: for consistency with trsm, "k" dim
|
||||
trmm_mr, // multiple is set to mr.
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to mr.
|
||||
trmm_mr, trmm_extmr,
|
||||
trmm_mr, trmm_extmr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
TRUE, // densify
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -157,8 +136,10 @@ void bli_trmm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
trmm_mr, // IMPORTANT: m dim multiple here must be mr
|
||||
trmm_nr, // since "k" dim multiple is set to mr above.
|
||||
// IMPORTANT: m dim multiple here must be mr
|
||||
// since "k" dim multiple is set to mr above.
|
||||
trmm_mr, trmm_extmr,
|
||||
trmm_nr, trmm_extnr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -171,8 +152,8 @@ void bli_trmm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
trmm_mr,
|
||||
trmm_nr,
|
||||
trmm_mr, trmm_extmr,
|
||||
trmm_nr, trmm_extnr,
|
||||
FALSE, // do NOT scale by beta
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -149,7 +149,7 @@ void PASTEMAC(ch,varname)( \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,kc) * \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,nr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
@@ -165,6 +165,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Alias some constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t PACKMR = PASTEMAC(ch,packmr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
@@ -252,7 +253,7 @@ void PASTEMAC(ch,varname)( \
|
||||
k_nr = k_a1011 * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = k * MR; \
|
||||
rstep_a = k * PACKMR; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
@@ -334,7 +335,7 @@ void PASTEMAC(ch,varname)( \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += k_a1011 * MR; \
|
||||
a1 += k_a1011 * PACKMR; \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
|
||||
@@ -149,7 +149,7 @@ void PASTEMAC(ch,varname)( \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,kc) * \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,nr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
@@ -165,6 +165,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Alias some constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t PACKMR = PASTEMAC(ch,packmr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
@@ -252,7 +253,7 @@ void PASTEMAC(ch,varname)( \
|
||||
k_nr = k_a1112 * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = k * MR; \
|
||||
rstep_a = k * PACKMR; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
@@ -337,7 +338,7 @@ void PASTEMAC(ch,varname)( \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += k_a1112 * MR; \
|
||||
a1 += k_a1112 * PACKMR; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
|
||||
@@ -55,87 +55,64 @@ blksz_t* trmm3_kc;
|
||||
blksz_t* trmm3_mr;
|
||||
blksz_t* trmm3_nr;
|
||||
blksz_t* trmm3_kr;
|
||||
blksz_t* trmm3_extmr;
|
||||
blksz_t* trmm3_extnr;
|
||||
blksz_t* trmm3_extkr;
|
||||
blksz_t* trmm3_ni;
|
||||
|
||||
// Cache blocksizes.
|
||||
|
||||
#define BLIS_TRMM3_KC_S BLIS_DEFAULT_KC_S
|
||||
#define BLIS_TRMM3_KC_D BLIS_DEFAULT_KC_D
|
||||
#define BLIS_TRMM3_KC_C BLIS_DEFAULT_KC_C
|
||||
#define BLIS_TRMM3_KC_Z BLIS_DEFAULT_KC_Z
|
||||
|
||||
#define BLIS_TRMM3_MC_S BLIS_DEFAULT_MC_S
|
||||
#define BLIS_TRMM3_MC_D BLIS_DEFAULT_MC_D
|
||||
#define BLIS_TRMM3_MC_C BLIS_DEFAULT_MC_C
|
||||
#define BLIS_TRMM3_MC_Z BLIS_DEFAULT_MC_Z
|
||||
|
||||
#define BLIS_TRMM3_NC_S BLIS_DEFAULT_NC_S
|
||||
#define BLIS_TRMM3_NC_D BLIS_DEFAULT_NC_D
|
||||
#define BLIS_TRMM3_NC_C BLIS_DEFAULT_NC_C
|
||||
#define BLIS_TRMM3_NC_Z BLIS_DEFAULT_NC_Z
|
||||
|
||||
// Register blocking
|
||||
|
||||
#define BLIS_TRMM3_KR_S BLIS_DEFAULT_KR_S
|
||||
#define BLIS_TRMM3_KR_D BLIS_DEFAULT_KR_D
|
||||
#define BLIS_TRMM3_KR_C BLIS_DEFAULT_KR_C
|
||||
#define BLIS_TRMM3_KR_Z BLIS_DEFAULT_KR_Z
|
||||
|
||||
#define BLIS_TRMM3_MR_S BLIS_DEFAULT_MR_S
|
||||
#define BLIS_TRMM3_MR_D BLIS_DEFAULT_MR_D
|
||||
#define BLIS_TRMM3_MR_C BLIS_DEFAULT_MR_C
|
||||
#define BLIS_TRMM3_MR_Z BLIS_DEFAULT_MR_Z
|
||||
|
||||
#define BLIS_TRMM3_NR_S BLIS_DEFAULT_NR_S
|
||||
#define BLIS_TRMM3_NR_D BLIS_DEFAULT_NR_D
|
||||
#define BLIS_TRMM3_NR_C BLIS_DEFAULT_NR_C
|
||||
#define BLIS_TRMM3_NR_Z BLIS_DEFAULT_NR_Z
|
||||
|
||||
// Incremental pack blocking
|
||||
|
||||
#define BLIS_TRMM3_NI_S BLIS_DEFAULT_NI_S
|
||||
#define BLIS_TRMM3_NI_D BLIS_DEFAULT_NI_D
|
||||
#define BLIS_TRMM3_NI_C BLIS_DEFAULT_NI_C
|
||||
#define BLIS_TRMM3_NI_Z BLIS_DEFAULT_NI_Z
|
||||
|
||||
|
||||
void bli_trmm3_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for each dimension.
|
||||
trmm3_mc = bli_blksz_obj_create( BLIS_TRMM3_MC_S,
|
||||
BLIS_TRMM3_MC_D,
|
||||
BLIS_TRMM3_MC_C,
|
||||
BLIS_TRMM3_MC_Z );
|
||||
trmm3_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
|
||||
BLIS_DEFAULT_MC_D,
|
||||
BLIS_DEFAULT_MC_C,
|
||||
BLIS_DEFAULT_MC_Z );
|
||||
|
||||
trmm3_nc = bli_blksz_obj_create( BLIS_TRMM3_NC_S,
|
||||
BLIS_TRMM3_NC_D,
|
||||
BLIS_TRMM3_NC_C,
|
||||
BLIS_TRMM3_NC_Z );
|
||||
trmm3_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
|
||||
BLIS_DEFAULT_NC_D,
|
||||
BLIS_DEFAULT_NC_C,
|
||||
BLIS_DEFAULT_NC_Z );
|
||||
|
||||
trmm3_kc = bli_blksz_obj_create( BLIS_TRMM3_KC_S,
|
||||
BLIS_TRMM3_KC_D,
|
||||
BLIS_TRMM3_KC_C,
|
||||
BLIS_TRMM3_KC_Z );
|
||||
trmm3_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
|
||||
BLIS_DEFAULT_KC_D,
|
||||
BLIS_DEFAULT_KC_C,
|
||||
BLIS_DEFAULT_KC_Z );
|
||||
|
||||
trmm3_mr = bli_blksz_obj_create( BLIS_TRMM3_MR_S,
|
||||
BLIS_TRMM3_MR_D,
|
||||
BLIS_TRMM3_MR_C,
|
||||
BLIS_TRMM3_MR_Z );
|
||||
trmm3_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
|
||||
BLIS_DEFAULT_MR_D,
|
||||
BLIS_DEFAULT_MR_C,
|
||||
BLIS_DEFAULT_MR_Z );
|
||||
|
||||
trmm3_nr = bli_blksz_obj_create( BLIS_TRMM3_NR_S,
|
||||
BLIS_TRMM3_NR_D,
|
||||
BLIS_TRMM3_NR_C,
|
||||
BLIS_TRMM3_NR_Z );
|
||||
trmm3_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
|
||||
BLIS_DEFAULT_NR_D,
|
||||
BLIS_DEFAULT_NR_C,
|
||||
BLIS_DEFAULT_NR_Z );
|
||||
|
||||
trmm3_kr = bli_blksz_obj_create( BLIS_TRMM3_KR_S,
|
||||
BLIS_TRMM3_KR_D,
|
||||
BLIS_TRMM3_KR_C,
|
||||
BLIS_TRMM3_KR_Z );
|
||||
trmm3_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
|
||||
BLIS_DEFAULT_KR_D,
|
||||
BLIS_DEFAULT_KR_C,
|
||||
BLIS_DEFAULT_KR_Z );
|
||||
|
||||
trmm3_ni = bli_blksz_obj_create( BLIS_TRMM3_NI_S,
|
||||
BLIS_TRMM3_NI_D,
|
||||
BLIS_TRMM3_NI_C,
|
||||
BLIS_TRMM3_NI_Z );
|
||||
trmm3_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
|
||||
BLIS_EXTEND_MR_D,
|
||||
BLIS_EXTEND_MR_C,
|
||||
BLIS_EXTEND_MR_Z );
|
||||
|
||||
trmm3_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
|
||||
BLIS_EXTEND_NR_D,
|
||||
BLIS_EXTEND_NR_C,
|
||||
BLIS_EXTEND_NR_Z );
|
||||
|
||||
trmm3_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
|
||||
BLIS_EXTEND_KR_D,
|
||||
BLIS_EXTEND_KR_C,
|
||||
BLIS_EXTEND_KR_Z );
|
||||
|
||||
trmm3_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
|
||||
BLIS_DEFAULT_NI_D,
|
||||
BLIS_DEFAULT_NI_C,
|
||||
BLIS_DEFAULT_NI_Z );
|
||||
|
||||
|
||||
// Create control tree objects for packm operations on a, b, and c.
|
||||
@@ -143,8 +120,10 @@ void bli_trmm3_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3, // pack panels of A compactly
|
||||
trmm3_mr, // IMPORTANT: for consistency with trsm, "k" dim
|
||||
trmm3_mr, // multiple is set to mr.
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to mr.
|
||||
trmm3_mr, trmm3_extmr,
|
||||
trmm3_mr, trmm3_extmr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
TRUE, // densify
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -157,8 +136,10 @@ void bli_trmm3_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
trmm3_mr, // IMPORTANT: m dim multiple here must be mr
|
||||
trmm3_nr, // since "k" dim multiple is set to mr above.
|
||||
// IMPORTANT: m dim multiple here must be mr
|
||||
// since "k" dim multiple is set to mr above.
|
||||
trmm3_mr, trmm3_extmr,
|
||||
trmm3_nr, trmm3_extnr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -171,8 +152,8 @@ void bli_trmm3_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
trmm3_mr,
|
||||
trmm3_nr,
|
||||
trmm3_mr, trmm3_extmr,
|
||||
trmm3_nr, trmm3_extnr,
|
||||
FALSE, // do NOT scale by beta
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -55,87 +55,64 @@ blksz_t* trsm_kc;
|
||||
blksz_t* trsm_mr;
|
||||
blksz_t* trsm_nr;
|
||||
blksz_t* trsm_kr;
|
||||
blksz_t* trsm_extmr;
|
||||
blksz_t* trsm_extnr;
|
||||
blksz_t* trsm_extkr;
|
||||
blksz_t* trsm_ni;
|
||||
|
||||
// Cache blocksizes.
|
||||
|
||||
#define BLIS_TRSM_KC_S BLIS_DEFAULT_KC_S
|
||||
#define BLIS_TRSM_KC_D BLIS_DEFAULT_KC_D
|
||||
#define BLIS_TRSM_KC_C BLIS_DEFAULT_KC_C
|
||||
#define BLIS_TRSM_KC_Z BLIS_DEFAULT_KC_Z
|
||||
|
||||
#define BLIS_TRSM_MC_S BLIS_DEFAULT_MC_S
|
||||
#define BLIS_TRSM_MC_D BLIS_DEFAULT_MC_D
|
||||
#define BLIS_TRSM_MC_C BLIS_DEFAULT_MC_C
|
||||
#define BLIS_TRSM_MC_Z BLIS_DEFAULT_MC_Z
|
||||
|
||||
#define BLIS_TRSM_NC_S BLIS_DEFAULT_NC_S
|
||||
#define BLIS_TRSM_NC_D BLIS_DEFAULT_NC_D
|
||||
#define BLIS_TRSM_NC_C BLIS_DEFAULT_NC_C
|
||||
#define BLIS_TRSM_NC_Z BLIS_DEFAULT_NC_Z
|
||||
|
||||
// Register blocking
|
||||
|
||||
#define BLIS_TRSM_KR_S BLIS_DEFAULT_KR_S
|
||||
#define BLIS_TRSM_KR_D BLIS_DEFAULT_KR_D
|
||||
#define BLIS_TRSM_KR_C BLIS_DEFAULT_KR_C
|
||||
#define BLIS_TRSM_KR_Z BLIS_DEFAULT_KR_Z
|
||||
|
||||
#define BLIS_TRSM_MR_S BLIS_DEFAULT_MR_S
|
||||
#define BLIS_TRSM_MR_D BLIS_DEFAULT_MR_D
|
||||
#define BLIS_TRSM_MR_C BLIS_DEFAULT_MR_C
|
||||
#define BLIS_TRSM_MR_Z BLIS_DEFAULT_MR_Z
|
||||
|
||||
#define BLIS_TRSM_NR_S BLIS_DEFAULT_NR_S
|
||||
#define BLIS_TRSM_NR_D BLIS_DEFAULT_NR_D
|
||||
#define BLIS_TRSM_NR_C BLIS_DEFAULT_NR_C
|
||||
#define BLIS_TRSM_NR_Z BLIS_DEFAULT_NR_Z
|
||||
|
||||
// Incremental pack blocking
|
||||
|
||||
#define BLIS_TRSM_NI_S BLIS_DEFAULT_NI_S
|
||||
#define BLIS_TRSM_NI_D BLIS_DEFAULT_NI_D
|
||||
#define BLIS_TRSM_NI_C BLIS_DEFAULT_NI_C
|
||||
#define BLIS_TRSM_NI_Z BLIS_DEFAULT_NI_Z
|
||||
|
||||
|
||||
void bli_trsm_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for each dimension.
|
||||
trsm_mc = bli_blksz_obj_create( BLIS_TRSM_MC_S,
|
||||
BLIS_TRSM_MC_D,
|
||||
BLIS_TRSM_MC_C,
|
||||
BLIS_TRSM_MC_Z );
|
||||
trsm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
|
||||
BLIS_DEFAULT_MC_D,
|
||||
BLIS_DEFAULT_MC_C,
|
||||
BLIS_DEFAULT_MC_Z );
|
||||
|
||||
trsm_nc = bli_blksz_obj_create( BLIS_TRSM_NC_S,
|
||||
BLIS_TRSM_NC_D,
|
||||
BLIS_TRSM_NC_C,
|
||||
BLIS_TRSM_NC_Z );
|
||||
trsm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
|
||||
BLIS_DEFAULT_NC_D,
|
||||
BLIS_DEFAULT_NC_C,
|
||||
BLIS_DEFAULT_NC_Z );
|
||||
|
||||
trsm_kc = bli_blksz_obj_create( BLIS_TRSM_KC_S,
|
||||
BLIS_TRSM_KC_D,
|
||||
BLIS_TRSM_KC_C,
|
||||
BLIS_TRSM_KC_Z );
|
||||
trsm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
|
||||
BLIS_DEFAULT_KC_D,
|
||||
BLIS_DEFAULT_KC_C,
|
||||
BLIS_DEFAULT_KC_Z );
|
||||
|
||||
trsm_mr = bli_blksz_obj_create( BLIS_TRSM_MR_S,
|
||||
BLIS_TRSM_MR_D,
|
||||
BLIS_TRSM_MR_C,
|
||||
BLIS_TRSM_MR_Z );
|
||||
trsm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
|
||||
BLIS_DEFAULT_MR_D,
|
||||
BLIS_DEFAULT_MR_C,
|
||||
BLIS_DEFAULT_MR_Z );
|
||||
|
||||
trsm_nr = bli_blksz_obj_create( BLIS_TRSM_NR_S,
|
||||
BLIS_TRSM_NR_D,
|
||||
BLIS_TRSM_NR_C,
|
||||
BLIS_TRSM_NR_Z );
|
||||
trsm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
|
||||
BLIS_DEFAULT_NR_D,
|
||||
BLIS_DEFAULT_NR_C,
|
||||
BLIS_DEFAULT_NR_Z );
|
||||
|
||||
trsm_kr = bli_blksz_obj_create( BLIS_TRSM_KR_S,
|
||||
BLIS_TRSM_KR_D,
|
||||
BLIS_TRSM_KR_C,
|
||||
BLIS_TRSM_KR_Z );
|
||||
trsm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
|
||||
BLIS_DEFAULT_KR_D,
|
||||
BLIS_DEFAULT_KR_C,
|
||||
BLIS_DEFAULT_KR_Z );
|
||||
|
||||
trsm_ni = bli_blksz_obj_create( BLIS_TRSM_NI_S,
|
||||
BLIS_TRSM_NI_D,
|
||||
BLIS_TRSM_NI_C,
|
||||
BLIS_TRSM_NI_Z );
|
||||
trsm_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
|
||||
BLIS_EXTEND_MR_D,
|
||||
BLIS_EXTEND_MR_C,
|
||||
BLIS_EXTEND_MR_Z );
|
||||
|
||||
trsm_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
|
||||
BLIS_EXTEND_NR_D,
|
||||
BLIS_EXTEND_NR_C,
|
||||
BLIS_EXTEND_NR_Z );
|
||||
|
||||
trsm_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
|
||||
BLIS_EXTEND_KR_D,
|
||||
BLIS_EXTEND_KR_C,
|
||||
BLIS_EXTEND_KR_Z );
|
||||
|
||||
trsm_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
|
||||
BLIS_DEFAULT_NI_D,
|
||||
BLIS_DEFAULT_NI_C,
|
||||
BLIS_DEFAULT_NI_Z );
|
||||
|
||||
|
||||
// Create control tree objects for packm operations on a, b, and c.
|
||||
@@ -143,8 +120,10 @@ void bli_trsm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3, // pack panels of A compactly
|
||||
trsm_mr, // IMPORTANT: n dim multiple must be mr to
|
||||
trsm_mr, // support right and bottom-right edge cases
|
||||
// IMPORTANT: n dim multiple must be mr to
|
||||
// support right and bottom-right edge cases
|
||||
trsm_mr, trsm_extmr,
|
||||
trsm_mr, trsm_extmr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
TRUE, // densify
|
||||
TRUE, // invert diagonal
|
||||
@@ -157,8 +136,10 @@ void bli_trsm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
trsm_mr, // IMPORTANT: m dim multiple must be mr since
|
||||
trsm_nr, // B_pack is updated (ie: serves as C) in trsm
|
||||
// IMPORTANT: m dim multiple must be mr since
|
||||
// B_pack is updated (ie: serves as C) in trsm
|
||||
trsm_mr, trsm_extmr,
|
||||
trsm_nr, trsm_extnr,
|
||||
FALSE, // do NOT scale by alpha
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -171,8 +152,8 @@ void bli_trsm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
trsm_mr,
|
||||
trsm_nr,
|
||||
trsm_mr, trsm_extmr,
|
||||
trsm_nr, trsm_extnr,
|
||||
FALSE, // do NOT scale by beta
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -138,7 +138,7 @@ void PASTEMAC(ch,varname)( \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,kc) * \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,nr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
@@ -154,6 +154,8 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Alias constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t PACKMR = PASTEMAC(ch,packmr); \
|
||||
const dim_t PACKNR = PASTEMAC(ch,packnr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
@@ -252,7 +254,7 @@ void PASTEMAC(ch,varname)( \
|
||||
k_nr = k_a1011 * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = k * MR; \
|
||||
rstep_a = k * PACKMR; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
@@ -305,14 +307,14 @@ void PASTEMAC(ch,varname)( \
|
||||
k_a1011 = bli_min( k, diagoffa_i + MR ); \
|
||||
k_a10 = k_a1011 - MR; \
|
||||
\
|
||||
b11 = b1 + diagoffa_i * NR; \
|
||||
b11 = b1 + diagoffa_i * PACKNR; \
|
||||
bp_i = bp + off_a1011 * NR * NDUP; \
|
||||
\
|
||||
/* Compute the addresses of the A10 panel and triangular
|
||||
block A11, and the corresponding panel Bd01 and block
|
||||
Bd11. */ \
|
||||
a10 = a1; \
|
||||
a11 = a1 + k_a10 * MR; \
|
||||
a11 = a1 + k_a10 * PACKMR; \
|
||||
bp01 = bp_i; \
|
||||
bp11 = bp_i + k_a10 * NR * NDUP; \
|
||||
\
|
||||
@@ -354,7 +356,7 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR,
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += k_a1011 * MR; \
|
||||
a1 += k_a1011 * PACKMR; \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
|
||||
@@ -138,7 +138,7 @@ void PASTEMAC(ch,varname)( \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,kc) * \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,nr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
@@ -154,6 +154,8 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Alias constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t PACKMR = PASTEMAC(ch,packmr); \
|
||||
const dim_t PACKNR = PASTEMAC(ch,packnr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
@@ -252,7 +254,7 @@ void PASTEMAC(ch,varname)( \
|
||||
k_nr = k_a1112 * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = k * MR; \
|
||||
rstep_a = k * PACKMR; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
@@ -310,14 +312,14 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Index into b1 (if the diagonal offset is positive) to
|
||||
locate the MR x NR block of b1 that will be updated by the
|
||||
trsm subproblem. */ \
|
||||
b11 = b1 + off_a1112 * NR; \
|
||||
b11 = b1 + off_a1112 * PACKNR; \
|
||||
bp_i = bp + off_a1112 * NR * NDUP; \
|
||||
\
|
||||
/* Compute the addresses of the A12 panel and triangular
|
||||
block A11, and the corresponding panel Bd21 and block
|
||||
Bd11. */ \
|
||||
a11 = a1; \
|
||||
a12 = a1 + k_a11 * MR; \
|
||||
a12 = a1 + k_a11 * PACKMR; \
|
||||
bp11 = bp_i; \
|
||||
bp21 = bp_i + k_a11 * NR * NDUP; \
|
||||
\
|
||||
@@ -374,7 +376,7 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: ct after (diag)", m_cur, n_cur,
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += k_a1112 * MR; \
|
||||
a1 += k_a1112 * PACKMR; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
|
||||
@@ -49,9 +49,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
\
|
||||
const inc_t rs_b = NR; \
|
||||
const inc_t rs_b = PASTEMAC(ch,packnr); \
|
||||
const inc_t cs_b = 1; \
|
||||
\
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
|
||||
@@ -49,9 +49,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
\
|
||||
const inc_t rs_b = NR; \
|
||||
const inc_t rs_b = PASTEMAC(ch,packnr); \
|
||||
const inc_t cs_b = 1; \
|
||||
\
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
|
||||
@@ -45,16 +45,13 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
\
|
||||
const dim_t m = MR; \
|
||||
const dim_t n = NR; \
|
||||
const dim_t m = PASTEMAC(ch,mr); \
|
||||
const dim_t n = PASTEMAC(ch,nr); \
|
||||
\
|
||||
const inc_t rs_a = 1; \
|
||||
const inc_t cs_a = MR; \
|
||||
const inc_t cs_a = PASTEMAC(ch,packmr); \
|
||||
\
|
||||
const inc_t rs_b = NR; \
|
||||
const inc_t rs_b = PASTEMAC(ch,packnr); \
|
||||
const inc_t cs_b = 1; \
|
||||
\
|
||||
dim_t iter, i, j, k; \
|
||||
|
||||
@@ -45,16 +45,13 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
\
|
||||
const dim_t m = MR; \
|
||||
const dim_t n = NR; \
|
||||
const dim_t m = PASTEMAC(ch,mr); \
|
||||
const dim_t n = PASTEMAC(ch,nr); \
|
||||
\
|
||||
const inc_t rs_a = 1; \
|
||||
const inc_t cs_a = MR; \
|
||||
const inc_t cs_a = PASTEMAC(ch,packmr); \
|
||||
\
|
||||
const inc_t rs_b = NR; \
|
||||
const inc_t rs_b = PASTEMAC(ch,packnr); \
|
||||
const inc_t cs_b = 1; \
|
||||
\
|
||||
dim_t iter, i, j, k; \
|
||||
|
||||
@@ -81,11 +81,19 @@ dim_t bli_blksz_for_obj( obj_t* obj,
|
||||
return b->v[ bli_obj_datatype( *obj ) ];
|
||||
}
|
||||
|
||||
extern blksz_t* gemm_mc;
|
||||
extern blksz_t* gemm_nc;
|
||||
extern blksz_t* gemm_kc;
|
||||
extern blksz_t* gemm_mr;
|
||||
extern blksz_t* gemm_nr;
|
||||
extern blksz_t* gemm_kr;
|
||||
|
||||
dim_t bli_determine_blocksize_f( dim_t i,
|
||||
dim_t dim,
|
||||
obj_t* obj,
|
||||
blksz_t* b )
|
||||
{
|
||||
#if 0
|
||||
num_t dt;
|
||||
dim_t b_alg;
|
||||
|
||||
@@ -103,7 +111,138 @@ dim_t bli_determine_blocksize_f( dim_t i,
|
||||
// smaller, in which case we return that remaining value.
|
||||
b_alg = bli_min( b_alg, dim - i );
|
||||
|
||||
//printf( "bli_determine_blocksize0: returning %lu\n", b_alg );
|
||||
|
||||
return b_alg;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
num_t dt;
|
||||
dim_t b_alg, b_now;
|
||||
dim_t mc, nc, kc;
|
||||
dim_t mr, nr, kr;
|
||||
dim_t dim_left_now;
|
||||
|
||||
dt = bli_obj_execution_datatype( *obj );
|
||||
b_alg = bli_blksz_for_type( dt, b );
|
||||
|
||||
mc = bli_blksz_for_type( dt, gemm_mc );
|
||||
nc = bli_blksz_for_type( dt, gemm_nc );
|
||||
kc = bli_blksz_for_type( dt, gemm_kc );
|
||||
|
||||
mr = bli_blksz_for_type( dt, gemm_mr );
|
||||
nr = bli_blksz_for_type( dt, gemm_nr );
|
||||
kr = bli_blksz_for_type( dt, gemm_kr );
|
||||
|
||||
dim_left_now = dim - i;
|
||||
|
||||
if ( dim_left_now <= b_alg )
|
||||
{
|
||||
b_now = dim_left_now;
|
||||
}
|
||||
else if ( dim_left_now <= b_alg + (b_alg/4) )
|
||||
{
|
||||
b_now = dim_left_now / 2;
|
||||
|
||||
// This actually won't work when, for example, mc == kc but mr != kr.
|
||||
if ( b_alg == mc ) b_now = bli_align_dim_to_mult( b_now, mr );
|
||||
else if ( b_alg == nc ) b_now = bli_align_dim_to_mult( b_now, nr );
|
||||
else if ( b_alg == kc ) b_now = bli_align_dim_to_mult( b_now, kr );
|
||||
}
|
||||
else
|
||||
{
|
||||
b_now = b_alg;
|
||||
}
|
||||
|
||||
//printf( "bli_determine_blocksize1: returning %lu\n", b_now );
|
||||
|
||||
return b_now;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
num_t dt;
|
||||
dim_t b_alg, b_now;
|
||||
dim_t mc, nc, kc;
|
||||
dim_t mr, nr, kr;
|
||||
dim_t dim_left_now;
|
||||
|
||||
dt = bli_obj_execution_datatype( *obj );
|
||||
b_alg = bli_blksz_for_type( dt, b );
|
||||
|
||||
mc = bli_blksz_for_type( dt, gemm_mc );
|
||||
nc = bli_blksz_for_type( dt, gemm_nc );
|
||||
kc = bli_blksz_for_type( dt, gemm_kc );
|
||||
|
||||
mr = bli_blksz_for_type( dt, gemm_mr );
|
||||
nr = bli_blksz_for_type( dt, gemm_nr );
|
||||
kr = bli_blksz_for_type( dt, gemm_kr );
|
||||
|
||||
dim_left_now = dim - i;
|
||||
|
||||
if ( dim_left_now <= b_alg )
|
||||
{
|
||||
b_now = dim_left_now;
|
||||
}
|
||||
else if ( dim_left_now <= 2 * b_alg )
|
||||
{
|
||||
b_now = dim_left_now / 2;
|
||||
|
||||
// This actually won't work when, for example, mc == kc but mr != kr.
|
||||
if ( b_alg == mc ) b_now = bli_align_dim_to_mult( b_now, mr );
|
||||
else if ( b_alg == nc ) b_now = bli_align_dim_to_mult( b_now, nr );
|
||||
else if ( b_alg == kc ) b_now = bli_align_dim_to_mult( b_now, kr );
|
||||
}
|
||||
else
|
||||
{
|
||||
b_now = b_alg;
|
||||
}
|
||||
|
||||
//printf( "bli_determine_blocksize2: returning %lu\n", b_now );
|
||||
|
||||
return b_now;
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_EDGECASE_HACK
|
||||
num_t dt;
|
||||
dim_t b_alg, b_now;
|
||||
dim_t dim_left_now;
|
||||
|
||||
dt = bli_obj_execution_datatype( *obj );
|
||||
b_alg = bli_blksz_for_type( dt, b );
|
||||
|
||||
dim_left_now = dim - i;
|
||||
|
||||
if ( dim_left_now <= b_alg + b_alg/4 )
|
||||
{
|
||||
b_now = dim_left_now;
|
||||
}
|
||||
else
|
||||
{
|
||||
b_now = b_alg;
|
||||
}
|
||||
|
||||
return b_now;
|
||||
#else
|
||||
num_t dt;
|
||||
dim_t b_alg;
|
||||
|
||||
// We assume that this function is being called from an algorithm that
|
||||
// is moving "forward" (ie: top to bottom, left to right, top-left
|
||||
// to bottom-right).
|
||||
|
||||
// Extract the execution datatype and use it to query the corresponding
|
||||
// blocksize value from the blksz_t object.
|
||||
dt = bli_obj_execution_datatype( *obj );
|
||||
b_alg = bli_blksz_for_type( dt, b );
|
||||
|
||||
// If we are moving "forward" (ie: top to bottom, left to right, or
|
||||
// top-left to bottom-right), then return b_alg, unless dim - i is
|
||||
// smaller, in which case we return that remaining value.
|
||||
b_alg = bli_min( b_alg, dim - i );
|
||||
|
||||
|
||||
return b_alg;
|
||||
#endif
|
||||
}
|
||||
|
||||
dim_t bli_determine_blocksize_b( dim_t i,
|
||||
|
||||
@@ -37,9 +37,9 @@
|
||||
|
||||
// Define the size of pool blocks. These may be adjusted so that they can
|
||||
// handle inflated blocksizes at edge cases.
|
||||
#define BLIS_POOL_MC_D BLIS_DEFAULT_MC_D
|
||||
#define BLIS_POOL_KC_D BLIS_DEFAULT_KC_D
|
||||
#define BLIS_POOL_NC_D BLIS_DEFAULT_NC_D
|
||||
#define BLIS_POOL_MC_D ( ( BLIS_MAXIMUM_MC_D * BLIS_PACKDIM_MR_D ) / BLIS_DEFAULT_MR_D )
|
||||
#define BLIS_POOL_KC_D ( ( BLIS_MAXIMUM_KC_D * BLIS_PACKDIM_KR_D ) / BLIS_DEFAULT_KR_D )
|
||||
#define BLIS_POOL_NC_D ( ( BLIS_MAXIMUM_NC_D * BLIS_PACKDIM_NR_D ) / BLIS_DEFAULT_NR_D )
|
||||
|
||||
// Define each pool's block size.
|
||||
// NOTE: Here we assume the "worst" case of the register blocking
|
||||
|
||||
@@ -510,8 +510,8 @@ void bli_obj_print( char* label, obj_t* obj )
|
||||
fprintf( file, " - buf %p\n", bli_mem_buffer( pack_mem ) );
|
||||
fprintf( file, " - buf_type %u\n", bli_mem_buf_type( pack_mem ) );
|
||||
fprintf( file, " - size %lu\n", bli_mem_size( pack_mem ) );
|
||||
fprintf( file, " m_packed %lu\n", bli_obj_packed_length( *obj ) );
|
||||
fprintf( file, " n_packed %lu\n", bli_obj_packed_width( *obj ) );
|
||||
fprintf( file, " m_padded %lu\n", bli_obj_padded_length( *obj ) );
|
||||
fprintf( file, " n_padded %lu\n", bli_obj_padded_width( *obj ) );
|
||||
fprintf( file, " ps %lu\n", bli_obj_panel_stride( *obj ) );
|
||||
fprintf( file, "\n" );
|
||||
|
||||
|
||||
@@ -35,52 +35,214 @@
|
||||
#ifndef BLIS_KERNEL_MACRO_DEFS_H
|
||||
#define BLIS_KERNEL_MACRO_DEFS_H
|
||||
|
||||
#define SIZEOF_S 4
|
||||
#define SIZEOF_D 8
|
||||
#define SIZEOF_C 8
|
||||
#define SIZEOF_Z 16
|
||||
|
||||
// Redefine kernel blocksizes, defined in bli_kernel.h, to shorter
|
||||
// names that can be derived via PASTEMAC macro.
|
||||
|
||||
// Cache blocksizes
|
||||
// -- Kernel macro checks ------------------------------------------------------
|
||||
|
||||
#define bli_smc BLIS_DEFAULT_MC_S
|
||||
#define bli_snc BLIS_DEFAULT_NC_S
|
||||
#define bli_skc BLIS_DEFAULT_KC_S
|
||||
// Verify that cache blocksizes are whole multiples of register blocksizes.
|
||||
// Specifically, verify that:
|
||||
// - MC is a whole multiple of MR.
|
||||
// - NC is a whole multiple of NR.
|
||||
// - KC is a whole multiple of KR.
|
||||
// These constraints are enforced because it makes it easier to handle diagonals
|
||||
// in the macro-kernel implementations.
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_MC_S % BLIS_DEFAULT_MR_S != 0 ) || \
|
||||
( BLIS_DEFAULT_MC_D % BLIS_DEFAULT_MR_D != 0 ) || \
|
||||
( BLIS_DEFAULT_MC_C % BLIS_DEFAULT_MR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_MC_Z % BLIS_DEFAULT_MR_Z != 0 ) \
|
||||
)
|
||||
#error MC must be multiple of MR for all datatypes.
|
||||
#endif
|
||||
|
||||
#define bli_dmc BLIS_DEFAULT_MC_D
|
||||
#define bli_dnc BLIS_DEFAULT_NC_D
|
||||
#define bli_dkc BLIS_DEFAULT_KC_D
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_NC_S % BLIS_DEFAULT_NR_S != 0 ) || \
|
||||
( BLIS_DEFAULT_NC_D % BLIS_DEFAULT_NR_D != 0 ) || \
|
||||
( BLIS_DEFAULT_NC_C % BLIS_DEFAULT_NR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_NC_Z % BLIS_DEFAULT_NR_Z != 0 ) \
|
||||
)
|
||||
#error NC must be multiple of NR for all datatypes.
|
||||
#endif
|
||||
|
||||
#define bli_cmc BLIS_DEFAULT_MC_C
|
||||
#define bli_cnc BLIS_DEFAULT_NC_C
|
||||
#define bli_ckc BLIS_DEFAULT_KC_C
|
||||
#if ( \
|
||||
( BLIS_DEFAULT_KC_S % BLIS_DEFAULT_KR_S != 0 ) || \
|
||||
( BLIS_DEFAULT_KC_D % BLIS_DEFAULT_KR_D != 0 ) || \
|
||||
( BLIS_DEFAULT_KC_C % BLIS_DEFAULT_KR_C != 0 ) || \
|
||||
( BLIS_DEFAULT_KC_Z % BLIS_DEFAULT_KR_Z != 0 ) \
|
||||
)
|
||||
#error KC must be multiple of KR for all datatypes.
|
||||
#endif
|
||||
|
||||
#define bli_zmc BLIS_DEFAULT_MC_Z
|
||||
#define bli_znc BLIS_DEFAULT_NC_Z
|
||||
#define bli_zkc BLIS_DEFAULT_KC_Z
|
||||
// Verify that cache blocksizes indicate consistent storage.
|
||||
// Specifically, verify that:
|
||||
// - MC_D * KC_D >= MC_? * KC_?.
|
||||
// - KC_D * NC_D >= KC_? * NC_?.
|
||||
// - MC_D * NC_D >= MC_? * NC_?.
|
||||
// These constraints are enforced because static memory is allocated for the
|
||||
// contiguous memory allocator using the double-precision real values of MC,
|
||||
// NC, and KC.
|
||||
#if ( \
|
||||
( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_KC_D * SIZEOF_D ) < \
|
||||
( BLIS_DEFAULT_MC_S * BLIS_DEFAULT_KC_S * SIZEOF_S ) ) || \
|
||||
( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_KC_D * SIZEOF_D ) < \
|
||||
( BLIS_DEFAULT_MC_C * BLIS_DEFAULT_KC_C * SIZEOF_C ) ) || \
|
||||
( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_KC_D * SIZEOF_D ) < \
|
||||
( BLIS_DEFAULT_MC_Z * BLIS_DEFAULT_KC_Z * SIZEOF_Z ) ) \
|
||||
)
|
||||
#error MC_D*KC_D must be >= that of MC*KC for all other datatypes.
|
||||
#endif
|
||||
|
||||
#if ( \
|
||||
( ( BLIS_DEFAULT_KC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \
|
||||
( BLIS_DEFAULT_KC_S * BLIS_DEFAULT_NC_S * SIZEOF_S ) ) || \
|
||||
( ( BLIS_DEFAULT_KC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \
|
||||
( BLIS_DEFAULT_KC_C * BLIS_DEFAULT_NC_C * SIZEOF_C ) ) || \
|
||||
( ( BLIS_DEFAULT_KC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \
|
||||
( BLIS_DEFAULT_KC_Z * BLIS_DEFAULT_NC_Z * SIZEOF_Z ) ) \
|
||||
)
|
||||
#error KC_D*NC_D must be >= that of KC*NC for all other datatypes.
|
||||
#endif
|
||||
|
||||
/*
|
||||
#if ( \
|
||||
( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \
|
||||
( BLIS_DEFAULT_MC_S * BLIS_DEFAULT_NC_S * SIZEOF_S ) ) || \
|
||||
( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \
|
||||
( BLIS_DEFAULT_MC_C * BLIS_DEFAULT_NC_C * SIZEOF_C ) ) || \
|
||||
( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \
|
||||
( BLIS_DEFAULT_MC_Z * BLIS_DEFAULT_NC_Z * SIZEOF_Z ) ) \
|
||||
)
|
||||
#error MC_D*NC_D must be >= that of MC*NC for all other datatypes.
|
||||
#endif
|
||||
*/
|
||||
|
||||
|
||||
// -- Compute maximum cache blocksizes -----------------------------------------
|
||||
|
||||
#define BLIS_MAXIMUM_MC_S ( BLIS_DEFAULT_MC_S + BLIS_EXTEND_MC_S )
|
||||
#define BLIS_MAXIMUM_KC_S ( BLIS_DEFAULT_KC_S + BLIS_EXTEND_KC_S )
|
||||
#define BLIS_MAXIMUM_NC_S ( BLIS_DEFAULT_NC_S + BLIS_EXTEND_NC_S )
|
||||
|
||||
#define BLIS_MAXIMUM_MC_D ( BLIS_DEFAULT_MC_D + BLIS_EXTEND_MC_D )
|
||||
#define BLIS_MAXIMUM_KC_D ( BLIS_DEFAULT_KC_D + BLIS_EXTEND_KC_D )
|
||||
#define BLIS_MAXIMUM_NC_D ( BLIS_DEFAULT_NC_D + BLIS_EXTEND_NC_D )
|
||||
|
||||
#define BLIS_MAXIMUM_MC_C ( BLIS_DEFAULT_MC_C + BLIS_EXTEND_MC_C )
|
||||
#define BLIS_MAXIMUM_KC_C ( BLIS_DEFAULT_KC_C + BLIS_EXTEND_KC_C )
|
||||
#define BLIS_MAXIMUM_NC_C ( BLIS_DEFAULT_NC_C + BLIS_EXTEND_NC_C )
|
||||
|
||||
#define BLIS_MAXIMUM_MC_Z ( BLIS_DEFAULT_MC_Z + BLIS_EXTEND_MC_Z )
|
||||
#define BLIS_MAXIMUM_KC_Z ( BLIS_DEFAULT_KC_Z + BLIS_EXTEND_KC_Z )
|
||||
#define BLIS_MAXIMUM_NC_Z ( BLIS_DEFAULT_NC_Z + BLIS_EXTEND_NC_Z )
|
||||
|
||||
|
||||
// -- Compute leading dim blocksizes used for packing --------------------------
|
||||
|
||||
#define BLIS_PACKDIM_MR_S ( BLIS_DEFAULT_MR_S + BLIS_EXTEND_MR_S )
|
||||
#define BLIS_PACKDIM_KR_S ( BLIS_DEFAULT_KR_S + BLIS_EXTEND_KR_S )
|
||||
#define BLIS_PACKDIM_NR_S ( BLIS_DEFAULT_NR_S + BLIS_EXTEND_NR_S )
|
||||
|
||||
#define BLIS_PACKDIM_MR_D ( BLIS_DEFAULT_MR_D + BLIS_EXTEND_MR_D )
|
||||
#define BLIS_PACKDIM_KR_D ( BLIS_DEFAULT_KR_D + BLIS_EXTEND_KR_D )
|
||||
#define BLIS_PACKDIM_NR_D ( BLIS_DEFAULT_NR_D + BLIS_EXTEND_NR_D )
|
||||
|
||||
#define BLIS_PACKDIM_MR_C ( BLIS_DEFAULT_MR_C + BLIS_EXTEND_MR_C )
|
||||
#define BLIS_PACKDIM_KR_C ( BLIS_DEFAULT_KR_C + BLIS_EXTEND_KR_C )
|
||||
#define BLIS_PACKDIM_NR_C ( BLIS_DEFAULT_NR_C + BLIS_EXTEND_NR_C )
|
||||
|
||||
#define BLIS_PACKDIM_MR_Z ( BLIS_DEFAULT_MR_Z + BLIS_EXTEND_MR_Z )
|
||||
#define BLIS_PACKDIM_KR_Z ( BLIS_DEFAULT_KR_Z + BLIS_EXTEND_KR_Z )
|
||||
#define BLIS_PACKDIM_NR_Z ( BLIS_DEFAULT_NR_Z + BLIS_EXTEND_NR_Z )
|
||||
|
||||
|
||||
// -- Abbreviated kernel blocksize macros --------------------------------------
|
||||
|
||||
// Here, we shorten the blocksizes defined in bli_kernel.h so that they can
|
||||
// be derived via the PASTEMAC macro.
|
||||
|
||||
// Default cache blocksizes
|
||||
|
||||
#define bli_smc BLIS_DEFAULT_MC_S
|
||||
#define bli_skc BLIS_DEFAULT_KC_S
|
||||
#define bli_snc BLIS_DEFAULT_NC_S
|
||||
|
||||
#define bli_dmc BLIS_DEFAULT_MC_D
|
||||
#define bli_dkc BLIS_DEFAULT_KC_D
|
||||
#define bli_dnc BLIS_DEFAULT_NC_D
|
||||
|
||||
#define bli_cmc BLIS_DEFAULT_MC_C
|
||||
#define bli_ckc BLIS_DEFAULT_KC_C
|
||||
#define bli_cnc BLIS_DEFAULT_NC_C
|
||||
|
||||
#define bli_zmc BLIS_DEFAULT_MC_Z
|
||||
#define bli_zkc BLIS_DEFAULT_KC_Z
|
||||
#define bli_znc BLIS_DEFAULT_NC_Z
|
||||
|
||||
// Maximum cache blocksizes
|
||||
|
||||
#define bli_smaxmc BLIS_MAXIMUM_MC_S
|
||||
#define bli_smaxkc BLIS_MAXIMUM_KC_S
|
||||
#define bli_smaxnc BLIS_MAXIMUM_NC_S
|
||||
|
||||
#define bli_dmaxmc BLIS_MAXIMUM_MC_D
|
||||
#define bli_dmaxkc BLIS_MAXIMUM_KC_D
|
||||
#define bli_dmaxnc BLIS_MAXIMUM_NC_D
|
||||
|
||||
#define bli_cmaxmc BLIS_MAXIMUM_MC_C
|
||||
#define bli_cmaxkc BLIS_MAXIMUM_KC_C
|
||||
#define bli_cmaxnc BLIS_MAXIMUM_NC_C
|
||||
|
||||
#define bli_zmaxmc BLIS_MAXIMUM_MC_Z
|
||||
#define bli_zmaxkc BLIS_MAXIMUM_KC_Z
|
||||
#define bli_zmaxnc BLIS_MAXIMUM_NC_Z
|
||||
|
||||
// Register blocksizes
|
||||
|
||||
#define bli_smr BLIS_DEFAULT_MR_S
|
||||
#define bli_snr BLIS_DEFAULT_NR_S
|
||||
#define bli_skr BLIS_DEFAULT_KR_S
|
||||
#define bli_smr BLIS_DEFAULT_MR_S
|
||||
#define bli_skr BLIS_DEFAULT_KR_S
|
||||
#define bli_snr BLIS_DEFAULT_NR_S
|
||||
|
||||
#define bli_dmr BLIS_DEFAULT_MR_D
|
||||
#define bli_dnr BLIS_DEFAULT_NR_D
|
||||
#define bli_dkr BLIS_DEFAULT_KR_D
|
||||
#define bli_dmr BLIS_DEFAULT_MR_D
|
||||
#define bli_dkr BLIS_DEFAULT_KR_D
|
||||
#define bli_dnr BLIS_DEFAULT_NR_D
|
||||
|
||||
#define bli_cmr BLIS_DEFAULT_MR_C
|
||||
#define bli_cnr BLIS_DEFAULT_NR_C
|
||||
#define bli_ckr BLIS_DEFAULT_KR_C
|
||||
#define bli_cmr BLIS_DEFAULT_MR_C
|
||||
#define bli_ckr BLIS_DEFAULT_KR_C
|
||||
#define bli_cnr BLIS_DEFAULT_NR_C
|
||||
|
||||
#define bli_zmr BLIS_DEFAULT_MR_Z
|
||||
#define bli_znr BLIS_DEFAULT_NR_Z
|
||||
#define bli_zkr BLIS_DEFAULT_KR_Z
|
||||
#define bli_zmr BLIS_DEFAULT_MR_Z
|
||||
#define bli_zkr BLIS_DEFAULT_KR_Z
|
||||
#define bli_znr BLIS_DEFAULT_NR_Z
|
||||
|
||||
// Duplication
|
||||
// Micro-panel packing register blocksizes
|
||||
|
||||
#define bli_spackmr BLIS_PACKDIM_MR_S
|
||||
#define bli_spackkr BLIS_PACKDIM_KR_S
|
||||
#define bli_spacknr BLIS_PACKDIM_NR_S
|
||||
|
||||
#define bli_dpackmr BLIS_PACKDIM_MR_D
|
||||
#define bli_dpackkr BLIS_PACKDIM_KR_D
|
||||
#define bli_dpacknr BLIS_PACKDIM_NR_D
|
||||
|
||||
#define bli_cpackmr BLIS_PACKDIM_MR_C
|
||||
#define bli_cpackkr BLIS_PACKDIM_KR_C
|
||||
#define bli_cpacknr BLIS_PACKDIM_NR_C
|
||||
|
||||
#define bli_zpackmr BLIS_PACKDIM_MR_Z
|
||||
#define bli_zpackkr BLIS_PACKDIM_KR_Z
|
||||
#define bli_zpacknr BLIS_PACKDIM_NR_Z
|
||||
|
||||
// Duplication factors
|
||||
|
||||
#define bli_sndup BLIS_DEFAULT_NUM_DUPL_S
|
||||
#define bli_dndup BLIS_DEFAULT_NUM_DUPL_D
|
||||
#define bli_cndup BLIS_DEFAULT_NUM_DUPL_C
|
||||
#define bli_zndup BLIS_DEFAULT_NUM_DUPL_Z
|
||||
|
||||
#define bli_sndup BLIS_DEFAULT_NUM_DUPL_S
|
||||
#define bli_dndup BLIS_DEFAULT_NUM_DUPL_D
|
||||
#define bli_cndup BLIS_DEFAULT_NUM_DUPL_C
|
||||
#define bli_zndup BLIS_DEFAULT_NUM_DUPL_Z
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -710,30 +710,44 @@ bli_obj_width_stored( obj )
|
||||
|
||||
// Packed dimensions query
|
||||
|
||||
#define bli_obj_packed_length( obj ) \
|
||||
#define bli_obj_padded_length( obj ) \
|
||||
\
|
||||
( (obj).m_packed )
|
||||
( (obj).m_padded )
|
||||
|
||||
#define bli_obj_packed_width( obj ) \
|
||||
#define bli_obj_padded_width( obj ) \
|
||||
\
|
||||
( (obj).n_packed )
|
||||
( (obj).n_padded )
|
||||
|
||||
// Packed dimensions modification
|
||||
|
||||
#define bli_obj_set_packed_length( m0, obj ) \
|
||||
#define bli_obj_set_padded_length( m0, obj ) \
|
||||
{ \
|
||||
(obj).m_packed = m0; \
|
||||
(obj).m_padded = m0; \
|
||||
}
|
||||
|
||||
#define bli_obj_set_packed_width( n0, obj ) \
|
||||
#define bli_obj_set_padded_width( n0, obj ) \
|
||||
{ \
|
||||
(obj).n_packed = n0; \
|
||||
(obj).n_padded = n0; \
|
||||
}
|
||||
|
||||
#define bli_obj_set_packed_dims( m0, n0, obj ) \
|
||||
#define bli_obj_set_padded_dims( m0, n0, obj ) \
|
||||
{ \
|
||||
bli_obj_set_packed_length( m0, obj ); \
|
||||
bli_obj_set_packed_width( n0, obj ); \
|
||||
bli_obj_set_padded_length( m0, obj ); \
|
||||
bli_obj_set_padded_width( n0, obj ); \
|
||||
}
|
||||
|
||||
|
||||
// Packed panel dimension query
|
||||
|
||||
#define bli_obj_panel_dim( obj ) \
|
||||
\
|
||||
((obj).pd)
|
||||
|
||||
// Packed panel dimension modification
|
||||
|
||||
#define bli_obj_set_panel_dim( panel_dim, obj ) \
|
||||
{ \
|
||||
(obj).pd = panel_dim; \
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -388,9 +388,11 @@ typedef struct obj_s
|
||||
|
||||
// Pack-related fields
|
||||
mem_t pack_mem; // cached memory region for packing
|
||||
dim_t m_packed;
|
||||
dim_t n_packed;
|
||||
dim_t m_padded; // m dimension of matrix, including any padding
|
||||
dim_t n_padded; // n dimension of matrix, including any padding
|
||||
inc_t ps; // panel stride (distance to next panel)
|
||||
inc_t pd; // panel dimension (the "width" of a panel:
|
||||
// usually MR or NR)
|
||||
|
||||
//mem_t cast_mem; // cached memory region for casting
|
||||
|
||||
@@ -445,8 +447,9 @@ typedef struct obj_s
|
||||
those situations, we want the subpartition to inherit the pack_mem
|
||||
field, and the corresponding packed dimensions, of its parent. */ \
|
||||
(b).pack_mem = (a).pack_mem; \
|
||||
(b).m_packed = (a).m_packed; \
|
||||
(b).n_packed = (a).n_packed; \
|
||||
(b).m_padded = (a).m_padded; \
|
||||
(b).n_padded = (a).n_padded; \
|
||||
(b).pd = (a).pd; \
|
||||
(b).ps = (a).ps; \
|
||||
\
|
||||
/*(b).cast_mem = (a).cast_mem;*/ \
|
||||
|
||||
@@ -240,8 +240,8 @@ int main( int argc, char** argv )
|
||||
packm_cntl_a =
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
mr,
|
||||
kr,
|
||||
mr, NULL,
|
||||
kr, NULL,
|
||||
TRUE, // scale?
|
||||
TRUE, // densify?
|
||||
FALSE, // invert diagonal?
|
||||
@@ -252,8 +252,8 @@ int main( int argc, char** argv )
|
||||
packm_cntl_b =
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
kr,
|
||||
nr,
|
||||
kr, NULL,
|
||||
nr, NULL,
|
||||
FALSE, // scale?
|
||||
FALSE, // densify?
|
||||
FALSE, // invert diagonal?
|
||||
|
||||
@@ -38,12 +38,11 @@
|
||||
// transa transb m n k alpha a lda b ldb beta c ldc
|
||||
void dgemm_( char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* );
|
||||
|
||||
//#define PRINT
|
||||
#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, b, c;
|
||||
obj_t a_pack, b_pack;
|
||||
obj_t c_save;
|
||||
obj_t alpha, beta;
|
||||
dim_t m, n, k;
|
||||
@@ -54,6 +53,9 @@ int main( int argc, char** argv )
|
||||
num_t dt_alpha, dt_beta;
|
||||
int r, n_repeats;
|
||||
|
||||
#if 0
|
||||
obj_t a_pack, b_pack;
|
||||
|
||||
blksz_t* mr;
|
||||
blksz_t* nr;
|
||||
blksz_t* kr;
|
||||
@@ -70,6 +72,7 @@ int main( int argc, char** argv )
|
||||
gemm_t* gemm_cntl_op_bp;
|
||||
gemm_t* gemm_cntl_mm_op;
|
||||
gemm_t* gemm_cntl_vl_mm;
|
||||
#endif
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
@@ -132,6 +135,7 @@ int main( int argc, char** argv )
|
||||
bli_setsc( (2.0/1.0), 0.0, &alpha );
|
||||
bli_setsc( -(1.0/1.0), 0.0, &beta );
|
||||
|
||||
#if 0
|
||||
mr = bli_blksz_obj_create( 2, 4, 2, 2 );
|
||||
kr = bli_blksz_obj_create( 1, 1, 1, 1 );
|
||||
nr = bli_blksz_obj_create( 1, 4, 1, 1 );
|
||||
@@ -215,7 +219,7 @@ int main( int argc, char** argv )
|
||||
|
||||
bli_obj_init_pack( &a_pack );
|
||||
bli_obj_init_pack( &b_pack );
|
||||
|
||||
#endif
|
||||
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
@@ -291,6 +295,7 @@ int main( int argc, char** argv )
|
||||
printf( "( %2ld, 1:5 ) = [ %4lu %4lu %4lu %10.3e %6.3f ];\n",
|
||||
(p - p_begin + 1)/p_inc + 1, m, k, n, dtime_save, gflops );
|
||||
|
||||
#if 0
|
||||
bli_obj_release_pack( &a_pack );
|
||||
bli_obj_release_pack( &b_pack );
|
||||
|
||||
@@ -309,6 +314,7 @@ int main( int argc, char** argv )
|
||||
bli_cntl_obj_free( gemm_cntl_op_bp );
|
||||
bli_cntl_obj_free( gemm_cntl_mm_op );
|
||||
bli_cntl_obj_free( gemm_cntl_vl_mm );
|
||||
#endif
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
bli_obj_free( &beta );
|
||||
|
||||
@@ -149,8 +149,8 @@ int main( int argc, char** argv )
|
||||
packm_cntl_a =
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
mr,
|
||||
kr,
|
||||
mr, NULL,
|
||||
kr, NULL,
|
||||
FALSE, // scale?
|
||||
TRUE, // densify?
|
||||
FALSE, // invert diagonal?
|
||||
@@ -162,8 +162,8 @@ int main( int argc, char** argv )
|
||||
packm_cntl_b =
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
kr,
|
||||
nr,
|
||||
kr, NULL,
|
||||
nr, NULL,
|
||||
FALSE, // scale?
|
||||
FALSE, // densify?
|
||||
FALSE, // invert diagonal?
|
||||
|
||||
@@ -146,8 +146,8 @@ int main( int argc, char** argv )
|
||||
packm_cntl_a =
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
mr,
|
||||
kr,
|
||||
mr, NULL,
|
||||
kr, NULL,
|
||||
FALSE, // scale?
|
||||
FALSE, // densify?
|
||||
FALSE, // invert diagonal?
|
||||
@@ -159,8 +159,8 @@ int main( int argc, char** argv )
|
||||
packm_cntl_b =
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
kr,
|
||||
nr,
|
||||
kr, NULL,
|
||||
nr, NULL,
|
||||
FALSE, // scale?
|
||||
FALSE, // densify?
|
||||
FALSE, // invert diagonal?
|
||||
|
||||
@@ -142,8 +142,8 @@ int main( int argc, char** argv )
|
||||
packm_cntl_a =
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
mr,
|
||||
kr,
|
||||
mr, NULL,
|
||||
kr, NULL,
|
||||
FALSE, // scale?
|
||||
FALSE, // densify?
|
||||
FALSE, // invert diagonal?
|
||||
@@ -155,8 +155,8 @@ int main( int argc, char** argv )
|
||||
packm_cntl_b =
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
kr,
|
||||
nr,
|
||||
kr, NULL,
|
||||
nr, NULL,
|
||||
FALSE, // scale?
|
||||
FALSE, // densify?
|
||||
FALSE, // invert diagonal?
|
||||
|
||||
@@ -162,8 +162,8 @@ int main( int argc, char** argv )
|
||||
packm_cntl_a =
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
mr, // IMPORTANT: for consistency with trsm, "k" dim
|
||||
mr, // multiple is set to mr.
|
||||
mr, NULL, // IMPORTANT: for consistency with trsm, "k" dim
|
||||
mr, NULL, // multiple is set to mr.
|
||||
FALSE, // scale?
|
||||
TRUE, // densify?
|
||||
FALSE, // invert diagonal?
|
||||
@@ -175,8 +175,8 @@ int main( int argc, char** argv )
|
||||
packm_cntl_b =
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
mr, // IMPORTANT: m dim multiple here must be mr
|
||||
nr, // since "k" dim multiple is set to mr above.
|
||||
mr, NULL, // IMPORTANT: m dim multiple here must be mr
|
||||
nr, NULL, // since "k" dim multiple is set to mr above.
|
||||
FALSE, // scale?
|
||||
FALSE, // densify?
|
||||
FALSE, // invert diagonal?
|
||||
|
||||
@@ -151,8 +151,8 @@ int main( int argc, char** argv )
|
||||
packm_cntl_a =
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
mr, // IMPORTANT: "k" dim multiple must be mr to
|
||||
mr, // support using ukernel for right/bottom-right
|
||||
mr, NULL, // IMPORTANT: "k" dim multiple must be mr to
|
||||
mr, NULL, // support using ukernel for right/bottom-right
|
||||
// edge cases (see macro-kernel for comments).
|
||||
FALSE, // scale?
|
||||
TRUE, // densify?
|
||||
@@ -165,8 +165,8 @@ int main( int argc, char** argv )
|
||||
packm_cntl_b =
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
mr, // IMPORTANT: m dim multiple here must be mr
|
||||
nr, // since "k" dim multiple is set to mr above.
|
||||
mr, NULL, // IMPORTANT: m dim multiple here must be mr
|
||||
nr, NULL, // since "k" dim multiple is set to mr above.
|
||||
TRUE, // scale?
|
||||
FALSE, // densify?
|
||||
FALSE, // invert diagonal?
|
||||
|
||||
@@ -490,6 +490,23 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
|
||||
BLIS_DEFAULT_NC_C,
|
||||
BLIS_DEFAULT_NC_Z );
|
||||
libblis_test_fprintf_c( os, "\n" );
|
||||
libblis_test_fprintf_c( os, "level-3 cache blksz exts s d c z \n" );
|
||||
libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n",
|
||||
BLIS_EXTEND_MC_S,
|
||||
BLIS_EXTEND_MC_D,
|
||||
BLIS_EXTEND_MC_C,
|
||||
BLIS_EXTEND_MC_Z );
|
||||
libblis_test_fprintf_c( os, " k dimension %5u %5u %5u %5u\n",
|
||||
BLIS_EXTEND_KC_S,
|
||||
BLIS_EXTEND_KC_D,
|
||||
BLIS_EXTEND_KC_C,
|
||||
BLIS_EXTEND_KC_Z );
|
||||
libblis_test_fprintf_c( os, " n dimension %5u %5u %5u %5u\n",
|
||||
BLIS_EXTEND_NC_S,
|
||||
BLIS_EXTEND_NC_D,
|
||||
BLIS_EXTEND_NC_C,
|
||||
BLIS_EXTEND_NC_Z );
|
||||
libblis_test_fprintf_c( os, "\n" );
|
||||
libblis_test_fprintf_c( os, "level-3 register blocksizes \n" );
|
||||
libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n",
|
||||
BLIS_DEFAULT_MR_S,
|
||||
@@ -502,6 +519,18 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
|
||||
BLIS_DEFAULT_NR_C,
|
||||
BLIS_DEFAULT_NR_Z );
|
||||
libblis_test_fprintf_c( os, "\n" );
|
||||
libblis_test_fprintf_c( os, "level-3 register blksz exts s d c z \n" );
|
||||
libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n",
|
||||
BLIS_EXTEND_MR_S,
|
||||
BLIS_EXTEND_MR_D,
|
||||
BLIS_EXTEND_MR_C,
|
||||
BLIS_EXTEND_MR_Z );
|
||||
libblis_test_fprintf_c( os, " n dimension %5u %5u %5u %5u\n",
|
||||
BLIS_EXTEND_NR_S,
|
||||
BLIS_EXTEND_NR_D,
|
||||
BLIS_EXTEND_NR_C,
|
||||
BLIS_EXTEND_NR_Z );
|
||||
libblis_test_fprintf_c( os, "\n" );
|
||||
libblis_test_fprintf_c( os, "level-3 packing duplication \n" );
|
||||
libblis_test_fprintf_c( os, " dupl. factors for B %5u %5u %5u %5u\n",
|
||||
BLIS_DEFAULT_NUM_DUPL_S,
|
||||
|
||||
Reference in New Issue
Block a user