Allow ldim of packed micro-panels != MR, NR.

Details:
- Made substantial changes throughout the framework to decouple the leading
  dimension (row or column stride) used within each packed micro-panel from
  the corresponding register blocksize. It appears advantageous on some
  systems to use, for example, packed micro-panels of A where the column
  stride is greater than MR (whereas previously it was always equal to MR).
- Changes include:
  - Added BLIS_EXTEND_[MNK]R_? macros, which specify how much extra padding
    to use when packing micro-panels of A and B.
  - Adjusted all packing routines and macro-kernels to use PACKMR and PACKNR
    where appropriate, instead of MR and NR.
  - Added pd field (panel dimension) to obj_t.
  - New interface to bli_packm_cntl_obj_create().
  - Renamed bli_obj_packed_length()/_width() macros to
    bli_obj_padded_length()/_width().
  - Removed local #defines for cache/register blocksizes in level-3 *_cntl.c.
  - Print out new cache and register blocksize extensions in test suite.
- Also added new BLIS_EXTEND_[MNK]C_? macros for future use in using a larger
  blocksize for edge cases, which can improve performance at the margins.
This commit is contained in:
Field G. Van Zee
2013-04-21 15:00:24 -05:00
parent 59fca58dbe
commit b6ef84fad1
51 changed files with 1669 additions and 799 deletions

View File

@@ -77,7 +77,29 @@
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 8192
// -- Default register blocksizes for inner kernel --
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above.
// NOTE: These values are not yet used.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
@@ -104,6 +126,31 @@
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
// -- Register blocksize extensions (for packed micro-panels) --
// NOTE: These register blocksize "extensions" determine whether the
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
@@ -162,7 +209,7 @@
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_NC_Z 1000

137
config/flame/bli_config.h Normal file
View File

@@ -0,0 +1,137 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H
// -- OPERATING SYSTEM ---------------------------------------------------------
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
#define BLIS_NUM_FP_TYPES 4
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
// -- MULTITHREADING -----------------------------------------------------------
// The maximum number of BLIS threads that will run concurrently.
#define BLIS_MAX_NUM_THREADS 24
// -- MEMORY ALLOCATION --------------------------------------------------------
// -- Contiguous (static) memory allocator --
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
// contiguous memory pools.
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_KC_X_NC_BLOCKS 1
#define BLIS_NUM_MC_X_NC_BLOCKS 0
// The maximum preload byte offset is used to pad the end of the contiguous
// memory pools so that the micro-kernel, when computing with the end of the
// last block, can exceed the bounds of the usable portion of the memory
// region without causing a segmentation fault.
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
// -- Memory alignment --
// It is sometimes useful to define the various memory alignments in terms
// of some other characteristics of the system, such as the cache line size
// and the page size.
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE 16
// Alignment size used when allocating memory dynamically from the operating
// system (eg: posix_memalign()). To disable heap alignment and just use
// malloc() instead, set this to 1.
#define BLIS_HEAP_ADDR_ALIGN_SIZE 16
// Alignment size used when sizing leading dimensions of dynamically
// allocated memory.
#define BLIS_HEAP_STRIDE_ALIGN_SIZE 16
// Alignment size used when allocating entire blocks of contiguous memory
// from the contiguous memory allocator.
#define BLIS_CONTIG_ADDR_ALIGN_SIZE 16
// Alignment size used when sizing strides (eg: of packed micro-panels)
// within a block of contiguous memory.
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 16
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
// Basic (homogeneous) datatype support always enabled.
// Enable mixed domain operations?
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
// Enable extra mixed precision operations?
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
// Stay initialized after auto-initialization, unless and until the user
// explicitly calls bli_finalize().
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
// Enable the BLAS compatibility layer?
#define BLIS_ENABLE_BLAS2BLIS
// Fortran-77 name-mangling macros.
#define PASTEF77(ch1,name) ch1 ## name ## _
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
#endif

335
config/flame/bli_kernel.h Normal file
View File

@@ -0,0 +1,335 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (3) KC must be a multiple of
// (a) MR and
// (b) NR
// for triangular operations such as trmm and trsm.
//
// NOTE: For BLIS libraries built on block-panel macro-kernels, constraint (3b)
// is relaxed. In this case, (3a) is needed for operations where matrix A is
// triangular (trmm, trsm), because we want the diagonal offset of any packed
// panel of matrix A to be a multiple of MR. If, instead, the library were to
// be built on panel-block macro-kernels, the matrix with structure would be
// on the right, rather than the left, and thus it would be constraint (3b)
// that would be needed instead of (3a).
//
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 8192
#define BLIS_DEFAULT_MC_D 128
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MC_C 128
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 2048
//#define BLIS_EDGECASE_HACK 1
// -- Default register blocksizes for inner kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 2
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
#define BLIS_DEFAULT_MR_Z 8
#define BLIS_DEFAULT_NR_Z 4
// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount
// of duplication needed when configuring level-3 macro-kernels that
// copy and duplicate elements of B to a temporary duplication buffer
// (so that element-wise vector multiplication and addition instructions
// can be used).
#define BLIS_NUM_ELEM_PER_REG_S 4
#define BLIS_NUM_ELEM_PER_REG_D 2
#define BLIS_NUM_ELEM_PER_REG_C 2
#define BLIS_NUM_ELEM_PER_REG_Z 1
// -- Default switch for duplication of B --
// NOTE: Setting these values to 1 disables duplication. Any value
// d > 1 results in d-1 duplicates being created within a special
// macro-kernel buffer of dimension k x NR*d.
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
#define BLIS_DEFAULT_NUM_DUPL_S 1
#define BLIS_DEFAULT_NUM_DUPL_D 2
#define BLIS_DEFAULT_NUM_DUPL_C 1
#define BLIS_DEFAULT_NUM_DUPL_Z 1
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
// used by certain blocked variants. But when they *are* used, they MUST
// be an integer multiple of NR!
#define BLIS_DEFAULT_NI_FAC 16
#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSING_FACTOR_S 8
#define BLIS_DEFAULT_FUSING_FACTOR_D 4
#define BLIS_DEFAULT_FUSING_FACTOR_C 4
#define BLIS_DEFAULT_FUSING_FACTOR_Z 2
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
#include "bli_gemm_opt_d4x2.h"
// -- dupl --
#define DUPL_KERNEL dupl_unb_var1
// -- gemm --
//#define GEMM_UKERNEL gemm_ref_4x4
#define GEMM_UKERNEL gemm_opt_d4x2
// -- trsm-related --
//#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_4x4
//#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_4x4
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
//#define TRSM_L_UKERNEL trsm_l_ref_4x4
//#define TRSM_U_UKERNEL trsm_u_ref_4x4
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_unb_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copynzv --
#define COPYNZV_KERNEL copynzv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

1
config/flame/kernels Symbolic link
View File

@@ -0,0 +1 @@
../../kernels/x86/3/

104
config/flame/make_defs.mk Normal file
View File

@@ -0,0 +1,104 @@
#!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2013, The University of Texas
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name of The University of Texas nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Only include this block of code once.
ifndef MAKE_DEFS_MK_INCLUDED
MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
CC := gcc
# Enable IEEE Standard 1003.1-2001 (POSIX.1-2001) definitions.
# NOTE: This is needed to expose the declaration of posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 # -fopenmp -pg
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -O2 -malign-double -funroll-loops
CVECFLAGS := -msse3 -march=native # -mfpmath=sse
# Aggregate all of the flags into two groups: one for optimizable code, and
# one for code that should not be optimized.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
LDFLAGS :=
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
endif

View File

@@ -77,7 +77,29 @@
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 2048
// -- Default register blocksizes for inner kernel --
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above.
// NOTE: These values are not yet used.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
@@ -104,6 +126,31 @@
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
// -- Register blocksize extensions (for packed micro-panels) --
// NOTE: These register blocksize "extensions" determine whether the
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Number of elements per vector register --
// NOTE: These constants are typically only used to determine the amount

View File

@@ -175,7 +175,7 @@ void bli_packv_init_pack( pack_t pack_schema,
}
// Save the padded (packed) dimensions into the packed object.
bli_obj_set_packed_dims( m_p_pad, 1, *p );
bli_obj_set_padded_dims( m_p_pad, 1, *p );
// Grab the buffer address from the mem_t object and copy it to the
// main object buffer field. (Sometimes this buffer address will be
@@ -193,7 +193,7 @@ void bli_packv_init_pack( pack_t pack_schema,
// how much space beyond the vector would need to be zero-padded, if
// zero-padding was needed.
rs_p = 1;
cs_p = bli_obj_packed_length( *p );
cs_p = bli_obj_padded_length( *p );
bli_obj_set_incs( rs_p, cs_p, *p );
}

View File

@@ -48,7 +48,8 @@ typedef void (*FUNCPTR_T)(
dim_t n_max,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p
void* p, inc_t rs_p, inc_t cs_p,
dim_t pd_p, inc_t ps_p
);
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var2);
@@ -68,8 +69,8 @@ void bli_packm_blk_var2( obj_t* beta,
dim_t m_p = bli_obj_length( *p );
dim_t n_p = bli_obj_width( *p );
dim_t m_max_p = bli_obj_packed_length( *p );
dim_t n_max_p = bli_obj_packed_width( *p );
dim_t m_max_p = bli_obj_padded_length( *p );
dim_t n_max_p = bli_obj_padded_width( *p );
void* buf_c = bli_obj_buffer_at_off( *c );
inc_t rs_c = bli_obj_row_stride( *c );
@@ -78,6 +79,7 @@ void bli_packm_blk_var2( obj_t* beta,
void* buf_p = bli_obj_buffer_at_off( *p );
inc_t rs_p = bli_obj_row_stride( *p );
inc_t cs_p = bli_obj_col_stride( *p );
dim_t pd_p = bli_obj_panel_dim( *p );
inc_t ps_p = bli_obj_panel_stride( *p );
void* buf_beta = bli_obj_scalar_buffer( dt_cp, *beta );
@@ -100,7 +102,8 @@ void bli_packm_blk_var2( obj_t* beta,
n_max_p,
buf_beta,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p, ps_p );
buf_p, rs_p, cs_p,
pd_p, ps_p );
}
@@ -119,7 +122,8 @@ void PASTEMAC(ch,varname )( \
dim_t n_max, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p \
) \
{ \
ctype* restrict beta_cast = beta; \
@@ -190,12 +194,12 @@ void PASTEMAC(ch,varname )( \
/* Prepare to pack to column panels. */ \
iter_dim = n; \
panel_len = m; \
panel_dim = rs_p; \
panel_dim = pd_p; \
incc = cs_c; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t)panel_dim; \
ldp = panel_dim; \
ldp = rs_p; \
m_panel = &m; \
n_panel = &panel_dim_i; \
m_panel_max = m_max; \
@@ -206,12 +210,12 @@ void PASTEMAC(ch,varname )( \
/* Prepare to pack to row panels. */ \
iter_dim = m; \
panel_len = n; \
panel_dim = cs_p; \
panel_dim = pd_p; \
incc = rs_c; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim; \
ldp = panel_dim; \
ldp = cs_p; \
m_panel = &panel_dim_i; \
n_panel = &n; \
m_panel_max = panel_dim; \
@@ -433,7 +437,7 @@ void PASTEMAC(ch,varname )( \
/*
if ( rs_p == 1 ) \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: a copied", m_panel_max, n_panel_max, \
p_begin, 1, panel_dim, "%4.1f", "" ); \
p_begin, 1, cs_p, "%4.1f", "" ); \
if ( cs_p == 1 ) \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var2: b copied", m_panel_max, n_panel_max, \
p_begin, panel_dim, 1, "%6.3f", "" ); \

View File

@@ -52,7 +52,8 @@ void PASTEMAC(ch,varname)( \
dim_t n_max, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p \
);
INSERT_GENTPROT_BASIC( packm_blk_var2 )

View File

@@ -51,7 +51,8 @@ typedef void (*FUNCPTR_T)(
dim_t n_max,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p
void* p, inc_t rs_p, inc_t cs_p,
dim_t pd_p, inc_t ps_p
);
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3);
@@ -74,8 +75,8 @@ void bli_packm_blk_var3( obj_t* beta,
dim_t m_p = bli_obj_length( *p );
dim_t n_p = bli_obj_width( *p );
dim_t m_max_p = bli_obj_packed_length( *p );
dim_t n_max_p = bli_obj_packed_width( *p );
dim_t m_max_p = bli_obj_padded_length( *p );
dim_t n_max_p = bli_obj_padded_width( *p );
void* buf_c = bli_obj_buffer_at_off( *c );
inc_t rs_c = bli_obj_row_stride( *c );
@@ -84,6 +85,7 @@ void bli_packm_blk_var3( obj_t* beta,
void* buf_p = bli_obj_buffer_at_off( *p );
inc_t rs_p = bli_obj_row_stride( *p );
inc_t cs_p = bli_obj_col_stride( *p );
dim_t pd_p = bli_obj_panel_dim( *p );
inc_t ps_p = bli_obj_panel_stride( *p );
void* buf_beta = bli_obj_scalar_buffer( dt_cp, *beta );
@@ -109,7 +111,8 @@ void bli_packm_blk_var3( obj_t* beta,
n_max_p,
buf_beta,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p, ps_p );
buf_p, rs_p, cs_p,
pd_p, ps_p );
}
@@ -131,7 +134,8 @@ void PASTEMAC(ch,varname )( \
dim_t n_max, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p \
) \
{ \
ctype* restrict beta_cast = beta; \
@@ -159,7 +163,7 @@ void PASTEMAC(ch,varname )( \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t incc, ldc; \
inc_t p_inc; \
inc_t ldp, p_inc; \
dim_t* m_panel; \
dim_t* n_panel; \
dim_t m_panel_use; \
@@ -199,11 +203,12 @@ void PASTEMAC(ch,varname )( \
iter_dim = n; \
panel_len = m; \
panel_len_max = m_max; \
panel_dim = rs_p; \
panel_dim = pd_p; \
incc = cs_c; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t)panel_dim; \
ldp = rs_p; \
m_panel = &m; \
n_panel = &panel_dim_i; \
} \
@@ -213,11 +218,12 @@ void PASTEMAC(ch,varname )( \
iter_dim = m; \
panel_len = n; \
panel_len_max = n_max; \
panel_dim = cs_p; \
panel_dim = pd_p; \
incc = rs_c; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim; \
ldp = cs_p; \
m_panel = &panel_dim_i; \
n_panel = &n; \
} \
@@ -303,7 +309,7 @@ void PASTEMAC(ch,varname )( \
panel_len_i, \
beta_cast, \
c_use, incc, ldc, \
p_use, panel_dim ); \
p_use, ldp ); \
\
/* If the diagonal of C is implicitly unit, set the diagonal of
the packed panel to unit. */ \
@@ -351,7 +357,7 @@ void PASTEMAC(ch,varname )( \
p_use, rs_p, cs_p ); \
} \
\
p_inc = panel_dim * panel_len_max_i; \
p_inc = ldp * panel_len_max_i; \
} \
else \
{ \
@@ -369,9 +375,9 @@ void PASTEMAC(ch,varname )( \
panel_len_i, \
beta_cast, \
c_use, incc, ldc, \
p_use, panel_dim ); \
p_use, ldp ); \
\
p_inc = panel_dim * panel_len_max_i; \
p_inc = ldp * panel_len_max_i; \
} \
\
/* If necessary, zero-pad at the edge of the panel dimension (ie: the
@@ -382,7 +388,7 @@ void PASTEMAC(ch,varname )( \
dim_t m_edge = panel_dim - i; \
dim_t n_edge = panel_len_max_i; \
inc_t rs_pe = 1; \
inc_t cs_pe = panel_dim; \
inc_t cs_pe = ldp; \
ctype* p_edge = p_begin + (i )*rs_pe; \
\
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
@@ -402,7 +408,7 @@ void PASTEMAC(ch,varname )( \
dim_t m_edge = panel_dim; \
dim_t n_edge = panel_len_max_i - j; \
inc_t rs_pe = 1; \
inc_t cs_pe = panel_dim; \
inc_t cs_pe = ldp; \
ctype* p_edge = p_begin + (j )*cs_pe; \
\
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
@@ -427,7 +433,7 @@ void PASTEMAC(ch,varname )( \
dim_t m_br = panel_dim - i; \
dim_t n_br = panel_len_max_i - j; \
inc_t rs_pe = 1; \
inc_t cs_pe = panel_dim; \
inc_t cs_pe = ldp; \
ctype* p_edge = p_begin + (i )*rs_pe + (j )*cs_pe; \
\
PASTEMAC2(ch,ch,setd_unb_var1)( 0, \

View File

@@ -55,7 +55,8 @@ void PASTEMAC(ch,varname)( \
dim_t n_max, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p \
);
INSERT_GENTPROT_BASIC( packm_blk_var3 )

View File

@@ -50,6 +50,9 @@ packm_t* packm_cntl_scale;
blksz_t* packm_mult_ldim;
blksz_t* packm_mult_nvec;
blksz_t* packm_mult_mext;
blksz_t* packm_mult_next;
void bli_packm_cntl_init()
{
// Create blocksize objects for m and n register blocking. We will attach
@@ -70,6 +73,11 @@ void bli_packm_cntl_init()
BLIS_DEFAULT_NR_C,
BLIS_DEFAULT_NR_Z );
// Create blocksize extensions that simply contain zero, as these
// fields are not used except by level-3 operations.
packm_mult_mext = bli_blksz_obj_create( 0, 0, 0, 0 );
packm_mult_next = bli_blksz_obj_create( 0, 0, 0, 0 );
// Generally speaking, the BLIS_PACKED_ROWS and BLIS_PACKED_COLUMNS
// are used by the level-2 operations, and thus densification is not
// necessary. These schemas amount to simple copies to row or column
@@ -89,7 +97,9 @@ void bli_packm_cntl_init()
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1, // When packing to rows:
packm_mult_nvec, // - nvec multiple is used for m dimension
packm_mult_mext, // - m extension is zero / unused
packm_mult_ldim, // - ldim multiple is used for n dimension
packm_mult_next, // - n extension is zero / unused
FALSE, // do NOT scale
FALSE, // do NOT densify structure
FALSE, // do NOT invert diagonal
@@ -102,7 +112,9 @@ void bli_packm_cntl_init()
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1, // When packing to rows:
packm_mult_nvec, // - nvec multiple is used for m dimension
packm_mult_mext, // - m extension is zero / unused
packm_mult_ldim, // - ldim multiple is used for n dimension
packm_mult_next, // - n extension is zero / unused
TRUE, // do scale
FALSE, // do NOT densify structure
FALSE, // do NOT invert diagonal
@@ -118,7 +130,9 @@ void bli_packm_cntl_init()
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1, // When packing to columns:
packm_mult_ldim, // - ldim multiple is used for m dimension
packm_mult_mext, // - m extension is zero / unused
packm_mult_nvec, // - nvec multiple is used for n dimension
packm_mult_next, // - n extension is zero / unused
FALSE, // do NOT scale
FALSE, // do NOT densify structure
FALSE, // do NOT invert diagonal
@@ -131,7 +145,9 @@ void bli_packm_cntl_init()
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1, // When packing to columns:
packm_mult_ldim, // - ldim multiple is used for m dimension
packm_mult_mext, // - m extension is zero / unused
packm_mult_nvec, // - nvec multiple is used for n dimension
packm_mult_next, // - n extension is zero / unused
TRUE, // do scale
FALSE, // do NOT densify structure
FALSE, // do NOT invert diagonal
@@ -141,64 +157,6 @@ void bli_packm_cntl_init()
BLIS_BUFFER_FOR_GEN_USE );
// Create control trees to pack by row panels (with and without scaling).
packm_cntl_rpn_noscale
=
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1, // When packing to row panels:
packm_mult_nvec, // - nvec multiple is used for panel length
packm_mult_ldim, // - ldim multiple is used for panel width
FALSE, // do NOT scale
TRUE, // densify structure
FALSE, // do NOT invert diagonal
FALSE, // do NOT iterate backwards if upper
FALSE, // do NOT iterate backwards if lower
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_GEN_USE );
packm_cntl_rpn_scale
=
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1, // When packing to row panels:
packm_mult_nvec, // - nvec multiple is used for panel length
packm_mult_ldim, // - ldim multiple is used for panel width
TRUE, // do scale
TRUE, // densify structure
FALSE, // do NOT invert diagonal
FALSE, // do NOT iterate backwards if upper
FALSE, // do NOT iterate backwards if lower
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_GEN_USE );
// Create control trees to pack by column panels (with and without scaling).
packm_cntl_cpn_noscale
=
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1, // When packing to column panels:
packm_mult_ldim, // - ldim multiple is used for panel length
packm_mult_nvec, // - nvec multiple is used for panel width
FALSE, // do NOT scale
TRUE, // densify structure
FALSE, // do NOT invert diagonal
FALSE, // do NOT iterate backwards if upper
FALSE, // do NOT iterate backwards if lower
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_GEN_USE );
packm_cntl_cpn_scale
=
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1, // When packing to column panels:
packm_mult_ldim, // - ldim multiple is used for panel length
packm_mult_nvec, // - nvec multiple is used for panel width
TRUE, // do scale
TRUE, // densify structure
FALSE, // do NOT invert diagonal
FALSE, // do NOT iterate backwards if upper
FALSE, // do NOT iterate backwards if lower
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_GEN_USE );
// Set defaults when we don't care whether the packing is by rows or
// by columns.
packm_cntl_noscale = packm_cntl_col_noscale;
@@ -212,19 +170,16 @@ void bli_packm_cntl_finalize()
bli_cntl_obj_free( packm_cntl_col_noscale );
bli_cntl_obj_free( packm_cntl_col_scale );
bli_cntl_obj_free( packm_cntl_rpn_noscale );
bli_cntl_obj_free( packm_cntl_rpn_scale );
bli_cntl_obj_free( packm_cntl_cpn_noscale );
bli_cntl_obj_free( packm_cntl_cpn_scale );
bli_blksz_obj_free( packm_mult_ldim );
bli_blksz_obj_free( packm_mult_nvec );
}
packm_t* bli_packm_cntl_obj_create( impl_t impl_type,
varnum_t var_num,
blksz_t* mult_m,
blksz_t* mult_n,
blksz_t* mr_def,
blksz_t* mr_ext,
blksz_t* nr_def,
blksz_t* nr_ext,
bool_t does_scale,
bool_t does_densify,
bool_t does_invert_diag,
@@ -239,8 +194,10 @@ packm_t* bli_packm_cntl_obj_create( impl_t impl_type,
cntl->impl_type = impl_type;
cntl->var_num = var_num;
cntl->mult_m = mult_m;
cntl->mult_n = mult_n;
cntl->mr_def = mr_def;
cntl->mr_ext = mr_ext;
cntl->nr_def = nr_def;
cntl->nr_ext = nr_ext;
cntl->does_scale = does_scale;
cntl->does_densify = does_densify;
cntl->does_invert_diag = does_invert_diag;
@@ -255,8 +212,10 @@ packm_t* bli_packm_cntl_obj_create( impl_t impl_type,
void bli_packm_cntl_obj_init( packm_t* cntl,
impl_t impl_type,
varnum_t var_num,
blksz_t* mult_m,
blksz_t* mult_n,
blksz_t* mr_def,
blksz_t* mr_ext,
blksz_t* nr_def,
blksz_t* nr_ext,
bool_t does_scale,
bool_t does_densify,
bool_t does_invert_diag,
@@ -267,8 +226,10 @@ void bli_packm_cntl_obj_init( packm_t* cntl,
{
cntl->impl_type = impl_type;
cntl->var_num = var_num;
cntl->mult_m = mult_m;
cntl->mult_n = mult_n;
cntl->mr_def = mr_def;
cntl->mr_ext = mr_ext;
cntl->nr_def = nr_def;
cntl->nr_ext = nr_ext;
cntl->does_scale = does_scale;
cntl->does_densify = does_densify;
cntl->does_invert_diag = does_invert_diag;

View File

@@ -36,8 +36,10 @@ struct packm_s
{
impl_t impl_type;
varnum_t var_num;
blksz_t* mult_m;
blksz_t* mult_n;
blksz_t* mr_def;
blksz_t* mr_ext;
blksz_t* nr_def;
blksz_t* nr_ext;
bool_t does_scale;
bool_t does_densify;
bool_t does_invert_diag;
@@ -48,8 +50,10 @@ struct packm_s
};
typedef struct packm_s packm_t;
#define cntl_mult_m( cntl ) cntl->mult_m
#define cntl_mult_n( cntl ) cntl->mult_n
#define cntl_mr_def( cntl ) cntl->mr_def
#define cntl_mr_ext( cntl ) cntl->mr_ext
#define cntl_nr_def( cntl ) cntl->nr_def
#define cntl_nr_ext( cntl ) cntl->nr_ext
#define cntl_does_scale( cntl ) cntl->does_scale
#define cntl_does_densify( cntl ) cntl->does_densify
@@ -71,8 +75,10 @@ void bli_packm_cntl_init( void );
void bli_packm_cntl_finalize( void );
packm_t* bli_packm_cntl_obj_create( impl_t impl_type,
varnum_t var_num,
blksz_t* mult_m,
blksz_t* mult_n,
blksz_t* mr_def,
blksz_t* mr_ext,
blksz_t* nr_def,
blksz_t* nr_ext,
bool_t does_scale,
bool_t does_densify,
bool_t does_invert_diag,
@@ -83,8 +89,10 @@ packm_t* bli_packm_cntl_obj_create( impl_t impl_type,
void bli_packm_cntl_obj_init( packm_t* cntl,
impl_t impl_type,
varnum_t var_num,
blksz_t* mult_m,
blksz_t* mult_n,
blksz_t* mr_def,
blksz_t* mr_ext,
blksz_t* nr_def,
blksz_t* nr_ext,
bool_t does_scale,
bool_t does_densify,
bool_t does_invert_diag,

View File

@@ -50,8 +50,10 @@ void bli_packm_init( obj_t* a,
packord_t pack_ord_if_up;
packord_t pack_ord_if_lo;
packbuf_t pack_buf_type;
blksz_t* mult_m;
blksz_t* mult_n;
blksz_t* mr_def;
blksz_t* mr_ext;
blksz_t* nr_def;
blksz_t* nr_ext;
obj_t c;
// Check parameters.
@@ -126,8 +128,10 @@ void bli_packm_init( obj_t* a,
needs_densify = cntl_does_densify( cntl );
pack_schema = cntl_pack_schema( cntl );
pack_buf_type = cntl_pack_buf_type( cntl );
mult_m = cntl_mult_m( cntl );
mult_n = cntl_mult_n( cntl );
mr_def = cntl_mr_def( cntl );
mr_ext = cntl_mr_ext( cntl );
nr_def = cntl_nr_def( cntl );
nr_ext = cntl_nr_ext( cntl );
if ( cntl_does_invert_diag( cntl ) ) invert_diag = BLIS_INVERT_DIAG;
else invert_diag = BLIS_NO_INVERT_DIAG;
@@ -145,8 +149,8 @@ void bli_packm_init( obj_t* a,
pack_ord_if_up,
pack_ord_if_lo,
pack_buf_type,
mult_m,
mult_n,
mr_def, mr_ext,
nr_def, nr_ext,
&c,
p );
@@ -160,8 +164,10 @@ void bli_packm_init_pack( bool_t densify,
packord_t pack_ord_if_up,
packord_t pack_ord_if_lo,
packbuf_t pack_buf_type,
blksz_t* mult_m,
blksz_t* mult_n,
blksz_t* mr_def,
blksz_t* mr_ext,
blksz_t* nr_def,
blksz_t* nr_ext,
obj_t* c,
obj_t* p )
{
@@ -169,8 +175,13 @@ void bli_packm_init_pack( bool_t densify,
trans_t transc = bli_obj_trans_status( *c );
dim_t m_c = bli_obj_length( *c );
dim_t n_c = bli_obj_width( *c );
dim_t mult_m_dim = bli_blksz_for_type( datatype, mult_m );
dim_t mult_n_dim = bli_blksz_for_type( datatype, mult_n );
dim_t mr_def_dim = bli_blksz_for_type( datatype, mr_def );
dim_t mr_ext_dim = bli_blksz_for_type( datatype, mr_ext );
dim_t nr_def_dim = bli_blksz_for_type( datatype, nr_def );
dim_t nr_ext_dim = bli_blksz_for_type( datatype, nr_ext );
dim_t mr_pack_dim = mr_def_dim + mr_ext_dim;
dim_t nr_pack_dim = nr_def_dim + nr_ext_dim;
mem_t* mem_p;
dim_t m_p_pad, n_p_pad;
@@ -227,13 +238,13 @@ void bli_packm_init_pack( bool_t densify,
// in p) and aligning them to the dimension multiples (typically equal
// to register blocksizes). This does waste a little bit of space for
// level-2 operations, but that's okay with us.
m_p_pad = bli_align_dim_to_mult( bli_obj_length( *p ), mult_m_dim );
n_p_pad = bli_align_dim_to_mult( bli_obj_width( *p ), mult_n_dim );
m_p_pad = bli_align_dim_to_mult( bli_obj_length( *p ), mr_def_dim );
n_p_pad = bli_align_dim_to_mult( bli_obj_width( *p ), nr_def_dim );
// Save the padded dimensions into the packed object. It is important
// to save these dimensions since they represent the actual dimensions
// of the zero-padded matrix.
bli_obj_set_packed_dims( m_p_pad, n_p_pad, *p );
bli_obj_set_padded_dims( m_p_pad, n_p_pad, *p );
// Now we prepare to compute strides, align them, and compute the
// total number of bytes needed for the packed buffer. After that,
@@ -294,13 +305,13 @@ void bli_packm_init_pack( bool_t densify,
dim_t ps_p;
// The maximum panel length (for each datatype) should be equal to
// the m dimension multiple.
m_panel = mult_m_dim;
// the register blocksize in the m dimension.
m_panel = mr_def_dim;
// The "column stride" of a row panel packed object is interpreted as
// the column stride WITHIN a panel. Thus, this is equal to the panel
// length.
cs_p = m_panel;
// dimension plus an extension (which may be zero).
cs_p = mr_pack_dim;
// The "row stride" of a row panel packed object is interpreted
// as the row stride WITHIN a panel. Thus, it is unit.
@@ -319,8 +330,9 @@ void bli_packm_init_pack( bool_t densify,
ps_p = bli_align_dim_to_size( ps_p, elem_size_p,
BLIS_CONTIG_STRIDE_ALIGN_SIZE );
// Store the strides in p.
// Store the strides and panel dimension in p.
bli_obj_set_incs( rs_p, cs_p, *p );
bli_obj_set_panel_dim( m_panel, *p );
bli_obj_set_panel_stride( ps_p, *p );
// Compute the size of the packed buffer.
@@ -332,13 +344,13 @@ void bli_packm_init_pack( bool_t densify,
dim_t ps_p;
// The maximum panel width (for each datatype) should be equal to
// the n dimension multiple.
n_panel = mult_n_dim;
// the register blocksize in the n dimension.
n_panel = nr_def_dim;
// The "row stride" of a column panel packed object is interpreted as
// the row stride WITHIN a panel. Thus, it is equal to the panel
// width.
rs_p = n_panel;
// the row stride WITHIN a panel. Thus, this is equal to the panel
// dimension plus an extension (which may be zero).
rs_p = nr_pack_dim;
// The "column stride" of a column panel packed object is interpreted
// as the column stride WITHIN a panel. Thus, it is unit.
@@ -357,8 +369,9 @@ void bli_packm_init_pack( bool_t densify,
ps_p = bli_align_dim_to_size( ps_p, elem_size_p,
BLIS_CONTIG_STRIDE_ALIGN_SIZE );
// Store the strides in p.
// Store the strides and panel dimension in p.
bli_obj_set_incs( rs_p, cs_p, *p );
bli_obj_set_panel_dim( n_panel, *p );
bli_obj_set_panel_stride( ps_p, *p );
// Compute the size of the packed buffer.

View File

@@ -42,8 +42,10 @@ void bli_packm_init_pack( bool_t densify,
packord_t pack_ord_if_up,
packord_t pack_ord_if_lo,
packbuf_t pack_buf_type,
blksz_t* mult_m,
blksz_t* mult_n,
blksz_t* mr_def,
blksz_t* mr_ext,
blksz_t* nr_def,
blksz_t* nr_ext,
obj_t* c,
obj_t* p );

View File

@@ -76,7 +76,7 @@ void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
// Modify offsets and dimensions of requested partition.
bli_obj_set_dims( b, n, *sub_obj );
// Tweak the packed length of the subpartition to trick the underlying
// Tweak the padded length of the subpartition to trick the underlying
// implementation into only zero-padding for the narrow submatrix of
// interest. Usually, the value we want is b (for non-edge cases), but
// at the edges, we want the remainder of the mem_t region in the m
@@ -86,13 +86,13 @@ void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
// b for the edge iteration). In these cases, we arrive at the new
// padded length by simply subtracting off i.
{
dim_t m_pack_max = bli_obj_packed_length( *sub_obj );
dim_t m_pack_max = bli_obj_padded_length( *sub_obj );
dim_t m_pack_cur;
if ( i + b == m ) m_pack_cur = m_pack_max - i;
else m_pack_cur = b;
bli_obj_set_packed_length( m_pack_cur, *sub_obj );
bli_obj_set_padded_length( m_pack_cur, *sub_obj );
}
// Translate the desired offsets to a panel offset and adjust the
@@ -152,7 +152,7 @@ void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
// Modify offsets and dimensions of requested partition.
bli_obj_set_dims( m, b, *sub_obj );
// Tweak the packed width of the subpartition to trick the underlying
// Tweak the padded width of the subpartition to trick the underlying
// implementation into only zero-padding for the narrow submatrix of
// interest. Usually, the value we want is b (for non-edge cases), but
// at the edges, we want the remainder of the mem_t region in the n
@@ -162,13 +162,13 @@ void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
// b for the edge iteration). In these cases, we arrive at the new
// padded width by simply subtracting off j.
{
dim_t n_pack_max = bli_obj_packed_width( *sub_obj );
dim_t n_pack_max = bli_obj_padded_width( *sub_obj );
dim_t n_pack_cur;
if ( j + b == n ) n_pack_cur = n_pack_max - j;
else n_pack_cur = b;
bli_obj_set_packed_width( n_pack_cur, *sub_obj );
bli_obj_set_padded_width( n_pack_cur, *sub_obj );
}
// Translate the desired offsets to a panel offset and adjust the

View File

@@ -70,8 +70,8 @@ void bli_packm_unb_var1( obj_t* beta,
dim_t m_p = bli_obj_length( *p );
dim_t n_p = bli_obj_width( *p );
dim_t m_max_p = bli_obj_packed_length( *p );
dim_t n_max_p = bli_obj_packed_width( *p );
dim_t m_max_p = bli_obj_padded_length( *p );
dim_t n_max_p = bli_obj_padded_width( *p );
void* buf_c = bli_obj_buffer_at_off( *c );
inc_t rs_c = bli_obj_row_stride( *c );

View File

@@ -71,8 +71,8 @@ void bli_packm_blk_var1( obj_t* beta,
dim_t m_p = bli_obj_length( *p );
dim_t n_p = bli_obj_width( *p );
dim_t m_max_p = bli_obj_packed_length( *p );
dim_t n_max_p = bli_obj_packed_width( *p );
dim_t m_max_p = bli_obj_padded_length( *p );
dim_t n_max_p = bli_obj_padded_width( *p );
void* buf_c = bli_obj_buffer_at_off( *c );
inc_t rs_c = bli_obj_row_stride( *c );

View File

@@ -54,87 +54,64 @@ blksz_t* gemm_kc;
blksz_t* gemm_mr;
blksz_t* gemm_nr;
blksz_t* gemm_kr;
blksz_t* gemm_extmr;
blksz_t* gemm_extnr;
blksz_t* gemm_extkr;
blksz_t* gemm_ni;
// Cache blocksizes.
#define BLIS_GEMM_KC_S BLIS_DEFAULT_KC_S
#define BLIS_GEMM_KC_D BLIS_DEFAULT_KC_D
#define BLIS_GEMM_KC_C BLIS_DEFAULT_KC_C
#define BLIS_GEMM_KC_Z BLIS_DEFAULT_KC_Z
#define BLIS_GEMM_MC_S BLIS_DEFAULT_MC_S
#define BLIS_GEMM_MC_D BLIS_DEFAULT_MC_D
#define BLIS_GEMM_MC_C BLIS_DEFAULT_MC_C
#define BLIS_GEMM_MC_Z BLIS_DEFAULT_MC_Z
#define BLIS_GEMM_NC_S BLIS_DEFAULT_NC_S
#define BLIS_GEMM_NC_D BLIS_DEFAULT_NC_D
#define BLIS_GEMM_NC_C BLIS_DEFAULT_NC_C
#define BLIS_GEMM_NC_Z BLIS_DEFAULT_NC_Z
// Register blocking
#define BLIS_GEMM_KR_S BLIS_DEFAULT_KR_S
#define BLIS_GEMM_KR_D BLIS_DEFAULT_KR_D
#define BLIS_GEMM_KR_C BLIS_DEFAULT_KR_C
#define BLIS_GEMM_KR_Z BLIS_DEFAULT_KR_Z
#define BLIS_GEMM_MR_S BLIS_DEFAULT_MR_S
#define BLIS_GEMM_MR_D BLIS_DEFAULT_MR_D
#define BLIS_GEMM_MR_C BLIS_DEFAULT_MR_C
#define BLIS_GEMM_MR_Z BLIS_DEFAULT_MR_Z
#define BLIS_GEMM_NR_S BLIS_DEFAULT_NR_S
#define BLIS_GEMM_NR_D BLIS_DEFAULT_NR_D
#define BLIS_GEMM_NR_C BLIS_DEFAULT_NR_C
#define BLIS_GEMM_NR_Z BLIS_DEFAULT_NR_Z
// Incremental pack blocking
#define BLIS_GEMM_NI_S BLIS_DEFAULT_NI_S
#define BLIS_GEMM_NI_D BLIS_DEFAULT_NI_D
#define BLIS_GEMM_NI_C BLIS_DEFAULT_NI_C
#define BLIS_GEMM_NI_Z BLIS_DEFAULT_NI_Z
void bli_gemm_cntl_init()
{
// Create blocksize objects for each dimension.
gemm_mc = bli_blksz_obj_create( BLIS_GEMM_MC_S,
BLIS_GEMM_MC_D,
BLIS_GEMM_MC_C,
BLIS_GEMM_MC_Z );
gemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
BLIS_DEFAULT_MC_D,
BLIS_DEFAULT_MC_C,
BLIS_DEFAULT_MC_Z );
gemm_nc = bli_blksz_obj_create( BLIS_GEMM_NC_S,
BLIS_GEMM_NC_D,
BLIS_GEMM_NC_C,
BLIS_GEMM_NC_Z );
gemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
BLIS_DEFAULT_NC_D,
BLIS_DEFAULT_NC_C,
BLIS_DEFAULT_NC_Z );
gemm_kc = bli_blksz_obj_create( BLIS_GEMM_KC_S,
BLIS_GEMM_KC_D,
BLIS_GEMM_KC_C,
BLIS_GEMM_KC_Z );
gemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
BLIS_DEFAULT_KC_D,
BLIS_DEFAULT_KC_C,
BLIS_DEFAULT_KC_Z );
gemm_mr = bli_blksz_obj_create( BLIS_GEMM_MR_S,
BLIS_GEMM_MR_D,
BLIS_GEMM_MR_C,
BLIS_GEMM_MR_Z );
gemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
BLIS_DEFAULT_MR_D,
BLIS_DEFAULT_MR_C,
BLIS_DEFAULT_MR_Z );
gemm_nr = bli_blksz_obj_create( BLIS_GEMM_NR_S,
BLIS_GEMM_NR_D,
BLIS_GEMM_NR_C,
BLIS_GEMM_NR_Z );
gemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
BLIS_DEFAULT_NR_D,
BLIS_DEFAULT_NR_C,
BLIS_DEFAULT_NR_Z );
gemm_kr = bli_blksz_obj_create( BLIS_GEMM_KR_S,
BLIS_GEMM_KR_D,
BLIS_GEMM_KR_C,
BLIS_GEMM_KR_Z );
gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
BLIS_DEFAULT_KR_D,
BLIS_DEFAULT_KR_C,
BLIS_DEFAULT_KR_Z );
gemm_ni = bli_blksz_obj_create( BLIS_GEMM_NI_S,
BLIS_GEMM_NI_D,
BLIS_GEMM_NI_C,
BLIS_GEMM_NI_Z );
gemm_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
BLIS_EXTEND_MR_D,
BLIS_EXTEND_MR_C,
BLIS_EXTEND_MR_Z );
gemm_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
BLIS_EXTEND_NR_D,
BLIS_EXTEND_NR_C,
BLIS_EXTEND_NR_Z );
gemm_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
BLIS_EXTEND_KR_D,
BLIS_EXTEND_KR_C,
BLIS_EXTEND_KR_Z );
gemm_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
BLIS_DEFAULT_NI_D,
BLIS_DEFAULT_NI_C,
BLIS_DEFAULT_NI_Z );
// Create control tree objects for packm operations on a, b, and c.
@@ -142,8 +119,8 @@ void bli_gemm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
gemm_mr,
gemm_kr,
gemm_mr, gemm_extmr,
gemm_kr, gemm_extkr,
FALSE, // do NOT scale by alpha
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal
@@ -156,8 +133,8 @@ void bli_gemm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
gemm_kr,
gemm_nr,
gemm_kr, gemm_extkr,
gemm_nr, gemm_extnr,
FALSE, // do NOT scale by alpha
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal
@@ -170,8 +147,8 @@ void bli_gemm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1,
gemm_mr,
gemm_nr,
gemm_mr, gemm_extmr,
gemm_nr, gemm_extnr,
FALSE, // do NOT scale by beta
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal

View File

@@ -144,7 +144,7 @@ void PASTEMAC(ch,varname)( \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,kc) * \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,nr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
@@ -239,10 +239,15 @@ void PASTEMAC(ch,varname)( \
columns of B to a local buffer with each value duplicated. */ \
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
else bp = b1; \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
\
/* Invoke the gemm micro-kernel. */ \
PASTEMAC(ch,ukrname)( k, \
alpha_cast, \

View File

@@ -47,22 +47,20 @@ void PASTEMAC(ch,varname)( \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
) \
{ \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t m = PASTEMAC(ch,mr); \
const dim_t n = PASTEMAC(ch,nr); \
\
const dim_t m = MR; \
const dim_t n = NR; \
const inc_t cs_a = PASTEMAC(ch,packmr); \
\
const inc_t cs_a = MR; \
\
const inc_t rs_b = NR; \
const inc_t rs_b = PASTEMAC(ch,packnr); \
\
const inc_t rs_ab = 1; \
const inc_t cs_ab = MR; \
const inc_t cs_ab = PASTEMAC(ch,mr); \
\
dim_t k0, j0, i0; \
\
ctype ab[ MR * NR ]; \
ctype ab[ PASTEMAC(ch,mr) * \
PASTEMAC(ch,nr) ]; \
ctype* restrict ab00; \
ctype a0; \
ctype b0; \

View File

@@ -54,87 +54,64 @@ blksz_t* hemm_kc;
blksz_t* hemm_mr;
blksz_t* hemm_nr;
blksz_t* hemm_kr;
blksz_t* hemm_extmr;
blksz_t* hemm_extnr;
blksz_t* hemm_extkr;
blksz_t* hemm_ni;
// Cache blocksizes.
#define BLIS_HEMM_KC_S BLIS_DEFAULT_KC_S
#define BLIS_HEMM_KC_D BLIS_DEFAULT_KC_D
#define BLIS_HEMM_KC_C BLIS_DEFAULT_KC_C
#define BLIS_HEMM_KC_Z BLIS_DEFAULT_KC_Z
#define BLIS_HEMM_MC_S BLIS_DEFAULT_MC_S
#define BLIS_HEMM_MC_D BLIS_DEFAULT_MC_D
#define BLIS_HEMM_MC_C BLIS_DEFAULT_MC_C
#define BLIS_HEMM_MC_Z BLIS_DEFAULT_MC_Z
#define BLIS_HEMM_NC_S BLIS_DEFAULT_NC_S
#define BLIS_HEMM_NC_D BLIS_DEFAULT_NC_D
#define BLIS_HEMM_NC_C BLIS_DEFAULT_NC_C
#define BLIS_HEMM_NC_Z BLIS_DEFAULT_NC_Z
// Register blocking
#define BLIS_HEMM_KR_S BLIS_DEFAULT_KR_S
#define BLIS_HEMM_KR_D BLIS_DEFAULT_KR_D
#define BLIS_HEMM_KR_C BLIS_DEFAULT_KR_C
#define BLIS_HEMM_KR_Z BLIS_DEFAULT_KR_Z
#define BLIS_HEMM_MR_S BLIS_DEFAULT_MR_S
#define BLIS_HEMM_MR_D BLIS_DEFAULT_MR_D
#define BLIS_HEMM_MR_C BLIS_DEFAULT_MR_C
#define BLIS_HEMM_MR_Z BLIS_DEFAULT_MR_Z
#define BLIS_HEMM_NR_S BLIS_DEFAULT_NR_S
#define BLIS_HEMM_NR_D BLIS_DEFAULT_NR_D
#define BLIS_HEMM_NR_C BLIS_DEFAULT_NR_C
#define BLIS_HEMM_NR_Z BLIS_DEFAULT_NR_Z
// Incremental pack blocking
#define BLIS_HEMM_NI_S BLIS_DEFAULT_NI_S
#define BLIS_HEMM_NI_D BLIS_DEFAULT_NI_D
#define BLIS_HEMM_NI_C BLIS_DEFAULT_NI_C
#define BLIS_HEMM_NI_Z BLIS_DEFAULT_NI_Z
void bli_hemm_cntl_init()
{
// Create blocksize objects for each dimension.
hemm_mc = bli_blksz_obj_create( BLIS_HEMM_MC_S,
BLIS_HEMM_MC_D,
BLIS_HEMM_MC_C,
BLIS_HEMM_MC_Z );
hemm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
BLIS_DEFAULT_MC_D,
BLIS_DEFAULT_MC_C,
BLIS_DEFAULT_MC_Z );
hemm_nc = bli_blksz_obj_create( BLIS_HEMM_NC_S,
BLIS_HEMM_NC_D,
BLIS_HEMM_NC_C,
BLIS_HEMM_NC_Z );
hemm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
BLIS_DEFAULT_NC_D,
BLIS_DEFAULT_NC_C,
BLIS_DEFAULT_NC_Z );
hemm_kc = bli_blksz_obj_create( BLIS_HEMM_KC_S,
BLIS_HEMM_KC_D,
BLIS_HEMM_KC_C,
BLIS_HEMM_KC_Z );
hemm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
BLIS_DEFAULT_KC_D,
BLIS_DEFAULT_KC_C,
BLIS_DEFAULT_KC_Z );
hemm_mr = bli_blksz_obj_create( BLIS_HEMM_MR_S,
BLIS_HEMM_MR_D,
BLIS_HEMM_MR_C,
BLIS_HEMM_MR_Z );
hemm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
BLIS_DEFAULT_MR_D,
BLIS_DEFAULT_MR_C,
BLIS_DEFAULT_MR_Z );
hemm_nr = bli_blksz_obj_create( BLIS_HEMM_NR_S,
BLIS_HEMM_NR_D,
BLIS_HEMM_NR_C,
BLIS_HEMM_NR_Z );
hemm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
BLIS_DEFAULT_NR_D,
BLIS_DEFAULT_NR_C,
BLIS_DEFAULT_NR_Z );
hemm_kr = bli_blksz_obj_create( BLIS_HEMM_KR_S,
BLIS_HEMM_KR_D,
BLIS_HEMM_KR_C,
BLIS_HEMM_KR_Z );
hemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
BLIS_DEFAULT_KR_D,
BLIS_DEFAULT_KR_C,
BLIS_DEFAULT_KR_Z );
hemm_ni = bli_blksz_obj_create( BLIS_HEMM_NI_S,
BLIS_HEMM_NI_D,
BLIS_HEMM_NI_C,
BLIS_HEMM_NI_Z );
hemm_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
BLIS_EXTEND_MR_D,
BLIS_EXTEND_MR_C,
BLIS_EXTEND_MR_Z );
hemm_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
BLIS_EXTEND_NR_D,
BLIS_EXTEND_NR_C,
BLIS_EXTEND_NR_Z );
hemm_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
BLIS_EXTEND_KR_D,
BLIS_EXTEND_KR_C,
BLIS_EXTEND_KR_Z );
hemm_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
BLIS_DEFAULT_NI_D,
BLIS_DEFAULT_NI_C,
BLIS_DEFAULT_NI_Z );
// Create control tree objects for packm operations on a, b, and c.
@@ -142,8 +119,8 @@ void bli_hemm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
hemm_mr,
hemm_kr,
hemm_mr, hemm_extmr,
hemm_kr, hemm_extkr,
FALSE, // do NOT scale by alpha
TRUE, // densify
FALSE, // do NOT invert diagonal
@@ -156,8 +133,8 @@ void bli_hemm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
hemm_kr,
hemm_nr,
hemm_kr, hemm_extkr,
hemm_nr, hemm_extnr,
FALSE, // do NOT scale by alpha
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal
@@ -170,8 +147,8 @@ void bli_hemm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1,
hemm_mr,
hemm_nr,
hemm_mr, hemm_extmr,
hemm_nr, hemm_extnr,
FALSE, // do NOT scale by beta
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal

View File

@@ -55,87 +55,64 @@ blksz_t* her2k_kc;
blksz_t* her2k_mr;
blksz_t* her2k_nr;
blksz_t* her2k_kr;
blksz_t* her2k_extmr;
blksz_t* her2k_extnr;
blksz_t* her2k_extkr;
blksz_t* her2k_ni;
// Cache blocksizes.
#define BLIS_HER2K_KC_S BLIS_DEFAULT_KC_S
#define BLIS_HER2K_KC_D BLIS_DEFAULT_KC_D
#define BLIS_HER2K_KC_C BLIS_DEFAULT_KC_C
#define BLIS_HER2K_KC_Z BLIS_DEFAULT_KC_Z
#define BLIS_HER2K_MC_S BLIS_DEFAULT_MC_S
#define BLIS_HER2K_MC_D BLIS_DEFAULT_MC_D
#define BLIS_HER2K_MC_C BLIS_DEFAULT_MC_C
#define BLIS_HER2K_MC_Z BLIS_DEFAULT_MC_Z
#define BLIS_HER2K_NC_S BLIS_DEFAULT_NC_S
#define BLIS_HER2K_NC_D BLIS_DEFAULT_NC_D
#define BLIS_HER2K_NC_C BLIS_DEFAULT_NC_C
#define BLIS_HER2K_NC_Z BLIS_DEFAULT_NC_Z
// Register blocking
#define BLIS_HER2K_KR_S BLIS_DEFAULT_KR_S
#define BLIS_HER2K_KR_D BLIS_DEFAULT_KR_D
#define BLIS_HER2K_KR_C BLIS_DEFAULT_KR_C
#define BLIS_HER2K_KR_Z BLIS_DEFAULT_KR_Z
#define BLIS_HER2K_MR_S BLIS_DEFAULT_MR_S
#define BLIS_HER2K_MR_D BLIS_DEFAULT_MR_D
#define BLIS_HER2K_MR_C BLIS_DEFAULT_MR_C
#define BLIS_HER2K_MR_Z BLIS_DEFAULT_MR_Z
#define BLIS_HER2K_NR_S BLIS_DEFAULT_NR_S
#define BLIS_HER2K_NR_D BLIS_DEFAULT_NR_D
#define BLIS_HER2K_NR_C BLIS_DEFAULT_NR_C
#define BLIS_HER2K_NR_Z BLIS_DEFAULT_NR_Z
// Incremental pack blocking
#define BLIS_HER2K_NI_S BLIS_DEFAULT_NI_S
#define BLIS_HER2K_NI_D BLIS_DEFAULT_NI_D
#define BLIS_HER2K_NI_C BLIS_DEFAULT_NI_C
#define BLIS_HER2K_NI_Z BLIS_DEFAULT_NI_Z
void bli_her2k_cntl_init()
{
// Create blocksize objects for each dimension.
her2k_mc = bli_blksz_obj_create( BLIS_HER2K_MC_S,
BLIS_HER2K_MC_D,
BLIS_HER2K_MC_C,
BLIS_HER2K_MC_Z );
her2k_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
BLIS_DEFAULT_MC_D,
BLIS_DEFAULT_MC_C,
BLIS_DEFAULT_MC_Z );
her2k_nc = bli_blksz_obj_create( BLIS_HER2K_NC_S,
BLIS_HER2K_NC_D,
BLIS_HER2K_NC_C,
BLIS_HER2K_NC_Z );
her2k_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
BLIS_DEFAULT_NC_D,
BLIS_DEFAULT_NC_C,
BLIS_DEFAULT_NC_Z );
her2k_kc = bli_blksz_obj_create( BLIS_HER2K_KC_S,
BLIS_HER2K_KC_D,
BLIS_HER2K_KC_C,
BLIS_HER2K_KC_Z );
her2k_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
BLIS_DEFAULT_KC_D,
BLIS_DEFAULT_KC_C,
BLIS_DEFAULT_KC_Z );
her2k_mr = bli_blksz_obj_create( BLIS_HER2K_MR_S,
BLIS_HER2K_MR_D,
BLIS_HER2K_MR_C,
BLIS_HER2K_MR_Z );
her2k_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
BLIS_DEFAULT_MR_D,
BLIS_DEFAULT_MR_C,
BLIS_DEFAULT_MR_Z );
her2k_nr = bli_blksz_obj_create( BLIS_HER2K_NR_S,
BLIS_HER2K_NR_D,
BLIS_HER2K_NR_C,
BLIS_HER2K_NR_Z );
her2k_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
BLIS_DEFAULT_NR_D,
BLIS_DEFAULT_NR_C,
BLIS_DEFAULT_NR_Z );
her2k_kr = bli_blksz_obj_create( BLIS_HER2K_KR_S,
BLIS_HER2K_KR_D,
BLIS_HER2K_KR_C,
BLIS_HER2K_KR_Z );
her2k_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
BLIS_DEFAULT_KR_D,
BLIS_DEFAULT_KR_C,
BLIS_DEFAULT_KR_Z );
her2k_ni = bli_blksz_obj_create( BLIS_HER2K_NI_S,
BLIS_HER2K_NI_D,
BLIS_HER2K_NI_C,
BLIS_HER2K_NI_Z );
her2k_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
BLIS_EXTEND_MR_D,
BLIS_EXTEND_MR_C,
BLIS_EXTEND_MR_Z );
her2k_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
BLIS_EXTEND_NR_D,
BLIS_EXTEND_NR_C,
BLIS_EXTEND_NR_Z );
her2k_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
BLIS_EXTEND_KR_D,
BLIS_EXTEND_KR_C,
BLIS_EXTEND_KR_Z );
her2k_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
BLIS_DEFAULT_NI_D,
BLIS_DEFAULT_NI_C,
BLIS_DEFAULT_NI_Z );
// Create control tree objects for packm operations on a, b, and c.
@@ -143,8 +120,8 @@ void bli_her2k_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
her2k_mr,
her2k_kr,
her2k_mr, her2k_extmr,
her2k_kr, her2k_extkr,
FALSE, // do NOT scale by alpha
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal
@@ -157,8 +134,8 @@ void bli_her2k_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
her2k_kr,
her2k_nr,
her2k_kr, her2k_extkr,
her2k_nr, her2k_extnr,
FALSE, // do NOT scale by alpha
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal
@@ -171,8 +148,8 @@ void bli_her2k_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1,
her2k_mr,
her2k_nr,
her2k_mr, her2k_extmr,
her2k_nr, her2k_extnr,
FALSE, // do NOT scale by beta
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal

View File

@@ -54,87 +54,64 @@ blksz_t* herk_kc;
blksz_t* herk_mr;
blksz_t* herk_nr;
blksz_t* herk_kr;
blksz_t* herk_extmr;
blksz_t* herk_extnr;
blksz_t* herk_extkr;
blksz_t* herk_ni;
// Cache blocksizes.
#define BLIS_HERK_KC_S BLIS_DEFAULT_KC_S
#define BLIS_HERK_KC_D BLIS_DEFAULT_KC_D
#define BLIS_HERK_KC_C BLIS_DEFAULT_KC_C
#define BLIS_HERK_KC_Z BLIS_DEFAULT_KC_Z
#define BLIS_HERK_MC_S BLIS_DEFAULT_MC_S
#define BLIS_HERK_MC_D BLIS_DEFAULT_MC_D
#define BLIS_HERK_MC_C BLIS_DEFAULT_MC_C
#define BLIS_HERK_MC_Z BLIS_DEFAULT_MC_Z
#define BLIS_HERK_NC_S BLIS_DEFAULT_NC_S
#define BLIS_HERK_NC_D BLIS_DEFAULT_NC_D
#define BLIS_HERK_NC_C BLIS_DEFAULT_NC_C
#define BLIS_HERK_NC_Z BLIS_DEFAULT_NC_Z
// Register blocking
#define BLIS_HERK_KR_S BLIS_DEFAULT_KR_S
#define BLIS_HERK_KR_D BLIS_DEFAULT_KR_D
#define BLIS_HERK_KR_C BLIS_DEFAULT_KR_C
#define BLIS_HERK_KR_Z BLIS_DEFAULT_KR_Z
#define BLIS_HERK_MR_S BLIS_DEFAULT_MR_S
#define BLIS_HERK_MR_D BLIS_DEFAULT_MR_D
#define BLIS_HERK_MR_C BLIS_DEFAULT_MR_C
#define BLIS_HERK_MR_Z BLIS_DEFAULT_MR_Z
#define BLIS_HERK_NR_S BLIS_DEFAULT_NR_S
#define BLIS_HERK_NR_D BLIS_DEFAULT_NR_D
#define BLIS_HERK_NR_C BLIS_DEFAULT_NR_C
#define BLIS_HERK_NR_Z BLIS_DEFAULT_NR_Z
// Incremental pack blocking
#define BLIS_HERK_NI_S BLIS_DEFAULT_NI_S
#define BLIS_HERK_NI_D BLIS_DEFAULT_NI_D
#define BLIS_HERK_NI_C BLIS_DEFAULT_NI_C
#define BLIS_HERK_NI_Z BLIS_DEFAULT_NI_Z
void bli_herk_cntl_init()
{
// Create blocksize objects for each dimension.
herk_mc = bli_blksz_obj_create( BLIS_HERK_MC_S,
BLIS_HERK_MC_D,
BLIS_HERK_MC_C,
BLIS_HERK_MC_Z );
herk_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
BLIS_DEFAULT_MC_D,
BLIS_DEFAULT_MC_C,
BLIS_DEFAULT_MC_Z );
herk_nc = bli_blksz_obj_create( BLIS_HERK_NC_S,
BLIS_HERK_NC_D,
BLIS_HERK_NC_C,
BLIS_HERK_NC_Z );
herk_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
BLIS_DEFAULT_NC_D,
BLIS_DEFAULT_NC_C,
BLIS_DEFAULT_NC_Z );
herk_kc = bli_blksz_obj_create( BLIS_HERK_KC_S,
BLIS_HERK_KC_D,
BLIS_HERK_KC_C,
BLIS_HERK_KC_Z );
herk_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
BLIS_DEFAULT_KC_D,
BLIS_DEFAULT_KC_C,
BLIS_DEFAULT_KC_Z );
herk_mr = bli_blksz_obj_create( BLIS_HERK_MR_S,
BLIS_HERK_MR_D,
BLIS_HERK_MR_C,
BLIS_HERK_MR_Z );
herk_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
BLIS_DEFAULT_MR_D,
BLIS_DEFAULT_MR_C,
BLIS_DEFAULT_MR_Z );
herk_nr = bli_blksz_obj_create( BLIS_HERK_NR_S,
BLIS_HERK_NR_D,
BLIS_HERK_NR_C,
BLIS_HERK_NR_Z );
herk_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
BLIS_DEFAULT_NR_D,
BLIS_DEFAULT_NR_C,
BLIS_DEFAULT_NR_Z );
herk_kr = bli_blksz_obj_create( BLIS_HERK_KR_S,
BLIS_HERK_KR_D,
BLIS_HERK_KR_C,
BLIS_HERK_KR_Z );
herk_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
BLIS_DEFAULT_KR_D,
BLIS_DEFAULT_KR_C,
BLIS_DEFAULT_KR_Z );
herk_ni = bli_blksz_obj_create( BLIS_HERK_NI_S,
BLIS_HERK_NI_D,
BLIS_HERK_NI_C,
BLIS_HERK_NI_Z );
herk_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
BLIS_EXTEND_MR_D,
BLIS_EXTEND_MR_C,
BLIS_EXTEND_MR_Z );
herk_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
BLIS_EXTEND_NR_D,
BLIS_EXTEND_NR_C,
BLIS_EXTEND_NR_Z );
herk_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
BLIS_EXTEND_KR_D,
BLIS_EXTEND_KR_C,
BLIS_EXTEND_KR_Z );
herk_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
BLIS_DEFAULT_NI_D,
BLIS_DEFAULT_NI_C,
BLIS_DEFAULT_NI_Z );
// Create control tree objects for packm operations on a, b, and c.
@@ -142,8 +119,8 @@ void bli_herk_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
herk_mr,
herk_kr,
herk_mr, herk_extmr,
herk_kr, herk_extkr,
FALSE, // do NOT scale by alpha
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal
@@ -156,8 +133,8 @@ void bli_herk_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
herk_kr,
herk_nr,
herk_kr, herk_extkr,
herk_nr, herk_extnr,
FALSE, // do NOT scale by alpha
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal
@@ -170,8 +147,8 @@ void bli_herk_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1,
herk_mr,
herk_nr,
herk_mr, herk_extmr,
herk_nr, herk_extnr,
FALSE, // do NOT scale by beta
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal

View File

@@ -148,7 +148,7 @@ void PASTEMAC(ch,varname)( \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,kc) * \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,nr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \

View File

@@ -148,7 +148,7 @@ void PASTEMAC(ch,varname)( \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,kc) * \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,nr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \

View File

@@ -55,87 +55,64 @@ blksz_t* trmm_kc;
blksz_t* trmm_mr;
blksz_t* trmm_nr;
blksz_t* trmm_kr;
blksz_t* trmm_extmr;
blksz_t* trmm_extnr;
blksz_t* trmm_extkr;
blksz_t* trmm_ni;
// Cache blocksizes.
#define BLIS_TRMM_KC_S BLIS_DEFAULT_KC_S
#define BLIS_TRMM_KC_D BLIS_DEFAULT_KC_D
#define BLIS_TRMM_KC_C BLIS_DEFAULT_KC_C
#define BLIS_TRMM_KC_Z BLIS_DEFAULT_KC_Z
#define BLIS_TRMM_MC_S BLIS_DEFAULT_MC_S
#define BLIS_TRMM_MC_D BLIS_DEFAULT_MC_D
#define BLIS_TRMM_MC_C BLIS_DEFAULT_MC_C
#define BLIS_TRMM_MC_Z BLIS_DEFAULT_MC_Z
#define BLIS_TRMM_NC_S BLIS_DEFAULT_NC_S
#define BLIS_TRMM_NC_D BLIS_DEFAULT_NC_D
#define BLIS_TRMM_NC_C BLIS_DEFAULT_NC_C
#define BLIS_TRMM_NC_Z BLIS_DEFAULT_NC_Z
// Register blocking
#define BLIS_TRMM_KR_S BLIS_DEFAULT_KR_S
#define BLIS_TRMM_KR_D BLIS_DEFAULT_KR_D
#define BLIS_TRMM_KR_C BLIS_DEFAULT_KR_C
#define BLIS_TRMM_KR_Z BLIS_DEFAULT_KR_Z
#define BLIS_TRMM_MR_S BLIS_DEFAULT_MR_S
#define BLIS_TRMM_MR_D BLIS_DEFAULT_MR_D
#define BLIS_TRMM_MR_C BLIS_DEFAULT_MR_C
#define BLIS_TRMM_MR_Z BLIS_DEFAULT_MR_Z
#define BLIS_TRMM_NR_S BLIS_DEFAULT_NR_S
#define BLIS_TRMM_NR_D BLIS_DEFAULT_NR_D
#define BLIS_TRMM_NR_C BLIS_DEFAULT_NR_C
#define BLIS_TRMM_NR_Z BLIS_DEFAULT_NR_Z
// Incremental pack blocking
#define BLIS_TRMM_NI_S BLIS_DEFAULT_NI_S
#define BLIS_TRMM_NI_D BLIS_DEFAULT_NI_D
#define BLIS_TRMM_NI_C BLIS_DEFAULT_NI_C
#define BLIS_TRMM_NI_Z BLIS_DEFAULT_NI_Z
void bli_trmm_cntl_init()
{
// Create blocksize objects for each dimension.
trmm_mc = bli_blksz_obj_create( BLIS_TRMM_MC_S,
BLIS_TRMM_MC_D,
BLIS_TRMM_MC_C,
BLIS_TRMM_MC_Z );
trmm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
BLIS_DEFAULT_MC_D,
BLIS_DEFAULT_MC_C,
BLIS_DEFAULT_MC_Z );
trmm_nc = bli_blksz_obj_create( BLIS_TRMM_NC_S,
BLIS_TRMM_NC_D,
BLIS_TRMM_NC_C,
BLIS_TRMM_NC_Z );
trmm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
BLIS_DEFAULT_NC_D,
BLIS_DEFAULT_NC_C,
BLIS_DEFAULT_NC_Z );
trmm_kc = bli_blksz_obj_create( BLIS_TRMM_KC_S,
BLIS_TRMM_KC_D,
BLIS_TRMM_KC_C,
BLIS_TRMM_KC_Z );
trmm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
BLIS_DEFAULT_KC_D,
BLIS_DEFAULT_KC_C,
BLIS_DEFAULT_KC_Z );
trmm_mr = bli_blksz_obj_create( BLIS_TRMM_MR_S,
BLIS_TRMM_MR_D,
BLIS_TRMM_MR_C,
BLIS_TRMM_MR_Z );
trmm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
BLIS_DEFAULT_MR_D,
BLIS_DEFAULT_MR_C,
BLIS_DEFAULT_MR_Z );
trmm_nr = bli_blksz_obj_create( BLIS_TRMM_NR_S,
BLIS_TRMM_NR_D,
BLIS_TRMM_NR_C,
BLIS_TRMM_NR_Z );
trmm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
BLIS_DEFAULT_NR_D,
BLIS_DEFAULT_NR_C,
BLIS_DEFAULT_NR_Z );
trmm_kr = bli_blksz_obj_create( BLIS_TRMM_KR_S,
BLIS_TRMM_KR_D,
BLIS_TRMM_KR_C,
BLIS_TRMM_KR_Z );
trmm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
BLIS_DEFAULT_KR_D,
BLIS_DEFAULT_KR_C,
BLIS_DEFAULT_KR_Z );
trmm_ni = bli_blksz_obj_create( BLIS_TRMM_NI_S,
BLIS_TRMM_NI_D,
BLIS_TRMM_NI_C,
BLIS_TRMM_NI_Z );
trmm_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
BLIS_EXTEND_MR_D,
BLIS_EXTEND_MR_C,
BLIS_EXTEND_MR_Z );
trmm_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
BLIS_EXTEND_NR_D,
BLIS_EXTEND_NR_C,
BLIS_EXTEND_NR_Z );
trmm_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
BLIS_EXTEND_KR_D,
BLIS_EXTEND_KR_C,
BLIS_EXTEND_KR_Z );
trmm_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
BLIS_DEFAULT_NI_D,
BLIS_DEFAULT_NI_C,
BLIS_DEFAULT_NI_Z );
// Create control tree objects for packm operations on a, b, and c.
@@ -143,8 +120,10 @@ void bli_trmm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3, // pack panels of A compactly
trmm_mr, // IMPORTANT: for consistency with trsm, "k" dim
trmm_mr, // multiple is set to mr.
// IMPORTANT: for consistency with trsm, "k" dim
// multiple is set to mr.
trmm_mr, trmm_extmr,
trmm_mr, trmm_extmr,
FALSE, // do NOT scale by alpha
TRUE, // densify
FALSE, // do NOT invert diagonal
@@ -157,8 +136,10 @@ void bli_trmm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
trmm_mr, // IMPORTANT: m dim multiple here must be mr
trmm_nr, // since "k" dim multiple is set to mr above.
// IMPORTANT: m dim multiple here must be mr
// since "k" dim multiple is set to mr above.
trmm_mr, trmm_extmr,
trmm_nr, trmm_extnr,
FALSE, // do NOT scale by alpha
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal
@@ -171,8 +152,8 @@ void bli_trmm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1,
trmm_mr,
trmm_nr,
// Pair each register blocksize with its own packing extension.
trmm_mr, trmm_extmr,
trmm_nr, trmm_extnr,
FALSE, // do NOT scale by beta
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal

View File

@@ -149,7 +149,7 @@ void PASTEMAC(ch,varname)( \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,kc) * \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,nr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
@@ -165,6 +165,7 @@ void PASTEMAC(ch,varname)( \
/* Alias some constants to shorter names. */ \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t PACKMR = PASTEMAC(ch,packmr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
@@ -252,7 +253,7 @@ void PASTEMAC(ch,varname)( \
k_nr = k_a1011 * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = k * MR; \
rstep_a = k * PACKMR; \
\
cstep_b = ps_b; \
\
@@ -334,7 +335,7 @@ void PASTEMAC(ch,varname)( \
c11, rs_c, cs_c ); \
} \
\
a1 += k_a1011 * MR; \
a1 += k_a1011 * PACKMR; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \

View File

@@ -149,7 +149,7 @@ void PASTEMAC(ch,varname)( \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,kc) * \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,nr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
@@ -165,6 +165,7 @@ void PASTEMAC(ch,varname)( \
/* Alias some constants to shorter names. */ \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t PACKMR = PASTEMAC(ch,packmr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
@@ -252,7 +253,7 @@ void PASTEMAC(ch,varname)( \
k_nr = k_a1112 * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = k * MR; \
rstep_a = k * PACKMR; \
\
cstep_b = ps_b; \
\
@@ -337,7 +338,7 @@ void PASTEMAC(ch,varname)( \
c11, rs_c, cs_c ); \
} \
\
a1 += k_a1112 * MR; \
a1 += k_a1112 * PACKMR; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \

View File

@@ -55,87 +55,64 @@ blksz_t* trmm3_kc;
blksz_t* trmm3_mr;
blksz_t* trmm3_nr;
blksz_t* trmm3_kr;
blksz_t* trmm3_extmr;
blksz_t* trmm3_extnr;
blksz_t* trmm3_extkr;
blksz_t* trmm3_ni;
// Cache blocksizes.
#define BLIS_TRMM3_KC_S BLIS_DEFAULT_KC_S
#define BLIS_TRMM3_KC_D BLIS_DEFAULT_KC_D
#define BLIS_TRMM3_KC_C BLIS_DEFAULT_KC_C
#define BLIS_TRMM3_KC_Z BLIS_DEFAULT_KC_Z
#define BLIS_TRMM3_MC_S BLIS_DEFAULT_MC_S
#define BLIS_TRMM3_MC_D BLIS_DEFAULT_MC_D
#define BLIS_TRMM3_MC_C BLIS_DEFAULT_MC_C
#define BLIS_TRMM3_MC_Z BLIS_DEFAULT_MC_Z
#define BLIS_TRMM3_NC_S BLIS_DEFAULT_NC_S
#define BLIS_TRMM3_NC_D BLIS_DEFAULT_NC_D
#define BLIS_TRMM3_NC_C BLIS_DEFAULT_NC_C
#define BLIS_TRMM3_NC_Z BLIS_DEFAULT_NC_Z
// Register blocking
#define BLIS_TRMM3_KR_S BLIS_DEFAULT_KR_S
#define BLIS_TRMM3_KR_D BLIS_DEFAULT_KR_D
#define BLIS_TRMM3_KR_C BLIS_DEFAULT_KR_C
#define BLIS_TRMM3_KR_Z BLIS_DEFAULT_KR_Z
#define BLIS_TRMM3_MR_S BLIS_DEFAULT_MR_S
#define BLIS_TRMM3_MR_D BLIS_DEFAULT_MR_D
#define BLIS_TRMM3_MR_C BLIS_DEFAULT_MR_C
#define BLIS_TRMM3_MR_Z BLIS_DEFAULT_MR_Z
#define BLIS_TRMM3_NR_S BLIS_DEFAULT_NR_S
#define BLIS_TRMM3_NR_D BLIS_DEFAULT_NR_D
#define BLIS_TRMM3_NR_C BLIS_DEFAULT_NR_C
#define BLIS_TRMM3_NR_Z BLIS_DEFAULT_NR_Z
// Incremental pack blocking
#define BLIS_TRMM3_NI_S BLIS_DEFAULT_NI_S
#define BLIS_TRMM3_NI_D BLIS_DEFAULT_NI_D
#define BLIS_TRMM3_NI_C BLIS_DEFAULT_NI_C
#define BLIS_TRMM3_NI_Z BLIS_DEFAULT_NI_Z
void bli_trmm3_cntl_init()
{
// Create blocksize objects for each dimension.
trmm3_mc = bli_blksz_obj_create( BLIS_TRMM3_MC_S,
BLIS_TRMM3_MC_D,
BLIS_TRMM3_MC_C,
BLIS_TRMM3_MC_Z );
trmm3_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
BLIS_DEFAULT_MC_D,
BLIS_DEFAULT_MC_C,
BLIS_DEFAULT_MC_Z );
trmm3_nc = bli_blksz_obj_create( BLIS_TRMM3_NC_S,
BLIS_TRMM3_NC_D,
BLIS_TRMM3_NC_C,
BLIS_TRMM3_NC_Z );
trmm3_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
BLIS_DEFAULT_NC_D,
BLIS_DEFAULT_NC_C,
BLIS_DEFAULT_NC_Z );
trmm3_kc = bli_blksz_obj_create( BLIS_TRMM3_KC_S,
BLIS_TRMM3_KC_D,
BLIS_TRMM3_KC_C,
BLIS_TRMM3_KC_Z );
trmm3_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
BLIS_DEFAULT_KC_D,
BLIS_DEFAULT_KC_C,
BLIS_DEFAULT_KC_Z );
trmm3_mr = bli_blksz_obj_create( BLIS_TRMM3_MR_S,
BLIS_TRMM3_MR_D,
BLIS_TRMM3_MR_C,
BLIS_TRMM3_MR_Z );
trmm3_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
BLIS_DEFAULT_MR_D,
BLIS_DEFAULT_MR_C,
BLIS_DEFAULT_MR_Z );
trmm3_nr = bli_blksz_obj_create( BLIS_TRMM3_NR_S,
BLIS_TRMM3_NR_D,
BLIS_TRMM3_NR_C,
BLIS_TRMM3_NR_Z );
trmm3_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
BLIS_DEFAULT_NR_D,
BLIS_DEFAULT_NR_C,
BLIS_DEFAULT_NR_Z );
trmm3_kr = bli_blksz_obj_create( BLIS_TRMM3_KR_S,
BLIS_TRMM3_KR_D,
BLIS_TRMM3_KR_C,
BLIS_TRMM3_KR_Z );
trmm3_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
BLIS_DEFAULT_KR_D,
BLIS_DEFAULT_KR_C,
BLIS_DEFAULT_KR_Z );
trmm3_ni = bli_blksz_obj_create( BLIS_TRMM3_NI_S,
BLIS_TRMM3_NI_D,
BLIS_TRMM3_NI_C,
BLIS_TRMM3_NI_Z );
trmm3_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
BLIS_EXTEND_MR_D,
BLIS_EXTEND_MR_C,
BLIS_EXTEND_MR_Z );
trmm3_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
BLIS_EXTEND_NR_D,
BLIS_EXTEND_NR_C,
BLIS_EXTEND_NR_Z );
trmm3_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
BLIS_EXTEND_KR_D,
BLIS_EXTEND_KR_C,
BLIS_EXTEND_KR_Z );
trmm3_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
BLIS_DEFAULT_NI_D,
BLIS_DEFAULT_NI_C,
BLIS_DEFAULT_NI_Z );
// Create control tree objects for packm operations on a, b, and c.
@@ -143,8 +120,10 @@ void bli_trmm3_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3, // pack panels of A compactly
trmm3_mr, // IMPORTANT: for consistency with trsm, "k" dim
trmm3_mr, // multiple is set to mr.
// IMPORTANT: for consistency with trsm, "k" dim
// multiple is set to mr.
trmm3_mr, trmm3_extmr,
trmm3_mr, trmm3_extmr,
FALSE, // do NOT scale by alpha
TRUE, // densify
FALSE, // do NOT invert diagonal
@@ -157,8 +136,10 @@ void bli_trmm3_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
trmm3_mr, // IMPORTANT: m dim multiple here must be mr
trmm3_nr, // since "k" dim multiple is set to mr above.
// IMPORTANT: m dim multiple here must be mr
// since "k" dim multiple is set to mr above.
trmm3_mr, trmm3_extmr,
trmm3_nr, trmm3_extnr,
FALSE, // do NOT scale by alpha
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal
@@ -171,8 +152,8 @@ void bli_trmm3_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1,
trmm3_mr,
trmm3_nr,
trmm3_mr, trmm3_extmr,
trmm3_nr, trmm3_extnr,
FALSE, // do NOT scale by beta
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal

View File

@@ -55,87 +55,64 @@ blksz_t* trsm_kc;
blksz_t* trsm_mr;
blksz_t* trsm_nr;
blksz_t* trsm_kr;
blksz_t* trsm_extmr;
blksz_t* trsm_extnr;
blksz_t* trsm_extkr;
blksz_t* trsm_ni;
// Cache blocksizes.
#define BLIS_TRSM_KC_S BLIS_DEFAULT_KC_S
#define BLIS_TRSM_KC_D BLIS_DEFAULT_KC_D
#define BLIS_TRSM_KC_C BLIS_DEFAULT_KC_C
#define BLIS_TRSM_KC_Z BLIS_DEFAULT_KC_Z
#define BLIS_TRSM_MC_S BLIS_DEFAULT_MC_S
#define BLIS_TRSM_MC_D BLIS_DEFAULT_MC_D
#define BLIS_TRSM_MC_C BLIS_DEFAULT_MC_C
#define BLIS_TRSM_MC_Z BLIS_DEFAULT_MC_Z
#define BLIS_TRSM_NC_S BLIS_DEFAULT_NC_S
#define BLIS_TRSM_NC_D BLIS_DEFAULT_NC_D
#define BLIS_TRSM_NC_C BLIS_DEFAULT_NC_C
#define BLIS_TRSM_NC_Z BLIS_DEFAULT_NC_Z
// Register blocking
#define BLIS_TRSM_KR_S BLIS_DEFAULT_KR_S
#define BLIS_TRSM_KR_D BLIS_DEFAULT_KR_D
#define BLIS_TRSM_KR_C BLIS_DEFAULT_KR_C
#define BLIS_TRSM_KR_Z BLIS_DEFAULT_KR_Z
#define BLIS_TRSM_MR_S BLIS_DEFAULT_MR_S
#define BLIS_TRSM_MR_D BLIS_DEFAULT_MR_D
#define BLIS_TRSM_MR_C BLIS_DEFAULT_MR_C
#define BLIS_TRSM_MR_Z BLIS_DEFAULT_MR_Z
#define BLIS_TRSM_NR_S BLIS_DEFAULT_NR_S
#define BLIS_TRSM_NR_D BLIS_DEFAULT_NR_D
#define BLIS_TRSM_NR_C BLIS_DEFAULT_NR_C
#define BLIS_TRSM_NR_Z BLIS_DEFAULT_NR_Z
// Incremental pack blocking
#define BLIS_TRSM_NI_S BLIS_DEFAULT_NI_S
#define BLIS_TRSM_NI_D BLIS_DEFAULT_NI_D
#define BLIS_TRSM_NI_C BLIS_DEFAULT_NI_C
#define BLIS_TRSM_NI_Z BLIS_DEFAULT_NI_Z
void bli_trsm_cntl_init()
{
// Create blocksize objects for each dimension.
trsm_mc = bli_blksz_obj_create( BLIS_TRSM_MC_S,
BLIS_TRSM_MC_D,
BLIS_TRSM_MC_C,
BLIS_TRSM_MC_Z );
trsm_mc = bli_blksz_obj_create( BLIS_DEFAULT_MC_S,
BLIS_DEFAULT_MC_D,
BLIS_DEFAULT_MC_C,
BLIS_DEFAULT_MC_Z );
trsm_nc = bli_blksz_obj_create( BLIS_TRSM_NC_S,
BLIS_TRSM_NC_D,
BLIS_TRSM_NC_C,
BLIS_TRSM_NC_Z );
trsm_nc = bli_blksz_obj_create( BLIS_DEFAULT_NC_S,
BLIS_DEFAULT_NC_D,
BLIS_DEFAULT_NC_C,
BLIS_DEFAULT_NC_Z );
trsm_kc = bli_blksz_obj_create( BLIS_TRSM_KC_S,
BLIS_TRSM_KC_D,
BLIS_TRSM_KC_C,
BLIS_TRSM_KC_Z );
trsm_kc = bli_blksz_obj_create( BLIS_DEFAULT_KC_S,
BLIS_DEFAULT_KC_D,
BLIS_DEFAULT_KC_C,
BLIS_DEFAULT_KC_Z );
trsm_mr = bli_blksz_obj_create( BLIS_TRSM_MR_S,
BLIS_TRSM_MR_D,
BLIS_TRSM_MR_C,
BLIS_TRSM_MR_Z );
trsm_mr = bli_blksz_obj_create( BLIS_DEFAULT_MR_S,
BLIS_DEFAULT_MR_D,
BLIS_DEFAULT_MR_C,
BLIS_DEFAULT_MR_Z );
trsm_nr = bli_blksz_obj_create( BLIS_TRSM_NR_S,
BLIS_TRSM_NR_D,
BLIS_TRSM_NR_C,
BLIS_TRSM_NR_Z );
trsm_nr = bli_blksz_obj_create( BLIS_DEFAULT_NR_S,
BLIS_DEFAULT_NR_D,
BLIS_DEFAULT_NR_C,
BLIS_DEFAULT_NR_Z );
trsm_kr = bli_blksz_obj_create( BLIS_TRSM_KR_S,
BLIS_TRSM_KR_D,
BLIS_TRSM_KR_C,
BLIS_TRSM_KR_Z );
trsm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S,
BLIS_DEFAULT_KR_D,
BLIS_DEFAULT_KR_C,
BLIS_DEFAULT_KR_Z );
trsm_ni = bli_blksz_obj_create( BLIS_TRSM_NI_S,
BLIS_TRSM_NI_D,
BLIS_TRSM_NI_C,
BLIS_TRSM_NI_Z );
trsm_extmr = bli_blksz_obj_create( BLIS_EXTEND_MR_S,
BLIS_EXTEND_MR_D,
BLIS_EXTEND_MR_C,
BLIS_EXTEND_MR_Z );
trsm_extnr = bli_blksz_obj_create( BLIS_EXTEND_NR_S,
BLIS_EXTEND_NR_D,
BLIS_EXTEND_NR_C,
BLIS_EXTEND_NR_Z );
trsm_extkr = bli_blksz_obj_create( BLIS_EXTEND_KR_S,
BLIS_EXTEND_KR_D,
BLIS_EXTEND_KR_C,
BLIS_EXTEND_KR_Z );
trsm_ni = bli_blksz_obj_create( BLIS_DEFAULT_NI_S,
BLIS_DEFAULT_NI_D,
BLIS_DEFAULT_NI_C,
BLIS_DEFAULT_NI_Z );
// Create control tree objects for packm operations on a, b, and c.
@@ -143,8 +120,10 @@ void bli_trsm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3, // pack panels of A compactly
trsm_mr, // IMPORTANT: n dim multiple must be mr to
trsm_mr, // support right and bottom-right edge cases
// IMPORTANT: n dim multiple must be mr to
// support right and bottom-right edge cases
trsm_mr, trsm_extmr,
trsm_mr, trsm_extmr,
FALSE, // do NOT scale by alpha
TRUE, // densify
TRUE, // invert diagonal
@@ -157,8 +136,10 @@ void bli_trsm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
trsm_mr, // IMPORTANT: m dim multiple must be mr since
trsm_nr, // B_pack is updated (ie: serves as C) in trsm
// IMPORTANT: m dim multiple must be mr since
// B_pack is updated (ie: serves as C) in trsm
trsm_mr, trsm_extmr,
trsm_nr, trsm_extnr,
FALSE, // do NOT scale by alpha
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal
@@ -171,8 +152,8 @@ void bli_trsm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_UNBLOCKED,
BLIS_VARIANT1,
trsm_mr,
trsm_nr,
trsm_mr, trsm_extmr,
trsm_nr, trsm_extnr,
FALSE, // do NOT scale by beta
FALSE, // already dense; densify not necessary
FALSE, // do NOT invert diagonal

View File

@@ -138,7 +138,7 @@ void PASTEMAC(ch,varname)( \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,kc) * \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,nr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
@@ -154,6 +154,8 @@ void PASTEMAC(ch,varname)( \
/* Alias constants to shorter names. */ \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t PACKMR = PASTEMAC(ch,packmr); \
const dim_t PACKNR = PASTEMAC(ch,packnr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
@@ -252,7 +254,7 @@ void PASTEMAC(ch,varname)( \
k_nr = k_a1011 * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = k * MR; \
rstep_a = k * PACKMR; \
\
cstep_b = ps_b; \
\
@@ -305,14 +307,14 @@ void PASTEMAC(ch,varname)( \
k_a1011 = bli_min( k, diagoffa_i + MR ); \
k_a10 = k_a1011 - MR; \
\
b11 = b1 + diagoffa_i * NR; \
b11 = b1 + diagoffa_i * PACKNR; \
bp_i = bp + off_a1011 * NR * NDUP; \
\
/* Compute the addresses of the A10 panel and triangular
block A11, and the corresponding panel Bd01 and block
Bd11. */ \
a10 = a1; \
a11 = a1 + k_a10 * MR; \
a11 = a1 + k_a10 * PACKMR; \
bp01 = bp_i; \
bp11 = bp_i + k_a10 * NR * NDUP; \
\
@@ -354,7 +356,7 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_l_ker_var2: bp11 (diag)", MR, NR, bp11, NR,
c11, rs_c, cs_c ); \
} \
\
a1 += k_a1011 * MR; \
a1 += k_a1011 * PACKMR; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \

View File

@@ -138,7 +138,7 @@ void PASTEMAC(ch,varname)( \
) \
{ \
/* Temporary buffer for duplicating elements of B. */ \
ctype bd[ PASTEMAC(ch,kc) * \
ctype bd[ PASTEMAC(ch,maxkc) * \
PASTEMAC(ch,nr) * \
PASTEMAC(ch,ndup) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
@@ -154,6 +154,8 @@ void PASTEMAC(ch,varname)( \
/* Alias constants to shorter names. */ \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
const dim_t PACKMR = PASTEMAC(ch,packmr); \
const dim_t PACKNR = PASTEMAC(ch,packnr); \
const dim_t NDUP = PASTEMAC(ch,ndup); \
const bool_t DUPB = NDUP != 1; \
\
@@ -252,7 +254,7 @@ void PASTEMAC(ch,varname)( \
k_nr = k_a1112 * NR; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = k * MR; \
rstep_a = k * PACKMR; \
\
cstep_b = ps_b; \
\
@@ -310,14 +312,14 @@ void PASTEMAC(ch,varname)( \
/* Index into b1 (if the diagonal offset is positive) to
locate the MR x NR block of b1 that will be updated by the
trsm subproblem. */ \
b11 = b1 + off_a1112 * NR; \
b11 = b1 + off_a1112 * PACKNR; \
bp_i = bp + off_a1112 * NR * NDUP; \
\
/* Compute the addresses of the A12 panel and triangular
block A11, and the corresponding panel Bd21 and block
Bd11. */ \
a11 = a1; \
a12 = a1 + k_a11 * MR; \
a12 = a1 + k_a11 * PACKMR; \
bp11 = bp_i; \
bp21 = bp_i + k_a11 * NR * NDUP; \
\
@@ -374,7 +376,7 @@ PASTEMAC(ch,fprintm)( stdout, "trsm_u_ker_var2: ct after (diag)", m_cur, n_cur,
c11, rs_c, cs_c ); \
} \
\
a1 += k_a1112 * MR; \
a1 += k_a1112 * PACKMR; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \

View File

@@ -49,9 +49,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
) \
{ \
const dim_t NR = PASTEMAC(ch,nr); \
\
const inc_t rs_b = NR; \
const inc_t rs_b = PASTEMAC(ch,packnr); \
const inc_t cs_b = 1; \
\
ctype* restrict minus_one = PASTEMAC(ch,m1); \

View File

@@ -49,9 +49,7 @@ void PASTEMAC(ch,varname)( \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
) \
{ \
const dim_t NR = PASTEMAC(ch,nr); \
\
const inc_t rs_b = NR; \
const inc_t rs_b = PASTEMAC(ch,packnr); \
const inc_t cs_b = 1; \
\
ctype* restrict minus_one = PASTEMAC(ch,m1); \

View File

@@ -45,16 +45,13 @@ void PASTEMAC(ch,varname)( \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
) \
{ \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
\
const dim_t m = MR; \
const dim_t n = NR; \
const dim_t m = PASTEMAC(ch,mr); \
const dim_t n = PASTEMAC(ch,nr); \
\
const inc_t rs_a = 1; \
const inc_t cs_a = MR; \
const inc_t cs_a = PASTEMAC(ch,packmr); \
\
const inc_t rs_b = NR; \
const inc_t rs_b = PASTEMAC(ch,packnr); \
const inc_t cs_b = 1; \
\
dim_t iter, i, j, k; \

View File

@@ -45,16 +45,13 @@ void PASTEMAC(ch,varname)( \
ctype* restrict c, inc_t rs_c, inc_t cs_c \
) \
{ \
const dim_t MR = PASTEMAC(ch,mr); \
const dim_t NR = PASTEMAC(ch,nr); \
\
const dim_t m = MR; \
const dim_t n = NR; \
const dim_t m = PASTEMAC(ch,mr); \
const dim_t n = PASTEMAC(ch,nr); \
\
const inc_t rs_a = 1; \
const inc_t cs_a = MR; \
const inc_t cs_a = PASTEMAC(ch,packmr); \
\
const inc_t rs_b = NR; \
const inc_t rs_b = PASTEMAC(ch,packnr); \
const inc_t cs_b = 1; \
\
dim_t iter, i, j, k; \

View File

@@ -81,11 +81,19 @@ dim_t bli_blksz_for_obj( obj_t* obj,
return b->v[ bli_obj_datatype( *obj ) ];
}
extern blksz_t* gemm_mc;
extern blksz_t* gemm_nc;
extern blksz_t* gemm_kc;
extern blksz_t* gemm_mr;
extern blksz_t* gemm_nr;
extern blksz_t* gemm_kr;
dim_t bli_determine_blocksize_f( dim_t i,
dim_t dim,
obj_t* obj,
blksz_t* b )
{
#if 0
num_t dt;
dim_t b_alg;
@@ -103,7 +111,138 @@ dim_t bli_determine_blocksize_f( dim_t i,
// smaller, in which case we return that remaining value.
b_alg = bli_min( b_alg, dim - i );
//printf( "bli_determine_blocksize0: returning %lu\n", b_alg );
return b_alg;
#endif
#if 0
num_t dt;
dim_t b_alg, b_now;
dim_t mc, nc, kc;
dim_t mr, nr, kr;
dim_t dim_left_now;
dt = bli_obj_execution_datatype( *obj );
b_alg = bli_blksz_for_type( dt, b );
mc = bli_blksz_for_type( dt, gemm_mc );
nc = bli_blksz_for_type( dt, gemm_nc );
kc = bli_blksz_for_type( dt, gemm_kc );
mr = bli_blksz_for_type( dt, gemm_mr );
nr = bli_blksz_for_type( dt, gemm_nr );
kr = bli_blksz_for_type( dt, gemm_kr );
dim_left_now = dim - i;
if ( dim_left_now <= b_alg )
{
b_now = dim_left_now;
}
else if ( dim_left_now <= b_alg + (b_alg/4) )
{
b_now = dim_left_now / 2;
// This actually won't work when, for example, mc == kc but mr != kr.
if ( b_alg == mc ) b_now = bli_align_dim_to_mult( b_now, mr );
else if ( b_alg == nc ) b_now = bli_align_dim_to_mult( b_now, nr );
else if ( b_alg == kc ) b_now = bli_align_dim_to_mult( b_now, kr );
}
else
{
b_now = b_alg;
}
//printf( "bli_determine_blocksize1: returning %lu\n", b_now );
return b_now;
#endif
#if 0
num_t dt;
dim_t b_alg, b_now;
dim_t mc, nc, kc;
dim_t mr, nr, kr;
dim_t dim_left_now;
dt = bli_obj_execution_datatype( *obj );
b_alg = bli_blksz_for_type( dt, b );
mc = bli_blksz_for_type( dt, gemm_mc );
nc = bli_blksz_for_type( dt, gemm_nc );
kc = bli_blksz_for_type( dt, gemm_kc );
mr = bli_blksz_for_type( dt, gemm_mr );
nr = bli_blksz_for_type( dt, gemm_nr );
kr = bli_blksz_for_type( dt, gemm_kr );
dim_left_now = dim - i;
if ( dim_left_now <= b_alg )
{
b_now = dim_left_now;
}
else if ( dim_left_now <= 2 * b_alg )
{
b_now = dim_left_now / 2;
// This actually won't work when, for example, mc == kc but mr != kr.
if ( b_alg == mc ) b_now = bli_align_dim_to_mult( b_now, mr );
else if ( b_alg == nc ) b_now = bli_align_dim_to_mult( b_now, nr );
else if ( b_alg == kc ) b_now = bli_align_dim_to_mult( b_now, kr );
}
else
{
b_now = b_alg;
}
//printf( "bli_determine_blocksize2: returning %lu\n", b_now );
return b_now;
#endif
#ifdef BLIS_EDGECASE_HACK
num_t dt;
dim_t b_alg, b_now;
dim_t dim_left_now;
dt = bli_obj_execution_datatype( *obj );
b_alg = bli_blksz_for_type( dt, b );
dim_left_now = dim - i;
if ( dim_left_now <= b_alg + b_alg/4 )
{
b_now = dim_left_now;
}
else
{
b_now = b_alg;
}
return b_now;
#else
num_t dt;
dim_t b_alg;
// We assume that this function is being called from an algorithm that
// is moving "forward" (ie: top to bottom, left to right, top-left
// to bottom-right).
// Extract the execution datatype and use it to query the corresponding
// blocksize value from the blksz_t object.
dt = bli_obj_execution_datatype( *obj );
b_alg = bli_blksz_for_type( dt, b );
// If we are moving "forward" (ie: top to bottom, left to right, or
// top-left to bottom-right), then return b_alg, unless dim - i is
// smaller, in which case we return that remaining value.
b_alg = bli_min( b_alg, dim - i );
return b_alg;
#endif
}
dim_t bli_determine_blocksize_b( dim_t i,

View File

@@ -37,9 +37,9 @@
// Define the size of pool blocks. These may be adjusted so that they can
// handle inflated blocksizes at edge cases.
#define BLIS_POOL_MC_D BLIS_DEFAULT_MC_D
#define BLIS_POOL_KC_D BLIS_DEFAULT_KC_D
#define BLIS_POOL_NC_D BLIS_DEFAULT_NC_D
#define BLIS_POOL_MC_D ( ( BLIS_MAXIMUM_MC_D * BLIS_PACKDIM_MR_D ) / BLIS_DEFAULT_MR_D )
#define BLIS_POOL_KC_D ( ( BLIS_MAXIMUM_KC_D * BLIS_PACKDIM_KR_D ) / BLIS_DEFAULT_KR_D )
#define BLIS_POOL_NC_D ( ( BLIS_MAXIMUM_NC_D * BLIS_PACKDIM_NR_D ) / BLIS_DEFAULT_NR_D )
// Define each pool's block size.
// NOTE: Here we assume the "worst" case of the register blocking

View File

@@ -510,8 +510,8 @@ void bli_obj_print( char* label, obj_t* obj )
fprintf( file, " - buf %p\n", bli_mem_buffer( pack_mem ) );
fprintf( file, " - buf_type %u\n", bli_mem_buf_type( pack_mem ) );
fprintf( file, " - size %lu\n", bli_mem_size( pack_mem ) );
fprintf( file, " m_packed %lu\n", bli_obj_packed_length( *obj ) );
fprintf( file, " n_packed %lu\n", bli_obj_packed_width( *obj ) );
fprintf( file, " m_padded %lu\n", bli_obj_padded_length( *obj ) );
fprintf( file, " n_padded %lu\n", bli_obj_padded_width( *obj ) );
fprintf( file, " ps %lu\n", bli_obj_panel_stride( *obj ) );
fprintf( file, "\n" );

View File

@@ -35,52 +35,214 @@
#ifndef BLIS_KERNEL_MACRO_DEFS_H
#define BLIS_KERNEL_MACRO_DEFS_H
#define SIZEOF_S 4
#define SIZEOF_D 8
#define SIZEOF_C 8
#define SIZEOF_Z 16
// Redefine kernel blocksizes, defined in bli_kernel.h, to shorter
// names that can be derived via PASTEMAC macro.
// Cache blocksizes
// -- Kernel macro checks ------------------------------------------------------
#define bli_smc BLIS_DEFAULT_MC_S
#define bli_snc BLIS_DEFAULT_NC_S
#define bli_skc BLIS_DEFAULT_KC_S
// Verify that cache blocksizes are whole multiples of register blocksizes.
// Specifically, verify that:
// - MC is a whole multiple of MR.
// - NC is a whole multiple of NR.
// - KC is a whole multiple of KR.
// These constraints are enforced because it makes it easier to handle diagonals
// in the macro-kernel implementations.
#if ( \
( BLIS_DEFAULT_MC_S % BLIS_DEFAULT_MR_S != 0 ) || \
( BLIS_DEFAULT_MC_D % BLIS_DEFAULT_MR_D != 0 ) || \
( BLIS_DEFAULT_MC_C % BLIS_DEFAULT_MR_C != 0 ) || \
( BLIS_DEFAULT_MC_Z % BLIS_DEFAULT_MR_Z != 0 ) \
)
#error MC must be multiple of MR for all datatypes.
#endif
#define bli_dmc BLIS_DEFAULT_MC_D
#define bli_dnc BLIS_DEFAULT_NC_D
#define bli_dkc BLIS_DEFAULT_KC_D
#if ( \
( BLIS_DEFAULT_NC_S % BLIS_DEFAULT_NR_S != 0 ) || \
( BLIS_DEFAULT_NC_D % BLIS_DEFAULT_NR_D != 0 ) || \
( BLIS_DEFAULT_NC_C % BLIS_DEFAULT_NR_C != 0 ) || \
( BLIS_DEFAULT_NC_Z % BLIS_DEFAULT_NR_Z != 0 ) \
)
#error NC must be multiple of NR for all datatypes.
#endif
#define bli_cmc BLIS_DEFAULT_MC_C
#define bli_cnc BLIS_DEFAULT_NC_C
#define bli_ckc BLIS_DEFAULT_KC_C
#if ( \
( BLIS_DEFAULT_KC_S % BLIS_DEFAULT_KR_S != 0 ) || \
( BLIS_DEFAULT_KC_D % BLIS_DEFAULT_KR_D != 0 ) || \
( BLIS_DEFAULT_KC_C % BLIS_DEFAULT_KR_C != 0 ) || \
( BLIS_DEFAULT_KC_Z % BLIS_DEFAULT_KR_Z != 0 ) \
)
#error KC must be multiple of KR for all datatypes.
#endif
#define bli_zmc BLIS_DEFAULT_MC_Z
#define bli_znc BLIS_DEFAULT_NC_Z
#define bli_zkc BLIS_DEFAULT_KC_Z
// Verify that cache blocksizes indicate consistent storage.
// Specifically, verify (comparing sizes in bytes, ie: scaled by SIZEOF_?)
// that:
// - MC_D * KC_D >= MC_? * KC_?.
// - KC_D * NC_D >= KC_? * NC_?.
// - MC_D * NC_D >= MC_? * NC_?. (This check is currently commented out.)
// These constraints are enforced because static memory is allocated for the
// contiguous memory allocator using the double-precision real values of MC,
// NC, and KC.
#if ( \
( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_KC_D * SIZEOF_D ) < \
( BLIS_DEFAULT_MC_S * BLIS_DEFAULT_KC_S * SIZEOF_S ) ) || \
( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_KC_D * SIZEOF_D ) < \
( BLIS_DEFAULT_MC_C * BLIS_DEFAULT_KC_C * SIZEOF_C ) ) || \
( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_KC_D * SIZEOF_D ) < \
( BLIS_DEFAULT_MC_Z * BLIS_DEFAULT_KC_Z * SIZEOF_Z ) ) \
)
#error MC_D*KC_D must be >= that of MC*KC for all other datatypes.
#endif
#if ( \
( ( BLIS_DEFAULT_KC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \
( BLIS_DEFAULT_KC_S * BLIS_DEFAULT_NC_S * SIZEOF_S ) ) || \
( ( BLIS_DEFAULT_KC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \
( BLIS_DEFAULT_KC_C * BLIS_DEFAULT_NC_C * SIZEOF_C ) ) || \
( ( BLIS_DEFAULT_KC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \
( BLIS_DEFAULT_KC_Z * BLIS_DEFAULT_NC_Z * SIZEOF_Z ) ) \
)
#error KC_D*NC_D must be >= that of KC*NC for all other datatypes.
#endif
/*
#if ( \
( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \
( BLIS_DEFAULT_MC_S * BLIS_DEFAULT_NC_S * SIZEOF_S ) ) || \
( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \
( BLIS_DEFAULT_MC_C * BLIS_DEFAULT_NC_C * SIZEOF_C ) ) || \
( ( BLIS_DEFAULT_MC_D * BLIS_DEFAULT_NC_D * SIZEOF_D ) < \
( BLIS_DEFAULT_MC_Z * BLIS_DEFAULT_NC_Z * SIZEOF_Z ) ) \
)
#error MC_D*NC_D must be >= that of MC*NC for all other datatypes.
#endif
*/
// -- Compute maximum cache blocksizes -----------------------------------------
#define BLIS_MAXIMUM_MC_S ( BLIS_DEFAULT_MC_S + BLIS_EXTEND_MC_S )
#define BLIS_MAXIMUM_KC_S ( BLIS_DEFAULT_KC_S + BLIS_EXTEND_KC_S )
#define BLIS_MAXIMUM_NC_S ( BLIS_DEFAULT_NC_S + BLIS_EXTEND_NC_S )
#define BLIS_MAXIMUM_MC_D ( BLIS_DEFAULT_MC_D + BLIS_EXTEND_MC_D )
#define BLIS_MAXIMUM_KC_D ( BLIS_DEFAULT_KC_D + BLIS_EXTEND_KC_D )
#define BLIS_MAXIMUM_NC_D ( BLIS_DEFAULT_NC_D + BLIS_EXTEND_NC_D )
#define BLIS_MAXIMUM_MC_C ( BLIS_DEFAULT_MC_C + BLIS_EXTEND_MC_C )
#define BLIS_MAXIMUM_KC_C ( BLIS_DEFAULT_KC_C + BLIS_EXTEND_KC_C )
#define BLIS_MAXIMUM_NC_C ( BLIS_DEFAULT_NC_C + BLIS_EXTEND_NC_C )
#define BLIS_MAXIMUM_MC_Z ( BLIS_DEFAULT_MC_Z + BLIS_EXTEND_MC_Z )
#define BLIS_MAXIMUM_KC_Z ( BLIS_DEFAULT_KC_Z + BLIS_EXTEND_KC_Z )
#define BLIS_MAXIMUM_NC_Z ( BLIS_DEFAULT_NC_Z + BLIS_EXTEND_NC_Z )
// -- Compute leading dim blocksizes used for packing --------------------------
#define BLIS_PACKDIM_MR_S ( BLIS_DEFAULT_MR_S + BLIS_EXTEND_MR_S )
#define BLIS_PACKDIM_KR_S ( BLIS_DEFAULT_KR_S + BLIS_EXTEND_KR_S )
#define BLIS_PACKDIM_NR_S ( BLIS_DEFAULT_NR_S + BLIS_EXTEND_NR_S )
#define BLIS_PACKDIM_MR_D ( BLIS_DEFAULT_MR_D + BLIS_EXTEND_MR_D )
#define BLIS_PACKDIM_KR_D ( BLIS_DEFAULT_KR_D + BLIS_EXTEND_KR_D )
#define BLIS_PACKDIM_NR_D ( BLIS_DEFAULT_NR_D + BLIS_EXTEND_NR_D )
#define BLIS_PACKDIM_MR_C ( BLIS_DEFAULT_MR_C + BLIS_EXTEND_MR_C )
#define BLIS_PACKDIM_KR_C ( BLIS_DEFAULT_KR_C + BLIS_EXTEND_KR_C )
#define BLIS_PACKDIM_NR_C ( BLIS_DEFAULT_NR_C + BLIS_EXTEND_NR_C )
#define BLIS_PACKDIM_MR_Z ( BLIS_DEFAULT_MR_Z + BLIS_EXTEND_MR_Z )
#define BLIS_PACKDIM_KR_Z ( BLIS_DEFAULT_KR_Z + BLIS_EXTEND_KR_Z )
#define BLIS_PACKDIM_NR_Z ( BLIS_DEFAULT_NR_Z + BLIS_EXTEND_NR_Z )
// -- Abbreviated kernel blocksize macros --------------------------------------
// Here, we shorten the blocksizes defined in bli_kernel.h so that they can
// be derived via the PASTEMAC macro.
// Default cache blocksizes
#define bli_smc BLIS_DEFAULT_MC_S
#define bli_skc BLIS_DEFAULT_KC_S
#define bli_snc BLIS_DEFAULT_NC_S
#define bli_dmc BLIS_DEFAULT_MC_D
#define bli_dkc BLIS_DEFAULT_KC_D
#define bli_dnc BLIS_DEFAULT_NC_D
#define bli_cmc BLIS_DEFAULT_MC_C
#define bli_ckc BLIS_DEFAULT_KC_C
#define bli_cnc BLIS_DEFAULT_NC_C
#define bli_zmc BLIS_DEFAULT_MC_Z
#define bli_zkc BLIS_DEFAULT_KC_Z
#define bli_znc BLIS_DEFAULT_NC_Z
// Maximum cache blocksizes: short bli_<type char>max<mc|kc|nc> aliases for the
// BLIS_MAXIMUM_[MKN]C_? macros.

#define bli_smaxmc    BLIS_MAXIMUM_MC_S
#define bli_smaxkc    BLIS_MAXIMUM_KC_S
#define bli_smaxnc    BLIS_MAXIMUM_NC_S

#define bli_dmaxmc    BLIS_MAXIMUM_MC_D
#define bli_dmaxkc    BLIS_MAXIMUM_KC_D
#define bli_dmaxnc    BLIS_MAXIMUM_NC_D

#define bli_cmaxmc    BLIS_MAXIMUM_MC_C
#define bli_cmaxkc    BLIS_MAXIMUM_KC_C
#define bli_cmaxnc    BLIS_MAXIMUM_NC_C

#define bli_zmaxmc    BLIS_MAXIMUM_MC_Z
#define bli_zmaxkc    BLIS_MAXIMUM_KC_Z
#define bli_zmaxnc    BLIS_MAXIMUM_NC_Z
// Register blocksizes
// Short bli_<type char><mr|nr|kr> aliases for the BLIS_DEFAULT_[MNK]R_? macros.
// NOTE(review): every alias in this group is defined twice with an identical
// replacement list (first in mr/nr/kr order, then in mr/kr/nr order). The C
// preprocessor accepts identical redefinitions, so the repeats are redundant
// rather than erroneous; they look like leftover old/new lines -- confirm
// before collapsing them.
#define bli_smr BLIS_DEFAULT_MR_S
#define bli_snr BLIS_DEFAULT_NR_S
#define bli_skr BLIS_DEFAULT_KR_S
#define bli_smr BLIS_DEFAULT_MR_S
#define bli_skr BLIS_DEFAULT_KR_S
#define bli_snr BLIS_DEFAULT_NR_S
#define bli_dmr BLIS_DEFAULT_MR_D
#define bli_dnr BLIS_DEFAULT_NR_D
#define bli_dkr BLIS_DEFAULT_KR_D
#define bli_dmr BLIS_DEFAULT_MR_D
#define bli_dkr BLIS_DEFAULT_KR_D
#define bli_dnr BLIS_DEFAULT_NR_D
#define bli_cmr BLIS_DEFAULT_MR_C
#define bli_cnr BLIS_DEFAULT_NR_C
#define bli_ckr BLIS_DEFAULT_KR_C
#define bli_cmr BLIS_DEFAULT_MR_C
#define bli_ckr BLIS_DEFAULT_KR_C
#define bli_cnr BLIS_DEFAULT_NR_C
#define bli_zmr BLIS_DEFAULT_MR_Z
#define bli_znr BLIS_DEFAULT_NR_Z
#define bli_zkr BLIS_DEFAULT_KR_Z
#define bli_zmr BLIS_DEFAULT_MR_Z
#define bli_zkr BLIS_DEFAULT_KR_Z
#define bli_znr BLIS_DEFAULT_NR_Z
// Duplication
// Micro-panel packing register blocksizes: short
// bli_<type char>pack<mr|kr|nr> aliases for the BLIS_PACKDIM_[MKN]R_? macros.

#define bli_spackmr    BLIS_PACKDIM_MR_S
#define bli_spackkr    BLIS_PACKDIM_KR_S
#define bli_spacknr    BLIS_PACKDIM_NR_S

#define bli_dpackmr    BLIS_PACKDIM_MR_D
#define bli_dpackkr    BLIS_PACKDIM_KR_D
#define bli_dpacknr    BLIS_PACKDIM_NR_D

#define bli_cpackmr    BLIS_PACKDIM_MR_C
#define bli_cpackkr    BLIS_PACKDIM_KR_C
#define bli_cpacknr    BLIS_PACKDIM_NR_C

#define bli_zpackmr    BLIS_PACKDIM_MR_Z
#define bli_zpackkr    BLIS_PACKDIM_KR_Z
#define bli_zpacknr    BLIS_PACKDIM_NR_Z
// Duplication factors
// Short bli_<type char>ndup aliases for the BLIS_DEFAULT_NUM_DUPL_? macros.
// NOTE(review): the four aliases are each defined twice with an identical
// replacement list. The C preprocessor accepts identical redefinitions, so
// the repeats are redundant rather than erroneous; they look like leftover
// old/new lines -- confirm before collapsing them.
#define bli_sndup BLIS_DEFAULT_NUM_DUPL_S
#define bli_dndup BLIS_DEFAULT_NUM_DUPL_D
#define bli_cndup BLIS_DEFAULT_NUM_DUPL_C
#define bli_zndup BLIS_DEFAULT_NUM_DUPL_Z
#define bli_sndup BLIS_DEFAULT_NUM_DUPL_S
#define bli_dndup BLIS_DEFAULT_NUM_DUPL_D
#define bli_cndup BLIS_DEFAULT_NUM_DUPL_C
#define bli_zndup BLIS_DEFAULT_NUM_DUPL_Z
#endif

View File

@@ -710,30 +710,44 @@ bli_obj_width_stored( obj )
// Packed dimensions query
#define bli_obj_packed_length( obj ) \
#define bli_obj_padded_length( obj ) \
\
( (obj).m_packed )
( (obj).m_padded )
#define bli_obj_packed_width( obj ) \
#define bli_obj_padded_width( obj ) \
\
( (obj).n_packed )
( (obj).n_padded )
// Packed dimensions modification
#define bli_obj_set_packed_length( m0, obj ) \
#define bli_obj_set_padded_length( m0, obj ) \
{ \
(obj).m_packed = m0; \
(obj).m_padded = m0; \
}
#define bli_obj_set_packed_width( n0, obj ) \
#define bli_obj_set_padded_width( n0, obj ) \
{ \
(obj).n_packed = n0; \
(obj).n_padded = n0; \
}
#define bli_obj_set_packed_dims( m0, n0, obj ) \
#define bli_obj_set_padded_dims( m0, n0, obj ) \
{ \
bli_obj_set_packed_length( m0, obj ); \
bli_obj_set_packed_width( n0, obj ); \
bli_obj_set_padded_length( m0, obj ); \
bli_obj_set_padded_width( n0, obj ); \
}
// Packed panel dimension query: evaluates to the pd field of obj.
#define bli_obj_panel_dim( obj ) ( (obj).pd )
// Packed panel dimension modification: stores panel_dim into the pd field of
// obj. The expansion is a braced compound statement.
#define bli_obj_set_panel_dim( panel_dim, obj ) \
{ \
	( obj ).pd = ( panel_dim ); \
}

View File

@@ -388,9 +388,11 @@ typedef struct obj_s
// Pack-related fields
mem_t pack_mem; // cached memory region for packing
dim_t m_packed;
dim_t n_packed;
dim_t m_padded; // m dimension of matrix, including any padding
dim_t n_padded; // n dimension of matrix, including any padding
inc_t ps; // panel stride (distance to next panel)
inc_t pd; // panel dimension (the "width" of a panel:
// usually MR or NR)
//mem_t cast_mem; // cached memory region for casting
@@ -445,8 +447,9 @@ typedef struct obj_s
those situations, we want the subpartition to inherit the pack_mem
field, and the corresponding packed dimensions, of its parent. */ \
(b).pack_mem = (a).pack_mem; \
(b).m_packed = (a).m_packed; \
(b).n_packed = (a).n_packed; \
(b).m_padded = (a).m_padded; \
(b).n_padded = (a).n_padded; \
(b).pd = (a).pd; \
(b).ps = (a).ps; \
\
/*(b).cast_mem = (a).cast_mem;*/ \

View File

@@ -240,8 +240,8 @@ int main( int argc, char** argv )
packm_cntl_a =
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
mr,
kr,
mr, NULL,
kr, NULL,
TRUE, // scale?
TRUE, // densify?
FALSE, // invert diagonal?
@@ -252,8 +252,8 @@ int main( int argc, char** argv )
packm_cntl_b =
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
kr,
nr,
kr, NULL,
nr, NULL,
FALSE, // scale?
FALSE, // densify?
FALSE, // invert diagonal?

View File

@@ -38,12 +38,11 @@
// transa transb m n k alpha a lda b ldb beta c ldc
void dgemm_( char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* );
//#define PRINT
#define PRINT
int main( int argc, char** argv )
{
obj_t a, b, c;
obj_t a_pack, b_pack;
obj_t c_save;
obj_t alpha, beta;
dim_t m, n, k;
@@ -54,6 +53,9 @@ int main( int argc, char** argv )
num_t dt_alpha, dt_beta;
int r, n_repeats;
#if 0
obj_t a_pack, b_pack;
blksz_t* mr;
blksz_t* nr;
blksz_t* kr;
@@ -70,6 +72,7 @@ int main( int argc, char** argv )
gemm_t* gemm_cntl_op_bp;
gemm_t* gemm_cntl_mm_op;
gemm_t* gemm_cntl_vl_mm;
#endif
double dtime;
double dtime_save;
@@ -132,6 +135,7 @@ int main( int argc, char** argv )
bli_setsc( (2.0/1.0), 0.0, &alpha );
bli_setsc( -(1.0/1.0), 0.0, &beta );
#if 0
mr = bli_blksz_obj_create( 2, 4, 2, 2 );
kr = bli_blksz_obj_create( 1, 1, 1, 1 );
nr = bli_blksz_obj_create( 1, 4, 1, 1 );
@@ -215,7 +219,7 @@ int main( int argc, char** argv )
bli_obj_init_pack( &a_pack );
bli_obj_init_pack( &b_pack );
#endif
bli_copym( &c, &c_save );
@@ -291,6 +295,7 @@ int main( int argc, char** argv )
printf( "( %2ld, 1:5 ) = [ %4lu %4lu %4lu %10.3e %6.3f ];\n",
(p - p_begin + 1)/p_inc + 1, m, k, n, dtime_save, gflops );
#if 0
bli_obj_release_pack( &a_pack );
bli_obj_release_pack( &b_pack );
@@ -309,6 +314,7 @@ int main( int argc, char** argv )
bli_cntl_obj_free( gemm_cntl_op_bp );
bli_cntl_obj_free( gemm_cntl_mm_op );
bli_cntl_obj_free( gemm_cntl_vl_mm );
#endif
bli_obj_free( &alpha );
bli_obj_free( &beta );

View File

@@ -149,8 +149,8 @@ int main( int argc, char** argv )
packm_cntl_a =
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
mr,
kr,
mr, NULL,
kr, NULL,
FALSE, // scale?
TRUE, // densify?
FALSE, // invert diagonal?
@@ -162,8 +162,8 @@ int main( int argc, char** argv )
packm_cntl_b =
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
kr,
nr,
kr, NULL,
nr, NULL,
FALSE, // scale?
FALSE, // densify?
FALSE, // invert diagonal?

View File

@@ -146,8 +146,8 @@ int main( int argc, char** argv )
packm_cntl_a =
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
mr,
kr,
mr, NULL,
kr, NULL,
FALSE, // scale?
FALSE, // densify?
FALSE, // invert diagonal?
@@ -159,8 +159,8 @@ int main( int argc, char** argv )
packm_cntl_b =
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
kr,
nr,
kr, NULL,
nr, NULL,
FALSE, // scale?
FALSE, // densify?
FALSE, // invert diagonal?

View File

@@ -142,8 +142,8 @@ int main( int argc, char** argv )
packm_cntl_a =
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
mr,
kr,
mr, NULL,
kr, NULL,
FALSE, // scale?
FALSE, // densify?
FALSE, // invert diagonal?
@@ -155,8 +155,8 @@ int main( int argc, char** argv )
packm_cntl_b =
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
kr,
nr,
kr, NULL,
nr, NULL,
FALSE, // scale?
FALSE, // densify?
FALSE, // invert diagonal?

View File

@@ -162,8 +162,8 @@ int main( int argc, char** argv )
packm_cntl_a =
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
mr, // IMPORTANT: for consistency with trsm, "k" dim
mr, // multiple is set to mr.
mr, NULL, // IMPORTANT: for consistency with trsm, "k" dim
mr, NULL, // multiple is set to mr.
FALSE, // scale?
TRUE, // densify?
FALSE, // invert diagonal?
@@ -175,8 +175,8 @@ int main( int argc, char** argv )
packm_cntl_b =
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
mr, // IMPORTANT: m dim multiple here must be mr
nr, // since "k" dim multiple is set to mr above.
mr, NULL, // IMPORTANT: m dim multiple here must be mr
nr, NULL, // since "k" dim multiple is set to mr above.
FALSE, // scale?
FALSE, // densify?
FALSE, // invert diagonal?

View File

@@ -151,8 +151,8 @@ int main( int argc, char** argv )
packm_cntl_a =
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
mr, // IMPORTANT: "k" dim multiple must be mr to
mr, // support using ukernel for right/bottom-right
mr, NULL, // IMPORTANT: "k" dim multiple must be mr to
mr, NULL, // support using ukernel for right/bottom-right
// edge cases (see macro-kernel for comments).
FALSE, // scale?
TRUE, // densify?
@@ -165,8 +165,8 @@ int main( int argc, char** argv )
packm_cntl_b =
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
mr, // IMPORTANT: m dim multiple here must be mr
nr, // since "k" dim multiple is set to mr above.
mr, NULL, // IMPORTANT: m dim multiple here must be mr
nr, NULL, // since "k" dim multiple is set to mr above.
TRUE, // scale?
FALSE, // densify?
FALSE, // invert diagonal?

View File

@@ -490,6 +490,23 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
BLIS_DEFAULT_NC_C,
BLIS_DEFAULT_NC_Z );
libblis_test_fprintf_c( os, "\n" );
libblis_test_fprintf_c( os, "level-3 cache blksz exts s d c z \n" );
libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n",
BLIS_EXTEND_MC_S,
BLIS_EXTEND_MC_D,
BLIS_EXTEND_MC_C,
BLIS_EXTEND_MC_Z );
libblis_test_fprintf_c( os, " k dimension %5u %5u %5u %5u\n",
BLIS_EXTEND_KC_S,
BLIS_EXTEND_KC_D,
BLIS_EXTEND_KC_C,
BLIS_EXTEND_KC_Z );
libblis_test_fprintf_c( os, " n dimension %5u %5u %5u %5u\n",
BLIS_EXTEND_NC_S,
BLIS_EXTEND_NC_D,
BLIS_EXTEND_NC_C,
BLIS_EXTEND_NC_Z );
libblis_test_fprintf_c( os, "\n" );
libblis_test_fprintf_c( os, "level-3 register blocksizes \n" );
libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n",
BLIS_DEFAULT_MR_S,
@@ -502,6 +519,18 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
BLIS_DEFAULT_NR_C,
BLIS_DEFAULT_NR_Z );
libblis_test_fprintf_c( os, "\n" );
libblis_test_fprintf_c( os, "level-3 register blksz exts s d c z \n" );
libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n",
BLIS_EXTEND_MR_S,
BLIS_EXTEND_MR_D,
BLIS_EXTEND_MR_C,
BLIS_EXTEND_MR_Z );
libblis_test_fprintf_c( os, " n dimension %5u %5u %5u %5u\n",
BLIS_EXTEND_NR_S,
BLIS_EXTEND_NR_D,
BLIS_EXTEND_NR_C,
BLIS_EXTEND_NR_Z );
libblis_test_fprintf_c( os, "\n" );
libblis_test_fprintf_c( os, "level-3 packing duplication \n" );
libblis_test_fprintf_c( os, " dupl. factors for B %5u %5u %5u %5u\n",
BLIS_DEFAULT_NUM_DUPL_S,