Added support for pre-broadcast when packing B.

Details:
- Added support for being able to duplicate (broadcast) elements in
  memory when packing matrix B (ie: the right-hand operand) in level-3
  operations. This turns out to be advantageous for some architectures
  that can afford the cost of the extra bandwidth and somehow benefit
  from the pre-broadcast elements (and thus being able to avoid using
  broadcast-style load instructions on micro-rows of B in the gemm
  microkernel).
- Support optionally disabling right-side hemm and symm. If this occurs,
  hemm_r is implemented in terms of hemm_l (and symm_r in terms of
  symm_l). This is needed when broadcasting during packing because the
  alternative--supporting the broadcast of B while also allowing matrix
  B to be Hermitian/symmetric--would be an absolute mess.
- Support alignment factors for packed blocks of A, B, and C separately
  (as well as for general-purpose buffers). In addition, we support
  byte offsets from those alignment values (which is different from
  aligning by align+offset bytes to begin with). The default alignment
  values are BLIS_PAGE_SIZE in all four cases, with the offset values
  defaulting to zero.
- Pass pack_t schema into bli_?packm_cxk() so that it can be then passed
  into the packm kernel, where it will be needed by packm kernels that
  perform broadcasts of B, since the idea is that we *only* want to
  broadcast when packing micropanels of B and not A.
- Added definition for variadic bli_cntx_set_l3_vir_ukrs(), which can be
  used to set custom virtual level-3 microkernels in the cntx_t, which
  would typically be done in the bli_cntx_init_*() function defined in
  the subconfiguration of interest.
- Added a "broadcast B" kernel function for use with NP/NR = 12/6,
  defined in ref_kernels/1m/bli_packm_cxk_bb_ref.c.
- Added gemm, gemmtrsm, and trsm "broadcast B" reference kernels
  defined in ref_kernels/3/bb. (These kernels have been tested with
  double real with NP/NR = 12/6.)
- Added #ifndef ... #endif guards around several macro constants defined
  in frame/include/bli_kernel_macro_defs.h.
- Defined a few "broadcast B" static functions in
  frame/include/level0/bb for use by "broadcast B"-style packm reference
  kernels. For now, only the real domain kernels are tested and fully
  defined.
- Output the alignment and offset values for packed blocks of A and B
  in the testsuite's "BLIS configuration info" section.
- Comment updates to various files.
- Bumped so_version to 3.0.0.
This commit is contained in:
Field G. Van Zee
2019-09-17 17:42:10 -05:00
parent fd9bf497cd
commit 31c8657f1d
36 changed files with 2237 additions and 122 deletions

View File

@@ -0,0 +1,239 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Instantiate prototypes for packm kernels.
// NOTE(review): each *_PROT macro presumably declares a function named
// bli_<ch><name> (e.g. bli_dpackm_6xk_bb2_haswell_ref, which is the symbol
// registered in bli_cntx_init_haswell() below) -- confirm against the macro
// definitions, which are not visible in this file.
PACKM_KER_PROT( double, d, packm_6xk_bb2_haswell_ref )
// Instantiate prototypes for level-3 kernels.
// Only double-precision real ("d") "broadcast B" reference kernels are
// declared here: gemm, gemmtrsm (lower/upper), and trsm (lower/upper).
GEMM_UKR_PROT( double, d, gemmbb_haswell_ref )
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_haswell_ref )
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_haswell_ref )
TRSM_UKR_PROT( double, d, trsmbb_l_haswell_ref )
TRSM_UKR_PROT( double, d, trsmbb_u_haswell_ref )
// Initialize the context for the haswell subconfiguration, as configured here
// to exercise the double-precision "broadcast B" reference kernels.
//
// The function first installs the reference defaults via
// bli_cntx_init_haswell_ref() and then overrides, in order: the native
// level-3 microkernels, the virtual gemmtrsm microkernels, one packm kernel,
// the level-1f and level-1v kernels, the native blocksizes, and finally the
// sup thresholds, kernels, and blocksizes.
//
// Conventions visible in this function:
// - Each bli_cntx_set_*() call is variadic; its leading integer matches the
//   number of entries that follow in the active preprocessor branch and must
//   be kept in sync by hand whenever entries are added or removed.
// - The four numeric columns passed to bli_blksz_init[_easy]() are ordered
//   s, d, c, z. NOTE(review): -1 presumably means "leave this datatype's
//   existing value unchanged" -- confirm against bli_blksz_init_easy().
//
// cntx: the context to populate; it is passed to every setter below.
void bli_cntx_init_haswell( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_haswell_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
// The trailing TRUE/FALSE of each entry is that kernel's storage preference.
// NOTE(review): presumably TRUE = row preference, FALSE = column
// preference -- confirm against bli_cntx_set_l3_nat_ukrs().
bli_cntx_set_l3_nat_ukrs
(
#if 0
// Disabled: the optimized haswell assembly microkernels (8 entries).
8,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
#else
// Active: the double-precision "broadcast B" reference gemm and trsm
// microkernels (3 entries). The gemmtrsm kernels are registered separately
// as virtual microkernels below.
3,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemmbb_haswell_ref, FALSE,
BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_haswell_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_haswell_ref, FALSE,
#endif
cntx
);
// Update the context with customized virtual [gemm]trsm micro-kernels.
// Unlike the native entries above, these entries carry no storage
// preference argument.
bli_cntx_set_l3_vir_ukrs
(
2,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_haswell_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_haswell_ref,
cntx
);
// Update the context with optimized packm kernels.
// Only the double-precision 6xk slot is overridden, with the "bb2" kernel
// declared at the top of this file.
bli_cntx_set_packm_kers
(
1,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_haswell_ref,
cntx
);
// Update the context with optimized level-1f kernels.
// (These are the zen kernel implementations, for float and double only.)
bli_cntx_set_l1f_kers
(
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1v kernels.
// The count of 10 covers the active branch of each #if below: both branches
// of every #if contribute exactly two entries, so the count is the same
// whichever branch is enabled.
bli_cntx_set_l1v_kers
(
10,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
#if 0
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
#if 0
// Disabled: blocksizes matching the optimized assembly microkernels.
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 75, 192 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
#else
// Active: only the double-precision column is set.
// NR uses the two-row bli_blksz_init() form: 6 in the first row and 12 in
// the second. NOTE(review): the second row is presumably the packed NR
// ("NP"), i.e. NR doubled by the broadcast of B, which would match the
// "NP/NR = 12/6" kernels this configuration registers -- confirm against
// bli_blksz_init().
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 12, -1, -1 );
bli_blksz_init ( &blkszs[ BLIS_NR ], -1, 6, -1, -1,
-1, 12, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
#endif
// Level-1f fusing factors; set for all four datatypes in either branch.
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
// Each entry is: blocksize id, pointer to its blksz_t, and the id of the
// blocksize it must be a multiple of.
bli_cntx_set_blkszs
(
BLIS_NAT, 7,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
// Only the double-precision thresholds are set, and each is set to 1.
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 1, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 1, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 1, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
// One double-precision kernel is registered per storage combination id
// (BLIS_RRR ... BLIS_CCC); the commented-out BLIS_RCR line is an unused
// alternative and is not included in the count of 8.
bli_cntx_set_l3_sup_kers
(
8,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
// NOTE: this overwrites the MR/NR/MC/KC/NC entries of blkszs[] that were
// already handed to bli_cntx_set_blkszs() above, so these statements must
// stay after that call. The sup values (MR 6, NR 8) correspond to the 6x8
// sup kernels rather than to the "broadcast B" native kernels.
// NOTE(review): the second row passed for MR (9) is presumably a maximum
// blocksize -- confirm against bli_blksz_init().
bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1,
-1, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}

View File

@@ -0,0 +1,163 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: the include guard is intentionally left commented out in this file
// (see also the matching commented-out #endif on the last line).
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// Address alignment, in bytes, for packed blocks of A and of B. These
// override the framework default (BLIS_PAGE_SIZE, per the commit description
// that introduced these macros).
#define BLIS_POOL_ADDR_ALIGN_SIZE_A 4096
#define BLIS_POOL_ADDR_ALIGN_SIZE_B 4096
// Byte offsets applied relative to the aligned addresses above; this differs
// from aligning to (align + offset) bytes. The framework default is zero.
// NOTE(review): A and B deliberately use different offsets (32 vs. 64); the
// rationale is not stated here -- confirm before changing.
#define BLIS_POOL_ADDR_OFFSET_SIZE_A 32
#define BLIS_POOL_ADDR_OFFSET_SIZE_B 64
// Everything from here to the matching #endif is compiled out. It holds
// legacy macro-based microkernel and blocksize selections; inside the
// disabled region, the nested "#if 1" / "#if 0" blocks mark which alternative
// was selected for each datatype when this scheme was still in use.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------
// -- sgemm micro-kernel --
#if 0
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 24
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
#if 1
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 6
#define BLIS_DEFAULT_NR_S 16
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
#if 0
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 16
#define BLIS_DEFAULT_NR_S 6
#endif
// -- dgemm micro-kernel --
#if 0
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12
#define BLIS_DEFAULT_MC_D 152
#define BLIS_DEFAULT_KC_D 160
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 12
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
#if 1
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8
#define BLIS_DEFAULT_MC_D 72
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 6
#define BLIS_DEFAULT_NR_D 8
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
#if 0
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6
#define BLIS_DEFAULT_MC_D 72
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 6
#endif
// -- cgemm micro-kernel --
#if 1
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8
#define BLIS_DEFAULT_MC_C 144
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4080
#define BLIS_DEFAULT_MR_C 3
#define BLIS_DEFAULT_NR_C 8
#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
#if 0
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3
#define BLIS_DEFAULT_MC_C 144
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4080
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 3
#endif
// -- zgemm micro-kernel --
#if 1
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4
#define BLIS_DEFAULT_MC_Z 72
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 4080
#define BLIS_DEFAULT_MR_Z 3
#define BLIS_DEFAULT_NR_Z 4
#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
#if 0
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3
#define BLIS_DEFAULT_MC_Z 72
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 4080
#define BLIS_DEFAULT_MR_Z 4
#define BLIS_DEFAULT_NR_Z 3
#endif
#endif
//#endif

View File

@@ -0,0 +1,98 @@
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration. (The statement that would
# add it to the running list of configurations included by common.mk is
# currently commented out below.)
THIS_CONFIG := haswell
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
# Emit debugging symbols unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
# Disable optimization only when DEBUG_TYPE is "noopt"; otherwise use -O3.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
# Choose the vectorization flags by compiler vendor. Only gcc, icc, and clang
# are accepted; any other vendor aborts the build via $(error ...).
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell
ifeq ($(GCC_OT_4_9_0),yes)
# If gcc is older than 4.9.0, we must use a different label for -march.
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
endif
else
ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xCORE-AVX2
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell
else
$(error gcc, icc, or clang is required for this configuration.)
endif
endif
endif
# Flags specific to reference kernels.
# Reference kernels reuse the optimized-kernel flags. For gcc and clang, two
# additional floating-point flags are appended; any other vendor (in practice
# icc, given the check above) uses the kernel vectorization flags unchanged.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif
endif
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

View File

@@ -86,6 +86,7 @@ INSERT_GENTDEF( packm )
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \

View File

@@ -44,6 +44,7 @@
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \

View File

@@ -40,6 +40,7 @@
void PASTEMAC(ch,opname) \
( \
conj_t conja, \
pack_t schema, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
@@ -73,36 +74,51 @@ void PASTEMAC(ch,opname) \
the outer (panel_dim_max - panel_dim) rows or columns of the
micropanel. (Note that these rows/columns correspond to values
beyond the edge of matrix A.) The kernel intrinsically knows its
own panel_dim_max, since that corresponds to the kernel's register
blocksize. However, we need to pass in panel_len_max because the
bottom-right edge case of trsm_lu will need all elements above the
extended diagonal and beyond (to the right of) the bottom-right
element to be initialized to zero so the trsm portion of the
computational kernel will operate with zeros for those iterations.
own panel_dim_max, since that corresponds to the packm kernel's
leading dimension. However, we *do* need to pass in panel_len_max
because the bottom-right edge case of trsm_lu will need all
elements above the extended diagonal and beyond (to the right of)
the bottom-right element to be initialized to zero so the trsm
portion of the computational kernel will operate with zeros for
those iterations.
As an example, if trsm_lu is executed on a 6x6 matrix, and the
gemmtrsm kernel uses MR = 6, the computation will begin with the
edge case, which is the bottom 2x2 matrix marked with x's. Code
in bli_packm_tri_cxk() will extend the diagonal as identity into
the remaining portion of the micropanel. But before that happens,
the packm kernel must have set the 0's shown below. (Unreferenced
elements are marked with '.'.)
For example, if trsm_lu is executed on a 10x10 triangular matrix,
and the gemmtrsm kernel uses MR = 6, the computation will begin
with the edge case, which is the bottom-right 4x4 upper triangular
matrix. Code in bli_packm_tri_cxk() will extend the diagonal as
identity into the remaining portion of the micropanel. But before
that happens, the packm kernel must have set the 0's added in
step (3) below.
x x 0 0 0 0
. x 0 0 0 0
. . 1 0 0 0
. . . 1 0 0
. . . . 1 0
. . . . . 1
packm kernel packm kernel packm kernel packm_tri_cxk
step 1: step 2: step 3: step 4:
In this case, panel_dim will be 2 because two rows of data are
copied from A, panel_len will be 2 because those two rows span
two columns of A, and panel_len_max will be 6 because there are a
total of 6 columns that can be written to, 4 of which lie beyond
the values copied from A. */ \
x x x x . . x x x x . . x x x x 0 0 x x x x 0 0
? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0
? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0
? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
x Copied from A; valid element.
? Copied from A, but value is unknown and unused.
. Uninitialized.
0 Initialized to zero.
1 Initialized to one.
NOTE: In step 5 (not shown), bli_packm_tri_cxk() sets the ?'s
to zero. This is not needed to support trsm, but rather to
support trmm. (Both use the same packing format and code.)
In this case, panel_dim will be 4 because four rows of data are
copied from A, panel_len will be 4 because those four rows span
four columns of A, and panel_len_max will be 6 because there are a
total of 6 columns that can be written to in the packed micropanel,
2 of which lie beyond the values copied from A. */ \
f \
( \
conja, \
schema, \
panel_dim, \
panel_len, \
panel_len_max, \

View File

@@ -39,6 +39,7 @@
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
pack_t schema, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \

View File

@@ -261,7 +261,7 @@ siz_t bli_packm_init_pack
bli_obj_toggle_uplo( p );
}
// If we are packing micro-panels, mark P as dense. Otherwise, we are
// If we are packing micropanels, mark P as dense. Otherwise, we are
// probably being called in the context of a level-2 operation, in
// which case we do not want to overwrite the uplo field of P (inherited
// from A) with BLIS_DENSE because that information may be needed by
@@ -368,28 +368,28 @@ siz_t bli_packm_init_pack
// default (logical) blocksize multiple in the m dimension.
m_panel = bmult_m_def;
// The "column stride" of a row panel packed object is interpreted as
// the column stride WITHIN a panel. Thus, this is equal to the
// packing (storage) blocksize multiple (which may be equal to the
// default (logical) blocksize multiple.
// The "column stride" of a row-micropanel packed object is interpreted
// as the column stride WITHIN a micropanel. Thus, this is equal to the
// packing (storage) blocksize multiple, which may be equal to the
// default (logical) blocksize multiple.
cs_p = bmult_m_pack;
// The "row stride" of a row panel packed object is interpreted
// as the row stride WITHIN a panel. Thus, it is unit.
// The "row stride" of a row-micropanel packed object is interpreted
// as the row stride WITHIN a micropanel. Thus, it is unit.
rs_p = 1;
// The "panel stride" of a panel packed object is interpreted as the
// distance between the (0,0) element of panel k and the (0,0)
// The "panel stride" of a micropanel packed object is interpreted as
// the distance between the (0,0) element of panel k and the (0,0)
// element of panel k+1. We use the padded width computed above to
// allow for zero-padding (if necessary/desired) along the far end
// of each panel (ie: the right edge of the matrix). Zero-padding
// can also occur along the long edge of the last panel if the m
// of each micropanel (ie: the right edge of the matrix). Zero-padding
// can also occur along the long edge of the last micropanel if the m
// dimension of the matrix is not a whole multiple of MR.
ps_p = cs_p * n_p_pad;
// As a general rule, we don't want panel strides to be odd. This
// As a general rule, we don't want micropanel strides to be odd. This
// is primarily motivated by our desire to support interleaved 3m
// micro-panels, in which case we have to scale the panel stride
// micropanels, in which case we have to scale the panel stride
// by 3/2. That division by 2 means the numerator (prior to being
// scaled by 3) must be even.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
@@ -399,7 +399,7 @@ siz_t bli_packm_init_pack
// Here, we adjust the panel stride, if necessary. Remember: ps_p is
// always interpreted as being in units of the datatype of the object
// which is not necessarily how the micro-panels will be stored. For
// which is not necessarily how the micropanels will be stored. For
// interleaved 3m, we will increase ps_p by 50%, and for ro/io/rpi,
// we halve ps_p. Why? Because the macro-kernel indexes in units of
// the complex datatype. So these changes "trick" it into indexing
@@ -418,11 +418,11 @@ siz_t bli_packm_init_pack
// If it is indeed odd, we nudge it higher.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Despite the fact that the packed micro-panels will contain
// Despite the fact that the packed micropanels will contain
// real elements, the panel stride that we store in the obj_t
// (which is passed into the macro-kernel) needs to be in units
// of complex elements, since the macro-kernel will index through
// micro-panels via complex pointer arithmetic for trmm/trsm.
// micropanels via complex pointer arithmetic for trmm/trsm.
// Since the indexing "increment" will be twice as large as each
// actual stored element, we divide the panel_stride by 2.
ps_p = ps_p / 2;
@@ -431,10 +431,10 @@ siz_t bli_packm_init_pack
// Set the imaginary stride (in units of fundamental elements) for
// 3m and 4m (separated or interleaved). We use ps_p_orig since
// that variable tracks the number of real part elements contained
// within each micro-panel of the source matrix. Therefore, this
// within each micropanel of the source matrix. Therefore, this
// is the number of real elements that must be traversed before
// reaching the imaginary part (3mi/4mi) of the packed micro-panel,
// or the real part of the next micro-panel (3ms).
// reaching the imaginary part (3mi/4mi) of the packed micropanel,
// or the real part of the next micropanel (3ms).
if ( bli_is_3mi_packed( schema ) ) is_p = ps_p_orig;
else if ( bli_is_4mi_packed( schema ) ) is_p = ps_p_orig;
else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( m_p_pad / m_panel );
@@ -461,28 +461,29 @@ siz_t bli_packm_init_pack
// default (logical) blocksize multiple in the n dimension.
n_panel = bmult_n_def;
// The "row stride" of a column panel packed object is interpreted as
// the row stride WITHIN a panel. Thus, this is equal to the
// The "row stride" of a column-micropanel packed object is interpreted
// as the row stride WITHIN a micropanel. Thus, this is equal to the
// packing (storage) blocksize multiple (which may be equal to the
// default (logical) blocksize multiple).
rs_p = bmult_n_pack;
// The "column stride" of a column panel packed object is interpreted
// as the column stride WITHIN a panel. Thus, it is unit.
// The "column stride" of a column-micropanel packed object is
// interpreted as the column stride WITHIN a micropanel. Thus, it is
// unit.
cs_p = 1;
// The "panel stride" of a panel packed object is interpreted as the
// distance between the (0,0) element of panel k and the (0,0)
// The "panel stride" of a micropanel packed object is interpreted as
// the distance between the (0,0) element of panel k and the (0,0)
// element of panel k+1. We use the padded length computed above to
// allow for zero-padding (if necessary/desired) along the far end
// of each panel (ie: the bottom edge of the matrix). Zero-padding
// can also occur along the long edge of the last panel if the n
// of each micropanel (ie: the bottom edge of the matrix). Zero-padding
// can also occur along the long edge of the last micropanel if the n
// dimension of the matrix is not a whole multiple of NR.
ps_p = m_p_pad * rs_p;
// As a general rule, we don't want panel strides to be odd. This
// As a general rule, we don't want micropanel strides to be odd. This
// is primarily motivated by our desire to support interleaved 3m
// micro-panels, in which case we have to scale the panel stride
// micropanels, in which case we have to scale the panel stride
// by 3/2. That division by 2 means the numerator (prior to being
// scaled by 3) must be even.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
@@ -492,7 +493,7 @@ siz_t bli_packm_init_pack
// Here, we adjust the panel stride, if necessary. Remember: ps_p is
// always interpreted as being in units of the datatype of the object
// which is not necessarily how the micro-panels will be stored. For
// which is not necessarily how the micropanels will be stored. For
// interleaved 3m, we will increase ps_p by 50%, and for ro/io/rpi,
// we halve ps_p. Why? Because the macro-kernel indexes in units of
// the complex datatype. So these changes "trick" it into indexing
@@ -511,11 +512,11 @@ siz_t bli_packm_init_pack
// If it is indeed odd, we nudge it higher.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Despite the fact that the packed micro-panels will contain
// Despite the fact that the packed micropanels will contain
// real elements, the panel stride that we store in the obj_t
// (which is passed into the macro-kernel) needs to be in units
// of complex elements, since the macro-kernel will index through
// micro-panels via complex pointer arithmetic for trmm/trsm.
// micropanels via complex pointer arithmetic for trmm/trsm.
// Since the indexing "increment" will be twice as large as each
// actual stored element, we divide the panel_stride by 2.
ps_p = ps_p / 2;
@@ -524,10 +525,10 @@ siz_t bli_packm_init_pack
// Set the imaginary stride (in units of fundamental elements) for
// 3m and 4m (separated or interleaved). We use ps_p_orig since
// that variable tracks the number of real part elements contained
// within each micro-panel of the source matrix. Therefore, this
// within each micropanel of the source matrix. Therefore, this
// is the number of real elements that must be traversed before
// reaching the imaginary part (3mi/4mi) of the packed micro-panel,
// or the real part of the next micro-panel (3ms).
// reaching the imaginary part (3mi/4mi) of the packed micropanel,
// or the real part of the next micropanel (3ms).
if ( bli_is_3mi_packed( schema ) ) is_p = ps_p_orig;
else if ( bli_is_4mi_packed( schema ) ) is_p = ps_p_orig;
else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( n_p_pad / n_panel );

View File

@@ -100,6 +100,7 @@ void PASTEMAC(ch,varname) \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_dim_max, \
panel_len, \
@@ -338,6 +339,7 @@ void PASTEMAC(ch,varname) \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_dim_max, \
panel_len, \
@@ -436,6 +438,7 @@ void PASTEMAC(ch,varname) \
PASTEMAC(ch,kername) \
( \
conjc10, \
schema, \
p10_dim, \
panel_dim_max, \
p10_len, \
@@ -455,6 +458,7 @@ void PASTEMAC(ch,varname) \
PASTEMAC(ch,kername) \
( \
conjc12, \
schema, \
p12_dim, \
panel_dim_max, \
p12_len, \
@@ -561,6 +565,7 @@ void PASTEMAC(ch,varname) \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_dim_max, \
panel_len, \

View File

@@ -108,9 +108,9 @@ void bli_gemm_front
// schemas, as are contexts for 1m, and if necessary bli_gemm_md() would
// have made a copy and modified the schemas, so reading them from the
// context should be a safe bet at this point.) This is a sort of hack for
// communicating the desired pack schemas for to bli_gemm_cntl_create()
// (via bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows
// us to subsequently access the schemas from the control tree, which
// communicating the desired pack schemas to bli_gemm_cntl_create() (via
// bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows us
// to subsequently access the schemas from the control tree, which
// hopefully reduces some confusion, particularly in bli_packm_init().
const pack_t schema_a = bli_cntx_schema_a_block( cntx );
const pack_t schema_b = bli_cntx_schema_b_panel( cntx );

View File

@@ -69,10 +69,44 @@ void bli_hemm_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
#ifdef BLIS_DISABLE_HEMM_RIGHT
// NOTE: This case casts right-side hemm/symm in terms of left side. This
// is necessary when the current subconfiguration uses a gemm microkernel
// that assumes that the packing kernel will have already duplicated
// (broadcast) elements of B in the packed copy of B. Supporting
// duplication within the logic that packs micropanels from Hermitian/
// symmetric matrices would be ugly, and so we simply don't support it.
// As a consequence, those subconfigurations need a way to force the
// Hermitian/symmetric matrix to be on the left (and thus the general
// matrix to be on the right). So our solution is that in those cases,
// the subconfigurations simply #define BLIS_DISABLE_HEMM_RIGHT.
// If A is being multiplied from the right, transpose all operands
// so that we can perform the computation as if A were being multiplied
// from the left.
if ( bli_is_right( side ) )
{
bli_toggle_side( &side );
bli_obj_induce_trans( &a_local );
bli_obj_induce_trans( &b_local );
bli_obj_induce_trans( &c_local );
}
#else
// NOTE: This case computes right-side hemm/symm natively by packing
// elements of the Hermitian/symmetric matrix A to micropanels of the
// right-hand packed matrix operand "B", and elements of the general
// matrix B to micropanels of the left-hand packed matrix operand "A".
// This code path always gives us the opportunity to transpose the
// entire operation so that the effective storage format of the output
// matrix matches the microkernel's output preference. Thus, from a
// performance perspective, this case is preferred.
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( !bli_obj_is_1x1( &c_local ) )
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_toggle_side( &side );
@@ -81,12 +115,21 @@ void bli_hemm_front
bli_obj_induce_trans( &c_local );
}
// Swap A and B if multiplying A from the right so that "B" contains
// the Hermitian matrix.
// If the Hermitian/symmetric matrix A is being multiplied from the right,
// swap A and B so that the Hermitian/symmetric matrix will actually be on
// the right.
if ( bli_is_right( side ) )
{
bli_obj_swap( &a_local, &b_local );
}
#endif
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
bli_obj_set_as_root( &a_local );
bli_obj_set_as_root( &b_local );
bli_obj_set_as_root( &c_local );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any

View File

@@ -69,10 +69,44 @@ void bli_symm_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
#ifdef BLIS_DISABLE_HEMM_RIGHT
// NOTE: This case casts right-side hemm/symm in terms of left side. This
// is necessary when the current subconfiguration uses a gemm microkernel
// that assumes that the packing kernel will have already duplicated
// (broadcast) elements of B in the packed copy of B. Supporting
// duplication within the logic that packs micropanels from Hermitian/
// symmetric matrices would be ugly, and so we simply don't support it.
// As a consequence, those subconfigurations need a way to force the
// Hermitian/symmetric matrix to be on the left (and thus the general
// matrix to be on the right). So our solution is that in those cases,
// the subconfigurations simply #define BLIS_DISABLE_HEMM_RIGHT.
// If A is being multiplied from the right, transpose all operands
// so that we can perform the computation as if A were being multiplied
// from the left.
if ( bli_is_right( side ) )
{
bli_toggle_side( &side );
bli_obj_induce_trans( &a_local );
bli_obj_induce_trans( &b_local );
bli_obj_induce_trans( &c_local );
}
#else
// NOTE: This case computes right-side hemm/symm natively by packing
// elements of the Hermitian/symmetric matrix A to micropanels of the
// right-hand packed matrix operand "B", and elements of the general
// matrix B to micropanels of the left-hand packed matrix operand "A".
// This code path always gives us the opportunity to transpose the
// entire operation so that the effective storage format of the output
// matrix matches the microkernel's output preference. Thus, from a
// performance perspective, this case is preferred.
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( !bli_obj_is_1x1( &c_local ) )
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_toggle_side( &side );
@@ -80,12 +114,21 @@ void bli_symm_front
bli_obj_induce_trans( &c_local );
}
// Swap A and B if multiplying A from the right so that "B" contains
// the symmetric matrix.
// If the Hermitian/symmetric matrix A is being multiplied from the right,
// swap A and B so that the Hermitian/symmetric matrix will actually be on
// the right.
if ( bli_is_right( side ) )
{
bli_obj_swap( &a_local, &b_local );
}
#endif
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
bli_obj_set_as_root( &a_local );
bli_obj_set_as_root( &b_local );
bli_obj_set_as_root( &c_local );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any

View File

@@ -389,6 +389,7 @@ pool_t* bli_apool_array_elem
const siz_t num_blocks = 1;
const siz_t block_ptrs_len = 25;
const siz_t align_size = 16;
const siz_t offset_size = 0;
malloc_ft malloc_fp = BLIS_MALLOC_POOL;
free_ft free_fp = BLIS_FREE_POOL;
@@ -425,6 +426,7 @@ pool_t* bli_apool_array_elem
block_ptrs_len,
block_size,
align_size,
offset_size,
malloc_fp,
free_fp,
pool

View File

@@ -654,6 +654,128 @@ void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... )
// -----------------------------------------------------------------------------
void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... )
{
	// Installs caller-supplied level-3 virtual microkernels in a context.
	// Meant to be called from an architecture's bli_cntx_init_*() function
	// when the kernel developer wants non-default virtual microkernels, and
	// only after bli_cntx_init_defaults() has filled the context with the
	// default microkernels for all datatypes.
	//
	// The variable arguments are n_ukrs (id, datatype, function pointer)
	// triples followed by the context itself:
	//
	//   bli_cntx_set_l3_vir_ukrs
	//   (
	//     n_ukrs,
	//     l3ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp,
	//     l3ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp,
	//     ...
	//     cntx_t* cntx
	//   );
	//
	// Since the context is the final argument, every triple has to be read
	// (and buffered) before we know where to store it.

	va_list arg_list;

	// Scratch arrays that hold the triples until the context is known.
#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_vir_ukrs(): " );
#endif
	l3ukr_t* ids = bli_malloc_intl( n_ukrs * sizeof( *ids ) );

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_vir_ukrs(): " );
#endif
	num_t* dts = bli_malloc_intl( n_ukrs * sizeof( *dts ) );

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_vir_ukrs(): " );
#endif
	void_fp* fps = bli_malloc_intl( n_ukrs * sizeof( *fps ) );

	// -- Begin variable argument section --

	va_start( arg_list, n_ukrs );

	// Pull each triple off the argument list in its declared order:
	// microkernel id, then datatype, then function pointer.
	for ( dim_t k = 0; k < n_ukrs; ++k )
	{
		ids[ k ] = va_arg( arg_list, l3ukr_t );
		dts[ k ] = va_arg( arg_list, num_t );
		fps[ k ] = va_arg( arg_list, void_fp );
	}

	// Whatever follows the last triple is the context pointer.
	cntx_t* cntx = va_arg( arg_list, cntx_t* );

	va_end( arg_list );

	// -- End variable argument section --

	// Base of the context's array of level-3 virtual microkernel func_t
	// objects, which is indexed by microkernel id.
	func_t* vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx );

	// Replay the buffered triples into the context: for each one, set the
	// function pointer for the given datatype within the func_t selected
	// by the microkernel id. Only the virtual microkernel entries are
	// written here.
	for ( dim_t k = 0; k < n_ukrs; ++k )
	{
		func_t* slot = &vir_ukrs[ ids[ k ] ];

		bli_func_set_dt( fps[ k ], dts[ k ], slot );
	}

	// Release the scratch arrays.
#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_vir_ukrs(): " );
#endif
	bli_free_intl( ids );

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_vir_ukrs(): " );
#endif
	bli_free_intl( dts );

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_vir_ukrs(): " );
#endif
	bli_free_intl( fps );
}
// -----------------------------------------------------------------------------
void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... )
{
// This function can be called from the bli_cntx_init_*() function for

View File

@@ -738,6 +738,7 @@ BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... );

View File

@@ -43,25 +43,32 @@
static char* bli_version_str = BLIS_VERSION_STRING;
static char* bli_int_type_size_str = STRINGIFY_INT( BLIS_INT_TYPE_SIZE );
char* bli_info_get_version_str( void ) { return bli_version_str; }
char* bli_info_get_int_type_size_str( void ) { return bli_int_type_size_str; }
char* bli_info_get_version_str( void ) { return bli_version_str; }
char* bli_info_get_int_type_size_str( void ) { return bli_int_type_size_str; }
// -- General configuration-related --------------------------------------------
gint_t bli_info_get_int_type_size( void ) { return BLIS_INT_TYPE_SIZE; }
gint_t bli_info_get_num_fp_types( void ) { return BLIS_NUM_FP_TYPES; }
gint_t bli_info_get_max_type_size( void ) { return BLIS_MAX_TYPE_SIZE; }
gint_t bli_info_get_page_size( void ) { return BLIS_PAGE_SIZE; }
gint_t bli_info_get_simd_num_registers( void ) { return BLIS_SIMD_NUM_REGISTERS; }
gint_t bli_info_get_simd_size( void ) { return BLIS_SIMD_SIZE; }
gint_t bli_info_get_simd_align_size( void ) { return BLIS_SIMD_ALIGN_SIZE; }
gint_t bli_info_get_stack_buf_max_size( void ) { return BLIS_STACK_BUF_MAX_SIZE; }
gint_t bli_info_get_stack_buf_align_size( void ) { return BLIS_STACK_BUF_ALIGN_SIZE; }
gint_t bli_info_get_heap_addr_align_size( void ) { return BLIS_HEAP_ADDR_ALIGN_SIZE; }
gint_t bli_info_get_heap_stride_align_size( void ) { return BLIS_HEAP_STRIDE_ALIGN_SIZE; }
gint_t bli_info_get_pool_addr_align_size( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE; }
gint_t bli_info_get_int_type_size( void ) { return BLIS_INT_TYPE_SIZE; }
gint_t bli_info_get_num_fp_types( void ) { return BLIS_NUM_FP_TYPES; }
gint_t bli_info_get_max_type_size( void ) { return BLIS_MAX_TYPE_SIZE; }
gint_t bli_info_get_page_size( void ) { return BLIS_PAGE_SIZE; }
gint_t bli_info_get_simd_num_registers( void ) { return BLIS_SIMD_NUM_REGISTERS; }
gint_t bli_info_get_simd_size( void ) { return BLIS_SIMD_SIZE; }
gint_t bli_info_get_simd_align_size( void ) { return BLIS_SIMD_ALIGN_SIZE; }
gint_t bli_info_get_stack_buf_max_size( void ) { return BLIS_STACK_BUF_MAX_SIZE; }
gint_t bli_info_get_stack_buf_align_size( void ) { return BLIS_STACK_BUF_ALIGN_SIZE; }
gint_t bli_info_get_heap_addr_align_size( void ) { return BLIS_HEAP_ADDR_ALIGN_SIZE; }
gint_t bli_info_get_heap_stride_align_size( void ) { return BLIS_HEAP_STRIDE_ALIGN_SIZE; }
// Each of the following returns the value of the correspondingly named
// BLIS_POOL_ADDR_ALIGN_SIZE_* macro (suffixes: A, B, C, GEN).
gint_t bli_info_get_pool_addr_align_size_a( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE_A; }
gint_t bli_info_get_pool_addr_align_size_b( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE_B; }
gint_t bli_info_get_pool_addr_align_size_c( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE_C; }
gint_t bli_info_get_pool_addr_align_size_gen( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE_GEN; }
// Each of the following returns the value of the correspondingly named
// BLIS_POOL_ADDR_OFFSET_SIZE_* macro (suffixes: A, B, C, GEN).
gint_t bli_info_get_pool_addr_offset_size_a( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_A; }
gint_t bli_info_get_pool_addr_offset_size_b( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_B; }
gint_t bli_info_get_pool_addr_offset_size_c( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_C; }
gint_t bli_info_get_pool_addr_offset_size_gen( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_GEN; }
gint_t bli_info_get_enable_stay_auto_init( void )
{
#ifdef BLIS_ENABLE_STAY_AUTO_INITIALIZED

View File

@@ -53,7 +53,14 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void );

View File

@@ -52,11 +52,12 @@ void bli_membrk_init
{
membrk_t* restrict membrk = bli_membrk_query();
const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;
const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE_GEN;
malloc_ft malloc_fp = BLIS_MALLOC_POOL;
free_ft free_fp = BLIS_FREE_POOL;
// These fields are used for general-purpose allocation.
// These fields are used for general-purpose allocation (ie: buf_type
// equal to BLIS_BUFFER_FOR_GEN_USE) within bli_membrk_acquire_m().
bli_membrk_set_align_size( align_size, membrk );
bli_membrk_set_malloc_fp( malloc_fp, membrk );
bli_membrk_set_free_fp( free_fp, membrk );
@@ -348,8 +349,15 @@ void bli_membrk_init_pools
const dim_t block_ptrs_len_b = 80;
const dim_t block_ptrs_len_c = 0;
// Use the address alignment size designated (at configure-time) for pools.
const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;
// Use the address alignment sizes designated (at configure-time) for pools.
const siz_t align_size_a = BLIS_POOL_ADDR_ALIGN_SIZE_A;
const siz_t align_size_b = BLIS_POOL_ADDR_ALIGN_SIZE_B;
const siz_t align_size_c = BLIS_POOL_ADDR_ALIGN_SIZE_C;
// Use the offsets from the above alignments.
const siz_t offset_size_a = BLIS_POOL_ADDR_OFFSET_SIZE_A;
const siz_t offset_size_b = BLIS_POOL_ADDR_OFFSET_SIZE_B;
const siz_t offset_size_c = BLIS_POOL_ADDR_OFFSET_SIZE_C;
// Use the malloc() and free() designated (at configure-time) for pools.
malloc_ft malloc_fp = BLIS_MALLOC_POOL;
@@ -362,12 +370,12 @@ void bli_membrk_init_pools
cntx );
// Initialize the memory pools for A, B, and C.
bli_pool_init( num_blocks_a, block_ptrs_len_a, block_size_a, align_size,
malloc_fp, free_fp, pool_a );
bli_pool_init( num_blocks_b, block_ptrs_len_b, block_size_b, align_size,
malloc_fp, free_fp, pool_b );
bli_pool_init( num_blocks_c, block_ptrs_len_c, block_size_c, align_size,
malloc_fp, free_fp, pool_c );
bli_pool_init( num_blocks_a, block_ptrs_len_a, block_size_a, align_size_a,
offset_size_a, malloc_fp, free_fp, pool_a );
bli_pool_init( num_blocks_b, block_ptrs_len_b, block_size_b, align_size_b,
offset_size_b, malloc_fp, free_fp, pool_b );
bli_pool_init( num_blocks_c, block_ptrs_len_c, block_size_c, align_size_c,
offset_size_c, malloc_fp, free_fp, pool_c );
}
void bli_membrk_finalize_pools

View File

@@ -43,6 +43,7 @@ void bli_pool_init
siz_t block_ptrs_len,
siz_t block_size,
siz_t align_size,
siz_t offset_size,
malloc_ft malloc_fp,
free_ft free_fp,
pool_t* restrict pool
@@ -67,8 +68,8 @@ void bli_pool_init
for ( dim_t i = 0; i < num_blocks; ++i )
{
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_pool_init(): allocating block %d of size %d (align %d).\n",
( int )i, ( int )block_size, ( int )align_size );
printf( "bli_pool_init(): allocating block %d of size %d (align %d, offset %d).\n",
( int )i, ( int )block_size, ( int )align_size, ( int )offset_size );
fflush( stdout );
#endif
@@ -76,6 +77,7 @@ void bli_pool_init
(
block_size,
align_size,
offset_size,
malloc_fp,
&(block_ptrs[i])
);
@@ -99,6 +101,7 @@ void bli_pool_init
bli_pool_set_num_blocks( num_blocks, pool );
bli_pool_set_block_size( block_size, pool );
bli_pool_set_align_size( align_size, pool );
bli_pool_set_offset_size( offset_size, pool );
bli_pool_set_malloc_fp( malloc_fp, pool );
bli_pool_set_free_fp( free_fp, pool );
}
@@ -135,12 +138,16 @@ void bli_pool_finalize
free_ft free_fp = bli_pool_free_fp( pool );
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_pool_finalize(): freeing %d blocks of size %d (align %d).\n",
printf( "bli_pool_finalize(): freeing %d blocks of size %d (align %d, offset %d).\n",
( int )num_blocks, ( int )bli_pool_block_size( pool ),
( int )bli_pool_align_size( pool ) );
( int )bli_pool_align_size( pool ),
( int )bli_pool_offset_size( pool ) );
fflush( stdout );
#endif
// Query the offset size of the pool.
const siz_t offset_size = bli_pool_offset_size( pool );
// Free the individual blocks currently in the pool.
for ( dim_t i = 0; i < num_blocks; ++i )
{
@@ -148,7 +155,7 @@ void bli_pool_finalize
printf( "bli_pool_finalize(): block %d: ", ( int )i );
#endif
bli_pool_free_block( free_fp, &(block_ptrs[i]) );
bli_pool_free_block( offset_size, free_fp, &(block_ptrs[i]) );
}
#ifdef BLIS_ENABLE_MEM_TRACING
@@ -169,6 +176,7 @@ void bli_pool_finalize
bli_pool_set_top_index( 0, pool );
bli_pool_set_block_size( 0, pool );
bli_pool_set_align_size( 0, pool );
bli_pool_set_offset_size( 0, pool );
#endif
}
@@ -178,6 +186,7 @@ void bli_pool_reinit
siz_t block_ptrs_len_new,
siz_t block_size_new,
siz_t align_size_new,
siz_t offset_size_new,
pool_t* restrict pool
)
{
@@ -202,6 +211,7 @@ void bli_pool_reinit
block_ptrs_len_new,
block_size_new,
align_size_new,
offset_size_new,
malloc_fp,
free_fp,
pool
@@ -223,6 +233,7 @@ void bli_pool_checkout_block
const siz_t num_blocks_new = bli_pool_num_blocks( pool );
const siz_t block_ptrs_len_new = bli_pool_block_ptrs_len( pool );
const siz_t align_size_new = bli_pool_align_size( pool );
const siz_t offset_size_new = bli_pool_offset_size( pool );
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_pool_checkout_block(): old block size %d < req size %d; "
@@ -237,6 +248,7 @@ void bli_pool_checkout_block
block_ptrs_len_new,
req_size,
align_size_new,
offset_size_new,
pool
);
}
@@ -293,10 +305,13 @@ void bli_pool_checkin_block
// has since been reinitialized to a different (larger) block size.
if ( bli_pblk_block_size( block ) != bli_pool_block_size( pool ) )
{
// Query the offset size of the pool.
const siz_t offset_size = bli_pool_offset_size( pool );
// Query the free() function pointer for the pool.
free_ft free_fp = bli_pool_free_fp( pool );
bli_pool_free_block( free_fp, block );
bli_pool_free_block( offset_size, free_fp, block );
return;
}
@@ -308,9 +323,10 @@ void bli_pool_checkin_block
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_pool_checkin_block(): checking in block %d of size %d "
"(align %d).\n",
"(align %d, offset %d).\n",
( int )top_index - 1, ( int )bli_pool_block_size( pool ),
( int )bli_pool_align_size( pool ) );
( int )bli_pool_align_size( pool ),
( int )bli_pool_offset_size( pool ) );
fflush( stdout );
#endif
@@ -396,8 +412,9 @@ void bli_pool_grow
pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool );
// Query the block size and alignment size of the pool.
const siz_t block_size = bli_pool_block_size( pool );
const siz_t align_size = bli_pool_align_size( pool );
const siz_t block_size = bli_pool_block_size( pool );
const siz_t align_size = bli_pool_align_size( pool );
const siz_t offset_size = bli_pool_offset_size( pool );
// Query the malloc() function pointer for the pool.
malloc_ft malloc_fp = bli_pool_malloc_fp( pool );
@@ -415,6 +432,7 @@ void bli_pool_grow
(
block_size,
align_size,
offset_size,
malloc_fp,
&(block_ptrs[i])
);
@@ -456,13 +474,16 @@ void bli_pool_shrink
// Compute the new total number of blocks.
const siz_t num_blocks_new = num_blocks - num_blocks_sub;
// Query the offset size of the pool.
const siz_t offset_size = bli_pool_offset_size( pool );
// Query the free() function pointer for the pool.
free_ft free_fp = bli_pool_free_fp( pool );
// Free the individual blocks.
for ( dim_t i = num_blocks_new; i < num_blocks; ++i )
{
bli_pool_free_block( free_fp, &(block_ptrs[i]) );
bli_pool_free_block( offset_size, free_fp, &(block_ptrs[i]) );
}
// Update the pool_t struct.
@@ -477,22 +498,25 @@ void bli_pool_alloc_block
(
siz_t block_size,
siz_t align_size,
siz_t offset_size,
malloc_ft malloc_fp,
pblk_t* restrict block
)
{
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_pool_alloc_block(): calling fmalloc_align(): size %d (align %d)\n",
( int )block_size, ( int )align_size );
printf( "bli_pool_alloc_block(): calling fmalloc_align(): size %d (align %d, offset %d)\n",
( int )block_size, ( int )align_size, ( int )offset_size );
fflush( stdout );
#endif
// Allocate the block via the bli_fmalloc_align() wrapper, which performs
// alignment logic and opaquely saves the original pointer so that it can
// be recovered when it's time to free the block.
// be recovered when it's time to free the block. Note that we have to
// add offset_size to the number of bytes requested since we will skip
// that many bytes at the beginning of the allocated memory.
void* restrict buf
=
bli_fmalloc_align( malloc_fp, block_size, align_size );
bli_fmalloc_align( malloc_fp, block_size + offset_size, align_size );
#if 0
// NOTE: This code is disabled because it is not needed, since
@@ -517,6 +541,9 @@ void bli_pool_alloc_block
}
#endif
// Advance the pointer by offset_size bytes.
buf = ( void* )( ( char* )buf + offset_size );
// Save the results in the pblk_t structure.
bli_pblk_set_buf( buf, block );
bli_pblk_set_block_size( block_size, block );
@@ -524,6 +551,7 @@ void bli_pool_alloc_block
void bli_pool_free_block
(
siz_t offset_size,
free_ft free_fp,
pblk_t* restrict block
)
@@ -538,6 +566,10 @@ void bli_pool_free_block
// bli_fmalloc_align() when the block was allocated.
void* restrict buf = bli_pblk_buf( block );
// Undo the pointer advancement by offset_size bytes performed previously
// by bli_pool_alloc_block().
buf = ( void* )( ( char* )buf - offset_size );
// Free the block via the bli_ffree_align() wrapper, which recovers the
// original pointer that was returned by the pool's malloc() function when
// the block was allocated.
@@ -555,7 +587,7 @@ void bli_pool_print
siz_t num_blocks = bli_pool_num_blocks( pool );
siz_t block_size = bli_pool_block_size( pool );
siz_t align_size = bli_pool_align_size( pool );
dim_t i;
siz_t offset_size = bli_pool_offset_size( pool );
printf( "pool struct ---------------\n" );
printf( " block_ptrs: %p\n", block_ptrs );
@@ -564,8 +596,10 @@ void bli_pool_print
printf( " num_blocks: %d\n", ( int )num_blocks );
printf( " block_size: %d\n", ( int )block_size );
printf( " align_size: %d\n", ( int )align_size );
printf( " offset_size: %d\n", ( int )offset_size );
printf( " pblks sys align\n" );
for ( i = 0; i < num_blocks; ++i )
for ( dim_t i = 0; i < num_blocks; ++i )
{
printf( " %d: %p\n", ( int )i, bli_pblk_buf( &block_ptrs[i] ) );
}

View File

@@ -126,6 +126,11 @@ static siz_t bli_pool_align_size( pool_t* pool )
return pool->align_size;
}
// Query the offset_size field of the given pool.
static siz_t bli_pool_offset_size( pool_t* pool )
{
return pool->offset_size;
}
static malloc_ft bli_pool_malloc_fp( pool_t* pool )
{
return pool->malloc_fp;
@@ -174,6 +179,11 @@ static void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \
pool->align_size = align_size;
}
// Set the offset_size field of the given pool. This only records the value
// in the struct.
static void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \
{
pool->offset_size = offset_size;
}
static void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \
{
pool->malloc_fp = malloc_fp;
@@ -197,6 +207,7 @@ void bli_pool_init
siz_t block_ptrs_len,
siz_t block_size,
siz_t align_size,
siz_t offset_size,
malloc_ft malloc_fp,
free_ft free_fp,
pool_t* restrict pool
@@ -211,6 +222,7 @@ void bli_pool_reinit
siz_t block_ptrs_len_new,
siz_t block_size_new,
siz_t align_size_new,
siz_t offset_size_new,
pool_t* restrict pool
);
@@ -241,11 +253,13 @@ void bli_pool_alloc_block
(
siz_t block_size,
siz_t align_size,
siz_t offset_size,
malloc_ft malloc_fp,
pblk_t* restrict block
);
void bli_pool_free_block
(
siz_t offset_size,
free_ft free_fp,
pblk_t* restrict block
);

View File

@@ -225,6 +225,24 @@ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varn
// -- Basic one-operand macro with real domain only --
// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
\
GENTFUNCRO( float, s, tfuncname ) \
GENTFUNCRO( double, d, tfuncname ) \
// -- (one auxiliary argument) --
#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
\
GENTFUNCRO( float, s, tfuncname, varname ) \
GENTFUNCRO( double, d, tfuncname, varname ) \
// -- Basic one-operand macro with complex domain only and real projection --
// -- (no auxiliary arguments) --

View File

@@ -165,19 +165,56 @@
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif
// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif
// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif
// Alignment size used when allocating blocks to the internal memory
// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
#define BLIS_POOL_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
#define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE
#endif
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
#define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE
#endif
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
#define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE
#endif
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE
#endif
// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A
#define BLIS_POOL_ADDR_OFFSET_SIZE_A 0
#endif
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B
#define BLIS_POOL_ADDR_OFFSET_SIZE_B 0
#endif
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C
#define BLIS_POOL_ADDR_OFFSET_SIZE_C 0
#endif
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN
#define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0
#endif

View File

@@ -195,9 +195,16 @@
#include "bli_adds_mxn_uplo.h"
#include "bli_set0s_mxn.h"
#include "bli_copys_mxn.h"
#include "bli_scal2s_mxn.h"
#include "bli_xpbys_mxn.h"
#include "bli_xpbys_mxn_uplo.h"
// -- "broadcast B" scalar macros --
#include "bli_bcastbbs_mxn.h"
#include "bli_scal2bbs_mxn.h"
#include "bli_set0bbs_mxn.h"
// -- 3m-specific scalar macros --

View File

@@ -1049,6 +1049,7 @@ typedef struct
siz_t block_size;
siz_t align_size;
siz_t offset_size;
malloc_ft malloc_fp;
free_ft free_fp;

View File

@@ -0,0 +1,74 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_BCASTBBS_MXN_H
#define BLIS_BCASTBBS_MXN_H
// bcastbbs_mxn
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
static void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* For every element (i,j) of the m x n matrix y (row stride incy, */ \
	/* column stride ldy), copy that element into the slots directly */ \
	/* after it. The total number of copies per element, original */ \
	/* included, is assumed to equal the column stride of y. */ \
	const dim_t n_copies = ldy; \
\
	for ( dim_t row = 0; row < m; ++row ) \
	{ \
		for ( dim_t col = 0; col < n; ++col ) \
		{ \
			/* Address of element (row,col): the value to replicate. */ \
			ctype* restrict src = y + row*incy + col*ldy; \
\
			/* Fill the unit-stride slots src[1] .. src[n_copies-1]. */ \
			for ( dim_t k = 1; k < n_copies; ++k ) \
			{ \
				ctype* restrict dst = src + k; \
\
				PASTEMAC(ch,copys)( *src, *dst ); \
			} \
		} \
	} \
}
INSERT_GENTFUNC_BASIC0( bcastbbs_mxn )
#endif

View File

@@ -0,0 +1,204 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SCAL2BBS_MXN_H
#define BLIS_SCAL2BBS_MXN_H
// scal2bbs_mxn
#undef GENTFUNCRO
#define GENTFUNCRO( ctype, ch, opname ) \
\
/* Real-domain scal2bbs_mxn. For each of the m x n element pairs, apply
   scal2s (or scal2js when conjx requests conjugation) with alpha and the
   element of x to produce the element of y, then copy that result into
   the (incy - 1) slots that follow it. */ \
static void PASTEMAC(ch,opname) \
     ( \
       const conj_t       conjx, \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict alpha, \
       ctype*    restrict x, const inc_t incx, const inc_t ldx, \
       ctype*    restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* The row stride of y is taken to be the duplication factor; the
	   duplicates are stored at unit stride. */ \
	const dim_t ndup    = incy; \
	const dim_t dup_inc = 1; \
\
	if ( !bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t col = 0; col < n; ++col ) \
		{ \
			for ( dim_t row = 0; row < m; ++row ) \
			{ \
				ctype* restrict src = x + col*ldx + row*incx; \
				ctype* restrict dst = y + col*ldy + row*incy; \
\
				PASTEMAC(ch,scal2s)( *alpha, *src, *dst ); \
\
				/* Replicate the freshly written value. */ \
				for ( dim_t q = 1; q < ndup; ++q ) \
				{ \
					ctype* restrict dup = dst + q*dup_inc; \
\
					PASTEMAC(ch,copys)( *dst, *dup ); \
				} \
			} \
		} \
	} \
	else /* conjugation requested */ \
	{ \
		for ( dim_t col = 0; col < n; ++col ) \
		{ \
			for ( dim_t row = 0; row < m; ++row ) \
			{ \
				ctype* restrict src = x + col*ldx + row*incx; \
				ctype* restrict dst = y + col*ldy + row*incy; \
\
				PASTEMAC(ch,scal2js)( *alpha, *src, *dst ); \
\
				/* Replicate the freshly written value. */ \
				for ( dim_t q = 1; q < ndup; ++q ) \
				{ \
					ctype* restrict dup = dst + q*dup_inc; \
\
					PASTEMAC(ch,copys)( *dst, *dup ); \
				} \
			} \
		} \
	} \
}
INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn )
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
/* Complex-domain scal2bbs_mxn. For each of the m x n element pairs, apply
   scal2ris (or scal2jris when conjx requests conjugation) with alpha and
   the element of x, writing the real and imaginary results to separate
   locations in y, then copy each component into the (d - 1) slots that
   follow it.

   x and y are accessed through real-typed (ctype_r) pointers. In y, the
   imaginary component of an element is stored d real-typed slots after
   its real component, where d is the duplication factor (incy). */ \
static void PASTEMAC(ch,opname) \
     ( \
       const conj_t       conjx, \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict alpha, \
       ctype*    restrict x, const inc_t incx, const inc_t ldx, \
       ctype*    restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* Assume that the duplication factor is the row stride of y. */ \
	const dim_t d    = incy; \
	const dim_t ds_y = 1; \
\
	/* The strides are given in units of ctype, but the pointers below are
	   of type ctype_r, so every stride is doubled. */ \
	const inc_t incx2 = 2 * incx; \
	const inc_t ldx2  = 2 * ldx; \
\
	const inc_t incy2 = 2 * incy; \
	const inc_t ldy2  = 2 * ldy; \
\
	ctype_r* restrict alpha_r = ( ctype_r* )alpha; \
	ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \
	ctype_r* restrict chi_r   = ( ctype_r* )x; \
	ctype_r* restrict chi_i   = ( ctype_r* )x + 1; \
	ctype_r* restrict psi_r   = ( ctype_r* )y; \
	ctype_r* restrict psi_i   = ( ctype_r* )y + 1*d; \
\
	if ( bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype_r* restrict chij_r = chi_r + j*ldx2; \
			ctype_r* restrict chij_i = chi_i + j*ldx2; \
			ctype_r* restrict psij_r = psi_r + j*ldy2; \
			ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype_r* restrict chiij_r = chij_r + i*incx2; \
				ctype_r* restrict chiij_i = chij_i + i*incx2; \
				ctype_r* restrict psiij_r = psij_r + i*incy2; \
				ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
				PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \
				                        *chiij_r, *chiij_i, \
				                        *psiij_r, *psiij_i ); \
\
				/* Broadcast both components. The loop must be bounded by
				   the duplicate index p (not the row index i); otherwise
				   it either never terminates or never executes. */ \
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
					ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
					PASTEMAC(ch,copyris)( *psiij_r,  *psiij_i, \
					                      *psiijd_r, *psiijd_i ); \
				} \
			} \
		} \
	} \
	else /* if ( bli_is_noconj( conjx ) ) */ \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype_r* restrict chij_r = chi_r + j*ldx2; \
			ctype_r* restrict chij_i = chi_i + j*ldx2; \
			ctype_r* restrict psij_r = psi_r + j*ldy2; \
			ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype_r* restrict chiij_r = chij_r + i*incx2; \
				ctype_r* restrict chiij_i = chij_i + i*incx2; \
				ctype_r* restrict psiij_r = psij_r + i*incy2; \
				ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
				PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \
				                       *chiij_r, *chiij_i, \
				                       *psiij_r, *psiij_i ); \
\
				/* Broadcast both components; bounded by the duplicate
				   index p, as in the conjugate branch. */ \
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
					ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
					PASTEMAC(ch,copyris)( *psiij_r,  *psiij_i, \
					                      *psiijd_r, *psiijd_i ); \
				} \
			} \
		} \
	} \
}
INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn )
#endif

View File

@@ -0,0 +1,74 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SET0BBS_MXN_H
#define BLIS_SET0BBS_MXN_H
// set0bbs_mxn
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
/* Apply set0s to every element of the m x n matrix y (row stride incy,
   column stride ldy) and to the (incy - 1) slots that follow each
   element. */ \
static void PASTEMAC(ch,opname) \
     ( \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* The row stride of y is taken to be the duplication factor; the
	   duplicates are stored at unit stride. */ \
	const dim_t ndup    = incy; \
	const dim_t dup_inc = 1; \
\
	for ( dim_t col = 0; col < n; ++col ) \
	{ \
		for ( dim_t row = 0; row < m; ++row ) \
		{ \
			ctype* restrict first = y + col*ldy + row*incy; \
\
			/* Unlike the copy-based broadcasts, slot 0 is written too. */ \
			for ( dim_t q = 0; q < ndup; ++q ) \
			{ \
				ctype* restrict slot = first + q*dup_inc; \
\
				PASTEMAC(ch,set0s)( *slot ); \
			} \
		} \
	} \
}
INSERT_GENTFUNC_BASIC0( set0bbs_mxn )
#endif

View File

@@ -0,0 +1,89 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SCAL2S_MXN_H
#define BLIS_SCAL2S_MXN_H
// scal2s_mxn
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
/* For each of the m x n element pairs of x and y, apply scal2s (or
   scal2js when conjx requests conjugation) with alpha and the element of
   x to produce the element of y. Both matrices use general row/column
   strides. */ \
static void PASTEMAC(ch,opname) \
     ( \
       const conj_t       conjx, \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict alpha, \
       ctype*    restrict x, const inc_t rs_x, const inc_t cs_x, \
       ctype*    restrict y, const inc_t rs_y, const inc_t cs_y \
     ) \
{ \
	if ( !bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t col = 0; col < n; ++col ) \
		{ \
			for ( dim_t row = 0; row < m; ++row ) \
			{ \
				ctype* restrict src = x + col*cs_x + row*rs_x; \
				ctype* restrict dst = y + col*cs_y + row*rs_y; \
\
				PASTEMAC(ch,scal2s)( *alpha, *src, *dst ); \
			} \
		} \
	} \
	else /* conjugation requested */ \
	{ \
		for ( dim_t col = 0; col < n; ++col ) \
		{ \
			for ( dim_t row = 0; row < m; ++row ) \
			{ \
				ctype* restrict src = x + col*cs_x + row*rs_x; \
				ctype* restrict dst = y + col*cs_y + row*rs_y; \
\
				PASTEMAC(ch,scal2js)( *alpha, *src, *dst ); \
			} \
		} \
	} \
}
INSERT_GENTFUNC_BASIC0( scal2s_mxn )
#endif

View File

@@ -0,0 +1,318 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
/*
   packm_6xk_bb2 reference kernel template.

   Packs a micropanel of cdim rows by n columns from a (row stride inca,
   column stride lda) into p (column stride ldp), and zero-fills the
   remainder of the packed panel up to mnr rows and n_max columns.

   - Column-panel schemas (bli_is_col_packed( schema ), i.e. packing B):
     every source element is written to two adjacent slots of p, so row i
     of the source lands in slots 2*i and 2*i+1 of each packed column.
   - All other schemas (packing A): each element is written once, at unit
     row stride.

   When eq1 reports that kappa is one, elements are moved with
   copys/copyjs; otherwise scal2s/scal2js is applied with kappa. conja
   selects the conjugating variants.

   The unrolled paths taken when cdim == mnr hard-code six source rows, so
   the template is only meaningful for mnr == 6 (see the instantiation
   following the macro). cntx is not referenced by this kernel.
*/
#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
void* restrict kappa, \
void* restrict a, inc_t inca, inc_t lda, \
void* restrict p, inc_t ldp, \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict kappa_cast = kappa; \
ctype* restrict alpha1 = a; \
ctype* restrict pi1 = p; \
\
/* Handle the packing of B (column panel schemas) separately from packing
of A (row panel schemas). Only the B path duplicates elements. */ \
if ( bli_is_col_packed( schema ) ) \
{ \
if ( cdim == mnr ) /* full-height panel: use the unrolled paths */ \
{ \
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
if ( bli_is_conj( conja ) ) \
{ \
for ( dim_t k = n; k != 0; --k ) /* one packed column per iteration */ \
{ \
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 1) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 2) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 3) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 4) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 5) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 6) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 7) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 8) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 9) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 10) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 11) ); \
\
alpha1 += lda; \
pi1 += ldp; \
} \
} \
else /* if ( bli_is_noconj( conja ) ) */ \
{ \
for ( dim_t k = n; k != 0; --k ) \
{ \
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 1) ); \
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 2) ); \
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 3) ); \
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 4) ); \
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 5) ); \
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 6) ); \
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 7) ); \
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 8) ); \
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 9) ); \
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 10) ); \
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 11) ); \
\
alpha1 += lda; \
pi1 += ldp; \
} \
} \
} \
else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \
{ \
if ( bli_is_conj( conja ) ) \
{ \
for ( dim_t k = n; k != 0; --k ) \
{ \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \
\
alpha1 += lda; \
pi1 += ldp; \
} \
} \
else /* if ( bli_is_noconj( conja ) ) */ \
{ \
for ( dim_t k = n; k != 0; --k ) \
{ \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \
\
alpha1 += lda; \
pi1 += ldp; \
} \
} \
} \
} \
else /* if ( cdim < mnr ) */ \
{ \
PASTEMAC(ch,scal2bbs_mxn) \
( \
conja, \
cdim, \
n, \
kappa, \
a, inca, lda, \
p, 2, ldp /* row stride 2: each element occupies two slots of p */ \
); \
\
/* if ( cdim < mnr ): zero the rows cdim .. mnr-1 of all n_max columns. */ \
{ \
const dim_t i = cdim; \
const dim_t m_edge = mnr - cdim; \
const dim_t n_edge = n_max; \
ctype* restrict p_cast = p; \
ctype* restrict p_edge = p_cast + (i )*2; \
\
PASTEMAC(ch,set0bbs_mxn) \
( \
m_edge, \
n_edge, \
p_edge, 2, ldp \
); \
} \
} \
\
if ( n < n_max ) /* zero-fill the trailing columns n .. n_max-1 */ \
{ \
const dim_t j = n; \
const dim_t m_edge = mnr; \
const dim_t n_edge = n_max - n; \
ctype* restrict p_cast = p; \
ctype* restrict p_edge = p_cast + (j )*ldp; \
\
PASTEMAC(ch,set0bbs_mxn) \
( \
m_edge, \
n_edge, \
p_edge, 2, ldp \
); \
} \
} \
else /* if ( bli_is_row_packed( schema ) ) */ \
{ \
if ( cdim == mnr ) /* full-height panel: use the unrolled paths */ \
{ \
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
{ \
if ( bli_is_conj( conja ) ) \
{ \
for ( dim_t k = n; k != 0; --k ) /* one packed column per iteration */ \
{ \
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
\
alpha1 += lda; \
pi1 += ldp; \
} \
} \
else /* if ( bli_is_noconj( conja ) ) */ \
{ \
for ( dim_t k = n; k != 0; --k ) \
{ \
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
\
alpha1 += lda; \
pi1 += ldp; \
} \
} \
} \
else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \
{ \
if ( bli_is_conj( conja ) ) \
{ \
for ( dim_t k = n; k != 0; --k ) \
{ \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
\
alpha1 += lda; \
pi1 += ldp; \
} \
} \
else /* if ( bli_is_noconj( conja ) ) */ \
{ \
for ( dim_t k = n; k != 0; --k ) \
{ \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
\
alpha1 += lda; \
pi1 += ldp; \
} \
} \
} \
} \
else /* if ( cdim < mnr ) */ \
{ \
PASTEMAC(ch,scal2s_mxn) \
( \
conja, \
cdim, \
n, \
kappa, \
a, inca, lda, \
p, 1, ldp \
); \
\
/* if ( cdim < mnr ): zero the rows cdim .. mnr-1 of all n_max columns. */ \
{ \
const dim_t i = cdim; \
const dim_t m_edge = mnr - cdim; \
const dim_t n_edge = n_max; \
ctype* restrict p_cast = p; \
ctype* restrict p_edge = p_cast + (i )*1; \
\
PASTEMAC(ch,set0s_mxn) \
( \
m_edge, \
n_edge, \
p_edge, 1, ldp \
); \
} \
} \
\
if ( n < n_max ) /* zero-fill the trailing columns n .. n_max-1 */ \
{ \
const dim_t j = n; \
const dim_t m_edge = mnr; \
const dim_t n_edge = n_max - n; \
ctype* restrict p_cast = p; \
ctype* restrict p_edge = p_cast + (j )*ldp; \
\
PASTEMAC(ch,set0s_mxn) \
( \
m_edge, \
n_edge, \
p_edge, 1, ldp \
); \
} \
} \
}
INSERT_GENTFUNC_BASIC3( packm_6xk_bb2, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )

View File

@@ -40,6 +40,7 @@
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
@@ -188,6 +189,7 @@ INSERT_GENTFUNC_BASIC3( packm_2xk, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
@@ -344,6 +346,7 @@ INSERT_GENTFUNC_BASIC3( packm_3xk, 3, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
@@ -498,6 +501,7 @@ INSERT_GENTFUNC_BASIC3( packm_4xk, 4, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
@@ -641,6 +645,7 @@ INSERT_GENTFUNC_BASIC3( packm_6xk, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
@@ -819,6 +824,7 @@ INSERT_GENTFUNC_BASIC3( packm_8xk, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
@@ -978,6 +984,7 @@ INSERT_GENTFUNC_BASIC3( packm_10xk, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
@@ -1145,6 +1152,7 @@ INSERT_GENTFUNC_BASIC3( packm_12xk, 12, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
@@ -1320,6 +1328,7 @@ INSERT_GENTFUNC_BASIC3( packm_14xk, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
@@ -1503,6 +1512,7 @@ INSERT_GENTFUNC_BASIC3( packm_16xk, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
void PASTEMAC3(ch,opname,arch,suf) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \

View File

@@ -0,0 +1,142 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// An implementation that indexes through B with the assumption that all
// elements were broadcast (duplicated) by a factor of NP/NR.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
/* Reference gemm microkernel for a packed B micropanel whose elements
   were duplicated during packing. Accumulates k rank-1 updates of an
   MR x NR product in a stack buffer, scales it by alpha, and then either
   overwrites c (beta == 0) or combines it with beta-scaled c. */ \
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     ) \
{ \
	const num_t dt     = PASTEMAC(ch,type); \
\
	/* Microtile dimensions (default blocksizes) and the leading
	   dimensions of the packed micropanels (maximum blocksizes). */ \
	const dim_t m      = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
	const dim_t n      = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
	const inc_t a_step = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
	const inc_t b_step = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
\
	/* Assume that the degree of duplication is equal to packnr / nr, so
	   consecutive distinct elements of a row of b are this far apart. */ \
	const inc_t b_dup  = b_step / n; \
\
	/* Column-major m x n accumulator. */ \
	ctype           ab[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const inc_t     rs_ab  = 1; \
	const inc_t     cs_ab  = m; \
	const dim_t     n_elem = m * n; \
\
	/* Start from a zeroed accumulator. */ \
	for ( dim_t e = 0; e < n_elem; ++e ) \
	{ \
		PASTEMAC(ch,set0s)( *(ab + e) ); \
	} \
\
	/* Perform k rank-1 updates, one per packed column of a / row of b. */ \
	ctype* a_cur = a; \
	ctype* b_cur = b; \
\
	for ( dim_t l = 0; l < k; ++l ) \
	{ \
		/* In an optimized implementation, these two loops over NR and MR
		   are typically fully unrolled. */ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype bj = *(b_cur + j*b_dup); \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype ai = *(a_cur + i); \
\
				PASTEMAC(ch,dots)( ai, bj, *(ab + j*m + i*rs_ab) ); \
			} \
		} \
\
		a_cur += a_step; \
		b_cur += b_step; \
	} \
\
	/* Scale the accumulated product by alpha. */ \
	for ( dim_t e = 0; e < n_elem; ++e ) \
	{ \
		PASTEMAC(ch,scals)( *alpha, *(ab + e) ); \
	} \
\
	/* If beta is zero, overwrite c with the scaled result in ab. Otherwise,
	   scale c by beta and then add the scaled result in ab. */ \
	if ( !PASTEMAC(ch,eq0)( *beta ) ) \
	{ \
		PASTEMAC(ch,xpbys_mxn)( m, \
		                        n, \
		                        ab, rs_ab, cs_ab, \
		                        beta, \
		                        c,  rs_c,  cs_c ); \
	} \
	else \
	{ \
		PASTEMAC(ch,copys_mxn)( m, \
		                        n, \
		                        ab, rs_ab, cs_ab, \
		                        c,  rs_c,  cs_c ); \
	} \
}
INSERT_GENTFUNC_BASIC2( gemmbb, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )

View File

@@ -0,0 +1,138 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// An implementation that indexes through B with the assumption that all
// elements were broadcast (duplicated) by a factor of NP/NR.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \
\
/* Reference fused gemm + trsm microkernel for a packed B micropanel
   whose elements were duplicated during packing. It chains the gemm
   microkernel, the trsm microkernel selected by trsmkerid, and a final
   re-broadcast of the updated b11 block. */ \
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     ) \
{ \
	const num_t dt     = PASTEMAC(ch,type); \
\
	const inc_t mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
	const inc_t nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
	const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
\
	/* Strides of b11 within the packed micropanel. Assume that the degree
	   of duplication is equal to packnr / nr. */ \
	const inc_t ld_b   = packnr; \
	const inc_t dup_b  = packnr / nr; \
\
	/* Look up the microkernels this routine is composed of. */ \
	PASTECH(ch,gemm_ukr_ft) \
	          gemm_fp = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
	PASTECH(ch,trsm_ukr_ft) \
	          trsm_fp = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \
\
	ctype* neg_one = PASTEMAC(ch,m1); \
\
	/* lower: b11 = alpha * b11 - a10 * b01; */ \
	/* upper: b11 = alpha * b11 - a12 * b21; */ \
	gemm_fp( k, neg_one, a1x, bx1, alpha, b11, ld_b, dup_b, data, cntx ); \
\
	/* b11 = inv(a11) * b11;
	   c11 = b11; */ \
	trsm_fp( a11, b11, c11, rs_c, cs_c, data, cntx ); \
\
	/* Broadcast the elements of the updated b11 submatrix to their
	   duplicated neighbors. */ \
	PASTEMAC(ch,bcastbbs_mxn)( mr, nr, b11, ld_b, dup_b ); \
}
INSERT_GENTFUNC_BASIC3( gemmtrsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )
INSERT_GENTFUNC_BASIC3( gemmtrsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR )

View File

@@ -0,0 +1,206 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// An implementation that indexes through B with the assumption that all
// elements were broadcast (duplicated) by a factor of NP/NR.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
/* Reference lower-triangular trsm microkernel for a packed B micropanel
   whose elements were duplicated during packing. Rows of b are solved
   from first to last; each solved element is written both to c and back
   to its (first) slot in b. */ \
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     ) \
{ \
	const num_t dt     = PASTEMAC(ch,type); \
\
	const dim_t m      = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
	const dim_t n      = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
\
	const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
	const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
\
	/* a is stored with unit row stride; b with row stride packnr. */ \
	const inc_t rs_a   = 1; \
	const inc_t cs_a   = packmr; \
	const inc_t rs_b   = packnr; \
\
	/* Assume that the degree of duplication is equal to packnr / nr. */ \
	const inc_t cs_b   = packnr / n; \
\
	for ( dim_t i = 0; i < m; ++i ) \
	{ \
		/* Diagonal entry of row i, and the start of row i in a and b. */ \
		ctype* diag  = a + i*rs_a + i*cs_a; \
		ctype* a_row = a + i*rs_a; \
		ctype* b_row = b + i*rs_b; \
\
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* b_col = b     + j*cs_b; \
			ctype* b_ij  = b_row + j*cs_b; \
			ctype* c_ij  = c + i*rs_c + j*cs_c; \
			ctype  x     = *b_ij; \
			ctype  acc; \
\
			/* acc = a(i,0:i-1) * b(0:i-1,j), using the rows of b that
			   were already solved. */ \
			PASTEMAC(ch,set0s)( acc ); \
			for ( dim_t l = 0; l < i; ++l ) \
			{ \
				PASTEMAC(ch,axpys)( *(a_row + l*cs_a), \
				                    *(b_col + l*rs_b), acc ); \
			} \
			PASTEMAC(ch,subs)( acc, x ); \
\
			/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
			   of alpha11, so we can multiply rather than divide. We store
			   the inverse of alpha11 intentionally to avoid expensive
			   division instructions within the micro-kernel. */ \
			PASTEMAC(ch,scals)( *diag, x ); \
\
			/* Output the final result to matrix c, then store it back
			   to b. */ \
			PASTEMAC(ch,copys)( x, *c_ij ); \
			PASTEMAC(ch,copys)( x, *b_ij ); \
		} \
	} \
}
INSERT_GENTFUNC_BASIC2( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
/*
   Reference trsm micro-kernel, upper-triangular case, for a packed b whose
   elements were duplicated ("broadcast") when it was packed.

   Solves, one row at a time from the last row to the first (back
   substitution), for each row i and each logical column j:

     x(i,j) = ( b(i,j) - sum_{l>i} a(i,l) * x(l,j) ) * a(i,i)

   where a(i,i) is expected to already hold the INVERSE of the diagonal
   element, so the kernel multiplies instead of dividing.

   Parameters:
     a          - packed triangular block; unit row stride, column stride
                  equal to the maximum MR blocksize queried from cntx.
     b          - packed right-hand side; row stride equal to the maximum NR
                  blocksize, logical columns spaced (max NR / default NR)
                  elements apart. Only the first slot of each duplicated
                  group is read and overwritten with the solution; the
                  remaining slots are not touched by this kernel.
     c          - output matrix, also receives the solution.
     rs_c, cs_c - row and column strides of c.
     data       - not referenced by this reference implementation.
     cntx       - context queried for the MR/NR blocksizes.

   The macro is instantiated below through INSERT_GENTFUNC_BASIC2 (defined
   elsewhere in the project).
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     ) \
{ \
    const num_t dt_id  = PASTEMAC(ch,type); \
\
    /* The default MR/NR blocksizes give the micro-tile dimensions; the */ \
    /* maximum MR/NR blocksizes serve as the packed strides of a and b. */ \
    const dim_t n_rows = bli_cntx_get_blksz_def_dt( dt_id, BLIS_MR, cntx ); \
    const dim_t n_cols = bli_cntx_get_blksz_def_dt( dt_id, BLIS_NR, cntx ); \
    const inc_t a_ld   = bli_cntx_get_blksz_max_dt( dt_id, BLIS_MR, cntx ); \
    const inc_t b_ld   = bli_cntx_get_blksz_max_dt( dt_id, BLIS_NR, cntx ); \
\
    /* Distance between consecutive logical columns of b; the degree of */ \
    /* duplication is assumed to equal (max NR) / (default NR). */ \
    const inc_t b_dup  = b_ld / n_cols; \
\
    /* n_done counts the rows already solved, which are the rows that */ \
    /* lie below the current one. */ \
    for ( dim_t n_done = 0; n_done < n_rows; ++n_done ) \
    { \
        const dim_t i_row = n_rows - n_done - 1; \
\
        /* Diagonal entry of row i_row, the part of that row right of */ \
        /* the diagonal, the row of b being solved, and the rows of b */ \
        /* below it. */ \
        ctype* restrict inv_diag   = a + i_row + i_row*a_ld; \
        ctype* restrict a_row_tail = a + i_row + (i_row+1)*a_ld; \
        ctype* restrict b_row      = b + i_row*b_ld; \
        ctype* restrict b_solved   = b + (i_row+1)*b_ld; \
\
        for ( dim_t j_col = 0; j_col < n_cols; ++j_col ) \
        { \
            ctype* restrict x_slot  = b_row    + j_col*b_dup; \
            ctype* restrict b_below = b_solved + j_col*b_dup; \
            ctype* restrict c_slot  = c + i_row*rs_c + j_col*cs_c; \
            ctype           x_val   = *x_slot; \
            ctype           dot; \
\
            /* dot = a(i_row, i_row+1:end) * x(i_row+1:end, j_col) */ \
            PASTEMAC(ch,set0s)( dot ); \
            for ( dim_t p = 0; p < n_done; ++p ) \
            { \
                ctype* restrict a_ip = a_row_tail + p*a_ld; \
                ctype* restrict x_pj = b_below    + p*b_ld; \
\
                PASTEMAC(ch,axpys)( *a_ip, *x_pj, dot ); \
            } \
            PASTEMAC(ch,subs)( dot, x_val ); \
\
            /* The diagonal of a stores 1.0/alpha11, so scaling by it */ \
            /* performs the division without a divide instruction. */ \
            PASTEMAC(ch,scals)( *inv_diag, x_val ); \
\
            /* Write the solved element to c, then back into b. */ \
            PASTEMAC(ch,copys)( x_val, *c_slot ); \
            PASTEMAC(ch,copys)( x_val, *x_slot ); \
        } \
    } \
}
INSERT_GENTFUNC_BASIC2( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )

View File

@@ -1,2 +1,2 @@
2
3
0.0

View File

@@ -403,17 +403,7 @@ void libblis_test_gemm_md
time = bli_clock();
#if 0
bli_printm( "a", &a, "%5.2f", "" );
bli_printm( "b", &b, "%5.2f", "" );
bli_printm( "c", &c, "%5.2f", "" );
bli_printm( "alpha", &alpha, "%5.2f", "" );
bli_printm( "beta", &beta, "%5.2f", "" );
#endif
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
#if 0
bli_printm( "c after", &c, "%5.2f", "" );
#endif
time_min = bli_clock_min_diff( time_min, time );
}

View File

@@ -869,7 +869,8 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
libblis_test_fprintf_c( os, " stack address %d\n", ( int )bli_info_get_stack_buf_align_size() );
libblis_test_fprintf_c( os, " obj_t address %d\n", ( int )bli_info_get_heap_addr_align_size() );
libblis_test_fprintf_c( os, " obj_t stride %d\n", ( int )bli_info_get_heap_stride_align_size() );
libblis_test_fprintf_c( os, " pool block addr %d\n", ( int )bli_info_get_pool_addr_align_size() );
libblis_test_fprintf_c( os, " pool block addr A (+offset) %d (+%d)\n", ( int )bli_info_get_pool_addr_align_size_a(), ( int )bli_info_get_pool_addr_offset_size_a() );
libblis_test_fprintf_c( os, " pool block addr B (+offset) %d (+%d)\n", ( int )bli_info_get_pool_addr_align_size_b(), ( int )bli_info_get_pool_addr_offset_size_b() );
libblis_test_fprintf_c( os, "\n" );
libblis_test_fprintf_c( os, "BLAS/CBLAS compatibility layers \n" );
libblis_test_fprintf_c( os, " BLAS API enabled? %d\n", ( int )bli_info_get_enable_blas() );