Added support for addons.

Details:
- Implemented a new feature called addons, which are similar to
  sandboxes except that there is no requirement to define gemm or any
  other particular operation.
- Updated configure to accept --enable-addon=<name> or -a <name> syntax
  for requesting an addon be included within a BLIS build. configure now
  outputs the list of enabled addons into config.mk. It also outputs the
  corresponding #include directives for the addons' headers to a new
  companion to the bli_config.h header file named bli_addon.h. Because
  addons may wish to make use of existing BLIS types within their own
  definitions, the addons' headers must be included sometime after that
  of bli_config.h (which currently is #included before bli_type_defs.h).
  This is why the #include directives needed to go into a new top-level
  header file rather than the existing bli_config.h file.
- Added a markdown document, docs/Addons.md, to explain addons, how to
  build with them, and what assumptions their authors should keep in
  mind as they create them.
- Added a gemmlike-like implementation of sandwich gemm called 'gemmd'
  as an addon in addon/gemmd. The code uses a 'bao_' prefix for local
  functions, including the user-level object and typed APIs.
- Updated .gitignore so that git ignores bli_addon.h files.

Change-Id: Ie7efdea366481ce25075cb2459bdbcfd52309717
This commit is contained in:
Field G. Van Zee
2021-11-13 16:39:37 -06:00
committed by mkadavil
parent 0792eb8608
commit 7a0ba4194f
35 changed files with 4965 additions and 34 deletions

1
.gitignore vendored
View File

@@ -31,6 +31,7 @@
config.mk
bli_config.h
bli_addon.h
# -- monolithic headers --

View File

@@ -116,6 +116,7 @@ BASE_OBJ_FRAME_PATH := $(BASE_OBJ_PATH)/$(FRAME_DIR)
BASE_OBJ_AOCLDTL_PATH := $(BASE_OBJ_PATH)/$(AOCLDTL_DIR)
BASE_OBJ_REFKERN_PATH := $(BASE_OBJ_PATH)/$(REFKERN_DIR)
BASE_OBJ_KERNELS_PATH := $(BASE_OBJ_PATH)/$(KERNELS_DIR)
BASE_OBJ_ADDON_PATH := $(BASE_OBJ_PATH)/$(ADDON_DIR)
BASE_OBJ_SANDBOX_PATH := $(BASE_OBJ_PATH)/$(SANDBOX_DIR)
# --- Define install target names for static libraries ---
@@ -237,6 +238,9 @@ endif
MK_AOCLDTL_OBJS := $(call gen-obj-paths-from-src,$(AOCLDTL_SRC_SUFS),$(MK_AOCLDTL_SRC),$(AOCLDTL_PATH),$(BASE_OBJ_AOCLDTL_PATH))
# Generate object file paths for the addon source code. If no addons were
# enabled at configure-time, this variable will be empty.
MK_ADDON_OBJS := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH))
# Generate object file paths for the sandbox source code. If a sandbox was not
# enabled at configure-time, this variable will be empty.
@@ -248,6 +252,7 @@ MK_BLIS_OBJS := $(MK_CONFIG_OBJS) \
$(MK_REFKERN_OBJS) \
$(MK_FRAME_OBJS) \
$(MK_AOCLDTL_OBJS) \
$(MK_ADDON_OBJS) \
$(MK_SANDBOX_OBJS)
# Optionally filter out the BLAS and CBLAS compatibility layer object files.
@@ -588,6 +593,28 @@ else
endif
endef
# Template for the pattern rule that compiles C99 addon source files found
# under $(ADDON_PATH) into object files under $(BASE_OBJ_ADDON_PATH). The
# objects also depend on $(BLIS_H_FLAT), $(ADDON_H99_FILES), and
# $(MAKE_DEFS_MK_PATHS), so a change to any of those triggers a rebuild.
# first argument: a configuration name from the union of config_list and
# config_name, used to look up the CFLAGS to use during compilation.
# second argument: the source file suffix (without the leading dot) matched by
# the rule's prerequisite pattern.
# When ENABLE_VERBOSE is 'yes' the full compile command is echoed; otherwise
# only a short "Compiling ..." message is printed.
define make-c99-addon-rule
$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS)
ifeq ($(ENABLE_VERBOSE),yes)
$(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@
else
@echo "Compiling $$@" $(call get-addon-c99text-for,$(1))
@$(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@
endif
endef
# Template for the pattern rule that compiles C++ addon source files found
# under $(ADDON_PATH) into object files under $(BASE_OBJ_ADDON_PATH), using
# $(CXX). The objects also depend on $(BLIS_H_FLAT), $(ADDON_HXX_FILES), and
# $(MAKE_DEFS_MK_PATHS).
# first argument: a configuration name, passed to get-addon-cxxflags-for and
# get-addon-cxxtext-for to look up the flags and the status text.
# second argument: the source file suffix (without the leading dot) matched by
# the rule's prerequisite pattern.
# When ENABLE_VERBOSE is 'yes' the full compile command is echoed; otherwise
# only a short "Compiling ..." message is printed.
define make-cxx-addon-rule
$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_HXX_FILES) $(MAKE_DEFS_MK_PATHS)
ifeq ($(ENABLE_VERBOSE),yes)
$(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@
else
@echo "Compiling $$@" $(call get-addon-cxxtext-for,$(1))
@$(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@
endif
endef
# first argument: a configuration name from the union of config_list and
# config_name, used to look up the CFLAGS to use during compilation.
define make-c99-sandbox-rule
@@ -648,6 +675,16 @@ $(foreach conf, $(CONFIG_LIST), $(eval $(call make-refkern-rule,$(conf))))
$(foreach suf, $(KERNELS_SRC_SUFS), \
$(foreach kset, $(KERNEL_LIST), $(eval $(call make-kernels-rule,$(kset),$(call get-config-for-kset,$(kset)),$(suf)))))
# Instantiate the build rule for C addon files, once for each suffix in
# ADDON_C99_SUFS. Use the CFLAGS for the configuration family.
$(foreach suf, $(ADDON_C99_SUFS), \
$(foreach conf, $(CONFIG_NAME), $(eval $(call make-c99-addon-rule,$(conf),$(suf)))))
# Instantiate the build rule for C++ addon files, once for each suffix in
# ADDON_CXX_SUFS. Use the C++ compiler flags (looked up via
# get-addon-cxxflags-for) for the configuration family.
$(foreach suf, $(ADDON_CXX_SUFS), \
$(foreach conf, $(CONFIG_NAME), $(eval $(call make-cxx-addon-rule,$(conf),$(suf)))))
# Instantiate the build rule for C sandbox files. Use the CFLAGS for the
# configuration family.
$(foreach suf, $(SANDBOX_C99_SUFS), \
@@ -1141,6 +1178,9 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(FIND) $(AOCLDTL_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
ifneq ($(ADDON_LIST),)
- $(FIND) $(ADDON_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
endif
ifneq ($(SANDBOX),)
- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
endif
@@ -1155,6 +1195,10 @@ else
@- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
@echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)"
@- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
ifneq ($(ADDON_LIST),)
@echo "Removing makefile fragments from $(ADDON_FRAG_PATH)"
@- $(FIND) $(ADDON_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
endif
ifneq ($(SANDBOX),)
@echo "Removing makefile fragments from $(SANDBOX_FRAG_PATH)"
@- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
@@ -1275,6 +1319,7 @@ endif # IS_CONFIGURED
distclean: cleanmk cleanh cleanlib cleantest
ifeq ($(IS_CONFIGURED),yes)
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(BLIS_ADDON_H)
- $(RM_F) $(BLIS_CONFIG_H)
- $(RM_F) $(CONFIG_MK_FILE)
- $(RM_F) $(PC_OUT_FILE)
@@ -1282,6 +1327,8 @@ ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_RF) $(LIB_DIR)
- $(RM_RF) $(INCLUDE_DIR)
else
@echo "Removing $(BLIS_ADDON_H)"
@$(RM_F) $(BLIS_ADDON_H)
@echo "Removing $(BLIS_CONFIG_H)"
@$(RM_F) $(BLIS_CONFIG_H)
@echo "Removing $(CONFIG_MK_FILE)"

View File

@@ -0,0 +1,88 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Expert object-based gemm interface. While the hard-coded switch below is
// enabled, the call is rerouted to the gemmd addon (with a d operand filled
// with ones) so that bao_gemmd_ex() can be exercised through the testsuite.
void bli_gemm_ex
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // Hard-coded switch: leave as 1 to implement bli_gemm() via the addon's
    // bao_gemmd(); change to 0 to fall through to the native path below.
    if ( 1 )
    {
        // Create a k x 1 object with the datatype of C, fill it with ones,
        // and hand it to gemmd as the d operand.
        obj_t       ones;
        const dim_t k_dim = bli_obj_width_after_trans( a );
        const num_t dt_c  = bli_obj_dt( c );

        bli_obj_create( dt_c, k_dim, 1, 1, k_dim, &ones );
        bli_setv( &BLIS_ONE, &ones );
        // (bli_randv( &ones ) may be substituted above to test with random
        // values instead of ones.)

        bao_gemmd_ex( alpha, a, &ones, b, beta, c, cntx, rntm );

        // Release the temporary object before returning.
        bli_obj_free( &ones );
        return;
    }

    // Always work on a private runtime: copy the caller's if one was given,
    // otherwise initialize it from the global settings.
    rntm_t rntm_l;
    if ( rntm != NULL )
    {
        rntm_l = *rntm;
    }
    else
    {
        bli_rntm_init_from_global( &rntm_l );
    }
    rntm = &rntm_l;

    // Fall back to a (native) context from the gks when none was supplied.
    if ( cntx == NULL )
    {
        cntx = bli_gks_query_cntx();
    }

    // Validate the operands when error checking is enabled.
    if ( bli_error_checking_is_enabled() )
    {
        bli_gemm_check( alpha, a, b, beta, c, cntx );
    }

    // Hand off to the operation's front end.
    bli_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL );
}

305
addon/gemmd/bao_gemmd.c Normal file
View File

@@ -0,0 +1,305 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// -- Define the gemmd operation's object API ----------------------------------
//
// Basic object API for gemmd: forwards all operands to the expert interface,
// bao_gemmd_ex(), passing NULL for both the context and the runtime.
void bao_gemmd
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  d,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c
     )
{
    // No caller-supplied context or runtime in the basic interface.
    cntx_t* const no_cntx = NULL;
    rntm_t* const no_rntm = NULL;

    bao_gemmd_ex( alpha, a, d, b, beta, c, no_cntx, no_rntm );
}
// Expert object API for gemmd. Accepts an optional context and runtime
// (either may be NULL), performs parameter checking and early-exit handling,
// normalizes the operands, and then launches bao_gemmd_int() through the
// addon's thread decorator.
void bao_gemmd_ex
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  d,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // Always operate on a private rntm_t: copy the caller's runtime if one
    // was passed in, otherwise initialize it from the global settings.
    rntm_t rntm_l;
    if ( rntm != NULL )
    {
        rntm_l = *rntm;
    }
    else
    {
        bli_rntm_init_from_global( &rntm_l );
    }
    rntm = &rntm_l;

    // Obtain a valid (native) context from the gks if none was given.
    // NOTE: This has to precede the _check() call below, which assumes the
    // context pointer is valid.
    if ( cntx == NULL )
    {
        cntx = bli_gks_query_cntx();
    }

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
    {
        bao_gemmd_check( alpha, a, d, b, beta, c, cntx );
    }

    // -- bli_gemmd_front() ----------------------------------------------------

    // Nothing to do when C has a zero dimension.
    if ( bli_obj_has_zero_dim( c ) ) return;

    // When alpha is zero, or A or B has a zero dimension, the product
    // contributes nothing: scale C by beta and return.
    const bool no_product = bli_obj_equals( alpha, &BLIS_ZERO ) ||
                            bli_obj_has_zero_dim( a ) ||
                            bli_obj_has_zero_dim( b );
    if ( no_product )
    {
        bli_scalm( beta, c );
        return;
    }

    // Work on aliases of A, B, and C so that the transformations below do
    // not modify the caller's objects.
    obj_t a_local, b_local, c_local;
    bli_obj_alias_to( a, &a_local );
    bli_obj_alias_to( b, &b_local );
    bli_obj_alias_to( c, &c_local );

    // For A and then B: if the transposition property is set, induce the
    // transposition and then clear the transposition bit in the object.
    obj_t* const ab[ 2 ] = { &a_local, &b_local };
    for ( int i = 0; i < 2; ++i )
    {
        if ( bli_obj_has_trans( ab[ i ] ) )
        {
            bli_obj_induce_trans( ab[ i ] );
            bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, ab[ i ] );
        }
    }

    // An optimization: when the storage of C is at odds with the
    // micro-kernel's preference (row-stored C with a column-preferring
    // kernel, or vice versa), transpose the entire operation so that the
    // micro-kernel accesses C in its preferred manner.
    if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
    {
        bli_obj_swap( &a_local, &b_local );

        bli_obj_induce_trans( &a_local );
        bli_obj_induce_trans( &b_local );
        bli_obj_induce_trans( &c_local );
    }

    // Parse the rntm_t to set the ways of parallelism for each loop, using
    // the (possibly transformed) problem dimensions.
    const dim_t m_dim = bli_obj_length( &c_local );
    const dim_t n_dim = bli_obj_width( &c_local );
    const dim_t k_dim = bli_obj_width( &a_local );

    bli_rntm_set_ways_for_op
    (
      BLIS_GEMM,
      BLIS_LEFT, // ignored for gemm/hemm/symm
      m_dim,
      n_dim,
      k_dim,
      rntm
    );

    // Spawn threads (if applicable) with bao_gemmd_int() as each thread's
    // entry point. This also begins the process of creating the thrinfo_t
    // tree, which contains thread communicators.
    bao_l3_thread_decorator
    (
      bao_gemmd_int,
      BLIS_GEMM, // operation family id
      alpha, &a_local, d, &b_local, beta, &c_local,
      cntx, rntm
    );
}
//
// -- Define the gemmd operation's thread entry point --------------------------
//
// Thread entry point for gemmd. Selects, at compile time, which block-panel
// implementation each thread executes and forwards all arguments to it.
void bao_gemmd_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     d,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
#if 1
    // Default: the block-panel algorithm that calls the kernel directly,
    // which exposes edge-case handling.
    bao_gemmd_bp_var1( alpha, a, d, b, beta, c, cntx, rntm, thread );
#else
    // Alternative: the block-panel algorithm that calls the kernel
    // indirectly via a wrapper function, which hides edge-case handling.
    bao_gemmd_bp_var2( alpha, a, d, b, beta, c, cntx, rntm, thread );
#endif
}
//
// -- Define the gemmd operation's typed API -----------------------------------
//
// Template for the typed gemmd API, bao_?gemmd(). Each instantiation wraps
// the caller's raw scalar, matrix, and vector pointers (with their strides)
// in obj_t objects, records transa/transb on the A and B objects, and then
// calls the object API, bao_gemmd(). The vector d is wrapped as a k x 1
// object whose row stride is incd.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
( \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* d, inc_t incd, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
) \
{ \
bli_init_once(); \
\
/* Determine the datatype (e.g. BLIS_FLOAT, BLIS_DOUBLE, etc.) based on
the macro parameter 'ch' (e.g. s, d, etc). */ \
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao, ao, dd, bo, betao, co; \
\
dim_t m_a, n_a; \
dim_t m_b, n_b; \
\
/* Adjust the dimensions of matrices A and B according to the transa and
transb parameters. */ \
bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \
\
/* Create bufferless scalar objects and attach the provided scalar pointers
to those scalar objects. */ \
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
\
/* Create bufferless matrix objects and attach the provided matrix pointers
to those matrix objects. */ \
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
bli_obj_create_with_attached_buffer( dt, k, 1, d, incd, k, &dd ); \
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \
\
/* Set the transposition/conjugation properties of the objects for matrices
A and B. */ \
bli_obj_set_conjtrans( transa, &ao ); \
bli_obj_set_conjtrans( transb, &bo ); \
\
/* Call the object interface. */ \
PASTECH(bao_,opname) \
( \
&alphao, \
&ao, \
&dd, \
&bo, \
&betao, \
&co \
); \
}
// Instantiate the typed API explicitly for each of the four datatypes
// (s, d, c, z) rather than through the INSERT_GENTFUNC_BASIC0 macro.
//INSERT_GENTFUNC_BASIC0( gemmd )
GENTFUNC( float, s, gemmd )
GENTFUNC( double, d, gemmd )
GENTFUNC( scomplex, c, gemmd )
GENTFUNC( dcomplex, z, gemmd )

105
addon/gemmd/bao_gemmd.h Normal file
View File

@@ -0,0 +1,105 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// -- Prototype the gemmd operation's object API -------------------------------
//
// Basic object API: takes the scalars alpha and beta and the operands a, d,
// b, and c as objects. The implementation forwards to bao_gemmd_ex() with
// NULL for both the context and the runtime.
BLIS_EXPORT_ADDON void bao_gemmd
(
obj_t* alpha,
obj_t* a,
obj_t* d,
obj_t* b,
obj_t* beta,
obj_t* c
);
// Expert object API: same operands as bao_gemmd(), plus an optional context
// and runtime. Either of cntx and rntm may be NULL; the implementation then
// queries a context from the gks and/or initializes a runtime from the
// global settings. A non-NULL rntm is copied locally before use.
BLIS_EXPORT_ADDON void bao_gemmd_ex
(
obj_t* alpha,
obj_t* a,
obj_t* d,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);
//
// -- Prototype the gemmd operation's thread entry point -----------------------
//
// Thread entry point: the function bao_gemmd_ex() passes to the thread
// decorator. It forwards its arguments, including the thread's thrinfo_t, to
// the block-panel implementation chosen at compile time. Unlike the API
// functions above, it is not marked BLIS_EXPORT_ADDON.
void bao_gemmd_int
(
obj_t* alpha,
obj_t* a,
obj_t* d,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
//
// -- Prototype the gemmd operation's typed API --------------------------------
//
// Template for the prototypes of the typed API, bao_?gemmd(). The typed
// functions take raw pointers plus explicit dimensions and strides: row and
// column strides for the matrices a, b, and c, and a single increment (incd)
// for the vector d.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_ADDON void PASTECH2(bao_,ch,opname) \
( \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* d, inc_t incd, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
);
// Declare the prototypes explicitly for each of the four datatypes
// (s, d, c, z) rather than through the INSERT_GENTPROT_BASIC0 macro.
//INSERT_GENTPROT_BASIC0( gemmd )
GENTPROT( float, s, gemmd )
GENTPROT( double, d, gemmd )
GENTPROT( scomplex, c, gemmd )
GENTPROT( dcomplex, z, gemmd )

View File

@@ -0,0 +1,530 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type used to dispatch from the object interface to the
// datatype-specific implementations. Its parameter list matches that of the
// typed bao_?gemmd_bp_var1() functions: conjugation flags, problem
// dimensions, type-erased (void*) scalar/matrix/vector buffers with their
// strides, and the context, runtime, and thread info.
#define FUNCPTR_T gemmd_fp
typedef void (*FUNCPTR_T)
(
conj_t conja,
conj_t conjb,
dim_t m,
dim_t n,
dim_t k,
void* restrict alpha,
void* restrict a, inc_t rs_a, inc_t cs_a,
void* restrict d, inc_t incd,
void* restrict b, inc_t rs_b, inc_t cs_b,
void* restrict beta,
void* restrict c, inc_t rs_c, inc_t cs_c,
cntx_t* restrict cntx,
rntm_t* restrict rntm,
thrinfo_t* restrict thread
);
//
// -- gemmd-like block-panel algorithm (object interface) ----------------------
//
// Define a function pointer array named ftypes and initialize its contents with
// the addresses of the typed functions defined below, bao_?gemmd_bp_var1().
// The array is indexed by datatype (num_t) in bao_gemmd_bp_var1().
static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var1);
// Object-interface front for the block-panel gemmd algorithm. Unpacks the
// conjugation flags, dimensions, buffers, and strides from the operand
// objects and dispatches to the typed implementation selected by the
// datatype of C.
void bao_gemmd_bp_var1
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     d,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
    // The datatype of C determines which typed function is called and how
    // the scalar buffers are obtained.
    const num_t dt = bli_obj_dt( c );

    // Problem dimensions: C is m x n, and k is the width of A.
    const dim_t m = bli_obj_length( c );
    const dim_t n = bli_obj_width( c );
    const dim_t k = bli_obj_width( a );

    // Matrix A: conjugation status, buffer, and strides.
    const conj_t   conja = bli_obj_conj_status( a );
    void* restrict a_buf = bli_obj_buffer_at_off( a );
    const inc_t    rs_a  = bli_obj_row_stride( a );
    const inc_t    cs_a  = bli_obj_col_stride( a );

    // Vector d: buffer and increment.
    void* restrict d_buf = bli_obj_buffer_at_off( d );
    const inc_t    incd  = bli_obj_vector_inc( d );

    // Matrix B: conjugation status, buffer, and strides.
    const conj_t   conjb = bli_obj_conj_status( b );
    void* restrict b_buf = bli_obj_buffer_at_off( b );
    const inc_t    rs_b  = bli_obj_row_stride( b );
    const inc_t    cs_b  = bli_obj_col_stride( b );

    // Matrix C: buffer and strides.
    void* restrict c_buf = bli_obj_buffer_at_off( c );
    const inc_t    rs_c  = bli_obj_row_stride( c );
    const inc_t    cs_c  = bli_obj_col_stride( c );

    // Scalars alpha and beta, obtained for datatype dt.
    void* restrict alpha_buf = bli_obj_buffer_for_1x1( dt, alpha );
    void* restrict beta_buf  = bli_obj_buffer_for_1x1( dt, beta );

    // Look up the typed function for dt in the dispatch array and call it.
    ftypes[ dt ]
    (
      conja, conjb,
      m, n, k,
      alpha_buf,
      a_buf, rs_a, cs_a,
      d_buf, incd,
      b_buf, rs_b, cs_b,
      beta_buf,
      c_buf, rs_c, cs_c,
      cntx, rntm, thread
    );
}
//
// -- gemmd-like block-panel algorithm (typed interface) -----------------------
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bao_,ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict d, inc_t incd, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for various blocksizes. */ \
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
/* Query the context for the microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = cs_c; \
const inc_t jcstep_b = cs_b; \
\
const inc_t pcstep_a = cs_a; \
const inc_t pcstep_d = incd; \
const inc_t pcstep_b = rs_b; \
\
const inc_t icstep_c = rs_c; \
const inc_t icstep_a = rs_a; \
\
const inc_t jrstep_c = cs_c * NR; \
\
const inc_t irstep_c = rs_c * MR; \
\
ctype* restrict a_00 = a; \
ctype* restrict d_00 = d; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
/* Make local copies of the scalars to prevent any unnecessary sharing of
cache lines between the cores' caches. */ \
ctype alpha_local = *alpha_cast; \
ctype beta_local = *beta_cast; \
ctype one_local = *PASTEMAC(ch,1); \
ctype zero_local = *PASTEMAC(ch,0); \
\
auxinfo_t aux; \
\
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
needed for the matrix we will be packing (if any), but we do it
unconditionally to be safe. */ \
mem_t mem_a = BLIS_MEM_INITIALIZER; \
mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
/* Define an array of bszid_t ids, which will act as our substitute for
the cntl_t tree. */ \
bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \
BLIS_KC, /* 4th loop */ \
BLIS_NO_PART, /* pack B */ \
BLIS_MC, /* 3rd loop */ \
BLIS_NO_PART, /* pack A */ \
BLIS_NR, /* 2nd loop */ \
BLIS_MR, /* 1st loop */ \
BLIS_KR }; /* microkernel loop */ \
\
bszid_t* restrict bszids_jc = &bszids[0]; \
bszid_t* restrict bszids_pc = &bszids[1]; \
/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
bszid_t* restrict bszids_ic = &bszids[3]; \
/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
bszid_t* restrict bszids_jr = &bszids[5]; \
/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
\
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \
thrinfo_t* restrict thread_pb = NULL; \
thrinfo_t* restrict thread_ic = NULL; \
thrinfo_t* restrict thread_pa = NULL; \
thrinfo_t* restrict thread_jr = NULL; \
thrinfo_t* restrict thread_ir = NULL; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_jc = thread; \
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
/* Compute the JC loop thread range for the current thread. */ \
dim_t jc_start, jc_end; \
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
const dim_t n_local = jc_end - jc_start; \
\
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = n_local % NC; \
\
/* Loop over the n dimension (NC rows/columns at a time). */ \
for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
{ \
/* Calculate the thread's current JC block dimension. */ \
const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
\
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_pc = bli_thrinfo_sub_node( thread_jc ); \
bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
\
/* Compute the PC loop thread range for the current thread. */ \
const dim_t pc_start = 0, pc_end = k; \
const dim_t k_local = k; \
\
/* Compute number of primary and leftover components of the PC loop. */ \
/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
const dim_t pc_left = k_local % KC; \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
{ \
/* Calculate the thread's current PC block dimension. */ \
const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
\
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
ctype* restrict d_pc = d_00 + pp * pcstep_d; \
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop. */ \
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
ctype* b_use; \
inc_t rs_b_use, cs_b_use, ps_b_use; \
\
/* Identify the current thrinfo_t node. Note that the thrinfo_t
node will have already been created by a previous call to
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
cause the tree to grow by two (e.g. to the next bszid that is
a normal bszid_t value). */ \
thread_pb = bli_thrinfo_sub_node( thread_pc ); \
/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
\
/* Determine the packing buffer and related parameters for matrix
B. Then call the packm implementation. */ \
PASTECH2(bao_,ch,packm_b) \
( \
conjb, \
KC, NC, \
kc_cur, nc_cur, NR, \
&one_local, \
d_pc, incd, \
b_pc, rs_b, cs_b, \
&b_use, &rs_b_use, &cs_b_use, \
&ps_b_use, \
cntx, \
rntm, \
&mem_b, \
thread_pb \
); \
\
/* Alias b_use so that it's clear this is our current block of
matrix B. */ \
ctype* restrict b_pc_use = b_use; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
/* Compute the IC loop thread range for the current thread. */ \
dim_t ic_start, ic_end; \
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
const dim_t m_local = ic_end - ic_start; \
\
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = m_local % MC; \
\
/* Loop over the m dimension (MC rows at a time). */ \
for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
{ \
/* Calculate the thread's current IC block dimension. */ \
const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
\
ctype* restrict a_ic = a_pc + ii * icstep_a; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
ctype* a_use; \
inc_t rs_a_use, cs_a_use, ps_a_use; \
\
/* Identify the current thrinfo_t node. Note that the thrinfo_t
node will have already been created by a previous call to
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
cause the tree to grow by two (e.g. to the next bszid that is
a normal bszid_t value). */ \
thread_pa = bli_thrinfo_sub_node( thread_ic ); \
/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
\
/* Determine the packing buffer and related parameters for matrix
A. Then call the packm implementation. */ \
PASTECH2(bao_,ch,packm_a) \
( \
conja, \
MC, KC, \
mc_cur, kc_cur, MR, \
&one_local, \
d_pc, incd, \
a_ic, rs_a, cs_a, \
&a_use, &rs_a_use, &cs_a_use, \
&ps_a_use, \
cntx, \
rntm, \
&mem_a, \
thread_pa \
); \
\
/* Alias a_use so that it's clear this is our current block of
matrix A. */ \
ctype* restrict a_ic_use = a_use; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
/* Query the number of threads and thread ids for the JR loop.
NOTE: These values are only needed when computing the next
micropanel of B. */ \
const dim_t jr_nt = bli_thread_n_way( thread_jr ); \
const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
\
/* Compute number of primary and leftover components of the JR loop. */ \
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
dim_t jr_left = nc_cur % NR; \
\
/* Compute the JR loop thread range for the current thread. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
{ \
const dim_t nr_cur \
= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/* Assume for now that our next panel of B to be the current panel
of B. */ \
ctype* restrict b2 = b_jr; \
\
/* Identify the current thrinfo_t node. */ \
thread_ir = bli_thrinfo_sub_node( thread_jr ); \
\
/* Query the number of threads and thread ids for the IR loop.
NOTE: These values are only needed when computing the next
micropanel of A. */ \
const dim_t ir_nt = bli_thread_n_way( thread_ir ); \
const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
\
/* Compute number of primary and leftover components of the IR loop. */ \
dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
dim_t ir_left = mc_cur % MR; \
\
/* Compute the IR loop thread range for the current thread. */ \
dim_t ir_start, ir_end; \
bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
{ \
const dim_t mr_cur \
= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
\
ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
ctype* restrict c_ir = c_jr + i * irstep_c; \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next micropanels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_ic_use; \
b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_pc_use; \
} \
\
/* Save the addresses of next micropanels of A and B to the
auxinfo_t object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( mr_cur == MR && nr_cur == NR ) \
{ \
/* Invoke the gemm microkernel. */ \
gemm_ukr \
( \
kc_cur, \
&alpha_local, \
a_ir, \
b_jr, \
beta_use, \
c_ir, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm microkernel. */ \
gemm_ukr \
( \
kc_cur, \
&alpha_local, \
a_ir, \
b_jr, \
&zero_local, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn) \
( \
mr_cur, \
nr_cur, \
ct, rs_ct, cs_ct, \
beta_use, \
c_ir, rs_c, cs_c \
); \
} \
} \
} \
} \
\
/* This barrier is needed to prevent threads from starting to pack
the next row panel of B before the current row panel is fully
computed upon. */ \
bli_thread_barrier( thread_pb ); \
} \
} \
\
/* Release any memory that was acquired for packing matrices A and B. */ \
PASTECH2(bao_,ch,packm_finalize_mem_a) \
( \
rntm, \
&mem_a, \
thread_pa \
); \
PASTECH2(bao_,ch,packm_finalize_mem_b) \
( \
rntm, \
&mem_b, \
thread_pb \
); \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
*/ \
}
//INSERT_GENTFUNC_BASIC0( gemmd_bp_var1 )
GENTFUNC( float, s, gemmd_bp_var1 )
GENTFUNC( double, d, gemmd_bp_var1 )
GENTFUNC( scomplex, c, gemmd_bp_var1 )
GENTFUNC( dcomplex, z, gemmd_bp_var1 )

View File

@@ -0,0 +1,602 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type shared by all typed instances of the variant defined
// in this file. The object-based front-end below keeps one such pointer per
// datatype in a table and dispatches through it. Scalars and operands are
// passed as void* so that a single pointer type can describe every datatype
// instance; each typed instance casts the buffers to its own ctype.
#define FUNCPTR_T gemmd_fp
typedef void (*FUNCPTR_T)
(
conj_t conja,
conj_t conjb,
dim_t m,
dim_t n,
dim_t k,
void* restrict alpha,
void* restrict a, inc_t rs_a, inc_t cs_a,
void* restrict d, inc_t incd,
void* restrict b, inc_t rs_b, inc_t cs_b,
void* restrict beta,
void* restrict c, inc_t rs_c, inc_t cs_c,
cntx_t* restrict cntx,
rntm_t* restrict rntm,
thrinfo_t* restrict thread
);
//
// -- gemmd-like block-panel algorithm (object interface) ----------------------
//
// Define a function pointer array named ftypes and initialize its contents with
// the addresses of the typed functions defined below, bao_?gemmd_bp_var2().
static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var2);

// Object-based entry point for variant 2. It extracts the conjugation flags,
// dimensions, buffers, and strides from the obj_t operands and forwards them
// to the typed function selected by the datatype of C.
void bao_gemmd_bp_var2
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     d,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	// The datatype of C drives both the dispatch and the scalar buffer
	// queries further below.
	const num_t    dt_c      = bli_obj_dt( c );

	// Matrix C supplies the m and n dimensions.
	void* restrict c_ptr     = bli_obj_buffer_at_off( c );
	const inc_t    c_rs      = bli_obj_row_stride( c );
	const inc_t    c_cs      = bli_obj_col_stride( c );
	const dim_t    m_dim     = bli_obj_length( c );
	const dim_t    n_dim     = bli_obj_width( c );

	// Matrix A supplies the k dimension (its width).
	void* restrict a_ptr     = bli_obj_buffer_at_off( a );
	const inc_t    a_rs      = bli_obj_row_stride( a );
	const inc_t    a_cs      = bli_obj_col_stride( a );
	const conj_t   a_conj    = bli_obj_conj_status( a );
	const dim_t    k_dim     = bli_obj_width( a );

	// Vector d.
	void* restrict d_ptr     = bli_obj_buffer_at_off( d );
	const inc_t    d_inc     = bli_obj_vector_inc( d );

	// Matrix B.
	void* restrict b_ptr     = bli_obj_buffer_at_off( b );
	const inc_t    b_rs      = bli_obj_row_stride( b );
	const inc_t    b_cs      = bli_obj_col_stride( b );
	const conj_t   b_conj    = bli_obj_conj_status( b );

	// Scalars, queried in the datatype of C.
	void* restrict alpha_ptr = bli_obj_buffer_for_1x1( dt_c, alpha );
	void* restrict beta_ptr  = bli_obj_buffer_for_1x1( dt_c, beta );

	// Dispatch straight through the per-datatype function pointer table.
	ftypes[dt_c]
	(
	  a_conj,
	  b_conj,
	  m_dim,
	  n_dim,
	  k_dim,
	  alpha_ptr,
	  a_ptr, a_rs, a_cs,
	  d_ptr, d_inc,
	  b_ptr, b_rs, b_cs,
	  beta_ptr,
	  c_ptr, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
//
// -- gemmd-like block-panel algorithm (typed interface) -----------------------
//
// Typed implementation of the gemmd block-panel algorithm (variant 2).
// The macro below generates one function per datatype. Each function runs
// five nested loops (JC, PC, IC, JR, IR), packing a panel of B inside the PC
// loop and a block of A inside the IC loop. The innermost loop does not call
// the microkernel directly; it calls the bao_?gemm_kernel() wrapper, which
// handles both interior and edge-case micro-tiles.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bao_,ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict d, inc_t incd, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for various blocksizes. */ \
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
/* Query the context for the microkernel address and cast it to its
function pointer type. (Disabled in this variant: the gemm_kernel
wrapper called in the IR loop performs this query itself.) */ \
/*
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
*/ \
\
/* Temporary C buffer for edge cases. (Disabled in this variant: the
temporary buffer and its strides are set up inside the gemm_kernel
wrapper called in the IR loop.) */ \
/*
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
*/ \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = cs_c; \
const inc_t jcstep_b = cs_b; \
\
const inc_t pcstep_a = cs_a; \
const inc_t pcstep_d = incd; \
const inc_t pcstep_b = rs_b; \
\
const inc_t icstep_c = rs_c; \
const inc_t icstep_a = rs_a; \
\
const inc_t jrstep_c = cs_c * NR; \
\
const inc_t irstep_c = rs_c * MR; \
\
ctype* restrict a_00 = a; \
ctype* restrict d_00 = d; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
/* Make local copies of the scalars to prevent any unnecessary sharing of
cache lines between the cores' caches. */ \
ctype alpha_local = *alpha_cast; \
ctype beta_local = *beta_cast; \
ctype one_local = *PASTEMAC(ch,1); \
/*ctype zero_local = *PASTEMAC(ch,0);*/ \
\
auxinfo_t aux; \
\
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
needed for the matrix we will be packing (if any), but we do it
unconditionally to be safe. */ \
mem_t mem_a = BLIS_MEM_INITIALIZER; \
mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
/* Define an array of bszid_t ids, which will act as our substitute for
the cntl_t tree. */ \
bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \
BLIS_KC, /* 4th loop */ \
BLIS_NO_PART, /* pack B */ \
BLIS_MC, /* 3rd loop */ \
BLIS_NO_PART, /* pack A */ \
BLIS_NR, /* 2nd loop */ \
BLIS_MR, /* 1st loop */ \
BLIS_KR }; /* microkernel loop */ \
\
bszid_t* restrict bszids_jc = &bszids[0]; \
bszid_t* restrict bszids_pc = &bszids[1]; \
/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
bszid_t* restrict bszids_ic = &bszids[3]; \
/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
bszid_t* restrict bszids_jr = &bszids[5]; \
/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
\
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \
thrinfo_t* restrict thread_pb = NULL; \
thrinfo_t* restrict thread_ic = NULL; \
thrinfo_t* restrict thread_pa = NULL; \
thrinfo_t* restrict thread_jr = NULL; \
thrinfo_t* restrict thread_ir = NULL; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_jc = thread; \
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
/* Compute the JC loop thread range for the current thread. */ \
dim_t jc_start, jc_end; \
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
const dim_t n_local = jc_end - jc_start; \
\
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = n_local % NC; \
\
/* Loop over the n dimension (NC columns of B and C at a time). */ \
for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
{ \
/* Calculate the thread's current JC block dimension. */ \
const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
\
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_pc = bli_thrinfo_sub_node( thread_jc ); \
bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
\
/* The PC loop is not partitioned among threads here: every thread
iterates over the entire k dimension, from 0 to k. */ \
const dim_t pc_start = 0, pc_end = k; \
const dim_t k_local = k; \
\
/* Compute number of primary and leftover components of the PC loop. */ \
/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
const dim_t pc_left = k_local % KC; \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
{ \
/* Calculate the thread's current PC block dimension. */ \
const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
\
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
ctype* restrict d_pc = d_00 + pp * pcstep_d; \
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop. Later
iterations accumulate into C and therefore use one instead. */ \
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
ctype* b_use; \
inc_t rs_b_use, cs_b_use, ps_b_use; \
\
/* Identify the current thrinfo_t node. Note that the thrinfo_t
node will have already been created by a previous call to
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
cause the tree to grow by two (e.g. to the next bszid that is
a normal bszid_t value). */ \
thread_pb = bli_thrinfo_sub_node( thread_pc ); \
/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
\
/* Determine the packing buffer and related parameters for matrix
B. Then call the packm implementation. */ \
PASTECH2(bao_,ch,packm_b) \
( \
conjb, \
KC, NC, \
kc_cur, nc_cur, NR, \
&one_local, \
d_pc, incd, \
b_pc, rs_b, cs_b, \
&b_use, &rs_b_use, &cs_b_use, \
&ps_b_use, \
cntx, \
rntm, \
&mem_b, \
thread_pb \
); \
\
/* Alias b_use so that it's clear this is our current block of
matrix B. */ \
ctype* restrict b_pc_use = b_use; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
/* Compute the IC loop thread range for the current thread. */ \
dim_t ic_start, ic_end; \
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
const dim_t m_local = ic_end - ic_start; \
\
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = m_local % MC; \
\
/* Loop over the m dimension (MC rows at a time). */ \
for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
{ \
/* Calculate the thread's current IC block dimension. */ \
const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
\
ctype* restrict a_ic = a_pc + ii * icstep_a; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
ctype* a_use; \
inc_t rs_a_use, cs_a_use, ps_a_use; \
\
/* Identify the current thrinfo_t node. Note that the thrinfo_t
node will have already been created by a previous call to
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
cause the tree to grow by two (e.g. to the next bszid that is
a normal bszid_t value). */ \
thread_pa = bli_thrinfo_sub_node( thread_ic ); \
/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
\
/* Determine the packing buffer and related parameters for matrix
A. Then call the packm implementation. */ \
PASTECH2(bao_,ch,packm_a) \
( \
conja, \
MC, KC, \
mc_cur, kc_cur, MR, \
&one_local, \
d_pc, incd, \
a_ic, rs_a, cs_a, \
&a_use, &rs_a_use, &cs_a_use, \
&ps_a_use, \
cntx, \
rntm, \
&mem_a, \
thread_pa \
); \
\
/* Alias a_use so that it's clear this is our current block of
matrix A. */ \
ctype* restrict a_ic_use = a_use; \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
/* Query the number of threads and thread ids for the JR loop.
NOTE: These values are only needed when computing the next
micropanel of B. */ \
const dim_t jr_nt = bli_thread_n_way( thread_jr ); \
const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
\
/* Compute number of primary and leftover components of the JR loop. */ \
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
dim_t jr_left = nc_cur % NR; \
\
/* Compute the JR loop thread range for the current thread. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
{ \
const dim_t nr_cur \
= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/* Assume for now that our next panel of B to be the current panel
of B. */ \
ctype* restrict b2 = b_jr; \
\
/* Identify the current thrinfo_t node. */ \
thread_ir = bli_thrinfo_sub_node( thread_jr ); \
\
/* Query the number of threads and thread ids for the IR loop.
NOTE: These values are only needed when computing the next
micropanel of A. */ \
const dim_t ir_nt = bli_thread_n_way( thread_ir ); \
const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
\
/* Compute number of primary and leftover components of the IR loop. */ \
dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
dim_t ir_left = mc_cur % MR; \
\
/* Compute the IR loop thread range for the current thread. */ \
dim_t ir_start, ir_end; \
bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
{ \
const dim_t mr_cur \
= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
\
ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
ctype* restrict c_ir = c_jr + i * irstep_c; \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next micropanels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_ic_use; \
b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_pc_use; \
} \
\
/* Save the addresses of next micropanels of A and B to the
auxinfo_t object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Call a wrapper to the kernel (which handles edge cases). */ \
PASTECH2(bao_,ch,gemm_kernel) \
( \
MR, \
NR, \
mr_cur, \
nr_cur, \
kc_cur, \
&alpha_local, \
a_ir, rs_a_use, cs_a_use, \
b_jr, rs_b_use, cs_b_use, \
beta_use, \
c_ir, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
\
/* This barrier is needed to prevent threads from starting to pack
the next row panel of B before the current row panel is fully
computed upon. */ \
bli_thread_barrier( thread_pb ); \
} \
} \
\
/* Release any memory that was acquired for packing matrices A and B. */ \
PASTECH2(bao_,ch,packm_finalize_mem_a) \
( \
rntm, \
&mem_a, \
thread_pa \
); \
PASTECH2(bao_,ch,packm_finalize_mem_b) \
( \
rntm, \
&mem_b, \
thread_pb \
); \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
*/ \
}
// Instantiate the typed variant once per floating-point datatype.
//INSERT_GENTFUNC_BASIC0( gemmd_bp_var2 )
GENTFUNC( float, s, gemmd_bp_var2 )
GENTFUNC( double, d, gemmd_bp_var2 )
GENTFUNC( scomplex, c, gemmd_bp_var2 )
GENTFUNC( dcomplex, z, gemmd_bp_var2 )
//
// -- gemm-like microkernel wrapper --------------------------------------------
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bao_,ch,varname) \
( \
const dim_t MR, \
const dim_t NR, \
dim_t mr_cur, \
dim_t nr_cur, \
dim_t kc_cur, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict aux, \
cntx_t* restrict cntx \
) \
{ \
/* Infer the datatype from the ctype. */ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for the microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype zero = *PASTEMAC(ch,0); \
\
/* Handle interior and edge cases separately. */ \
if ( mr_cur == MR && nr_cur == NR ) \
{ \
/* Invoke the gemm microkernel. */ \
gemm_ukr \
( \
kc_cur, \
alpha, \
a, \
b, \
beta, \
c, rs_c, cs_c, \
aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm microkernel. */ \
gemm_ukr \
( \
kc_cur, \
alpha, \
a, \
b, \
&zero, \
ct, rs_ct, cs_ct, \
aux, \
cntx \
); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn) \
( \
mr_cur, \
nr_cur, \
ct, rs_ct, cs_ct, \
beta, \
c, rs_c, cs_c \
); \
} \
}
//INSERT_GENTFUNC_BASIC0( gemm_kernel )
GENTFUNC( float, s, gemm_kernel )
GENTFUNC( double, d, gemm_kernel )
GENTFUNC( scomplex, c, gemm_kernel )
GENTFUNC( dcomplex, z, gemm_kernel )

View File

@@ -0,0 +1,131 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Validate the operands of a gemmd operation. Each check's result is handed
// straight to bli_check_error_code(), in a fixed order: datatypes, object
// kinds, buffers, dimensions, and finally datatype consistency with C.
void bao_gemmd_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  d,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	// Datatypes: the scalars must be non-integer objects, and a, d, b, and c
	// must be floating-point objects.
	bli_check_error_code( bli_check_noninteger_object( alpha ) );
	bli_check_error_code( bli_check_noninteger_object( beta ) );
	bli_check_error_code( bli_check_floating_object( a ) );
	bli_check_error_code( bli_check_floating_object( d ) );
	bli_check_error_code( bli_check_floating_object( b ) );
	bli_check_error_code( bli_check_floating_object( c ) );

	// Object kinds: alpha and beta are scalars, d is a vector, and a, b, and
	// c are matrices.
	bli_check_error_code( bli_check_scalar_object( alpha ) );
	bli_check_error_code( bli_check_scalar_object( beta ) );
	bli_check_error_code( bli_check_matrix_object( a ) );
	bli_check_error_code( bli_check_vector_object( d ) );
	bli_check_error_code( bli_check_matrix_object( b ) );
	bli_check_error_code( bli_check_matrix_object( c ) );

	// Buffers: run the object buffer check on every operand.
	bli_check_error_code( bli_check_object_buffer( alpha ) );
	bli_check_error_code( bli_check_object_buffer( a ) );
	bli_check_error_code( bli_check_object_buffer( d ) );
	bli_check_error_code( bli_check_object_buffer( b ) );
	bli_check_error_code( bli_check_object_buffer( beta ) );
	bli_check_error_code( bli_check_object_buffer( c ) );

	// Dimensions: a, b, and c must pass the level-3 dimension check, and the
	// length of d must equal the width of a after transposition.
	bli_check_error_code( bli_check_level3_dims( a, b, c ) );
	bli_check_error_code
	(
	  bli_check_vector_dim_equals( d, bli_obj_width_after_trans( a ) )
	);

	// Datatype consistency: a, d, and b are each compared against c. These
	// comparisons are performed unconditionally.
	bli_check_error_code( bli_check_consistent_object_datatypes( c, a ) );
	bli_check_error_code( bli_check_consistent_object_datatypes( c, d ) );
	bli_check_error_code( bli_check_consistent_object_datatypes( c, b ) );
}

View File

@@ -0,0 +1,50 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based check functions.
//
// Checks the operands of a gemmd operation: scalars alpha and beta, matrices
// a, b, and c, and vector d.
void bao_gemmd_check
(
obj_t* alpha,
obj_t* a,
obj_t* d,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx
);

126
addon/gemmd/bao_gemmd_var.h Normal file
View File

@@ -0,0 +1,126 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype the object-based variant interfaces.
//
// GENPROT( opname ) declares the object-based function bao_<opname>(). Both
// block-panel variants declared below share this one signature.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTECH(bao_,opname) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* d, \
obj_t* b, \
obj_t* beta, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
);
GENPROT( gemmd_bp_var1 )
GENPROT( gemmd_bp_var2 )
//
// Prototype the typed variant interfaces.
//
// GENTPROT( ctype, ch, varname ) declares the typed function
// bao_<ch><varname>(). Scalars and operands are passed as void*, so the
// declared signature is the same for every datatype instance. It is
// instantiated below for the s, d, c, and z type characters of each variant.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTECH2(bao_,ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict d, inc_t incd, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
);
//INSERT_GENTPROT_BASIC0( gemmd_bp_var1 )
GENTPROT( float, s, gemmd_bp_var1 )
GENTPROT( double, d, gemmd_bp_var1 )
GENTPROT( scomplex, c, gemmd_bp_var1 )
GENTPROT( dcomplex, z, gemmd_bp_var1 )
//INSERT_GENTPROT_BASIC0( gemmd_bp_var2 )
GENTPROT( float, s, gemmd_bp_var2 )
GENTPROT( double, d, gemmd_bp_var2 )
GENTPROT( scomplex, c, gemmd_bp_var2 )
GENTPROT( dcomplex, z, gemmd_bp_var2 )
//
// Prototype the typed kernel interfaces.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTECH2(bao_,ch,varname) \
( \
const dim_t MR, \
const dim_t NR, \
dim_t mr_cur, \
dim_t nr_cur, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict aux, \
cntx_t* restrict cntx \
);
//INSERT_GENTPROT_BASIC0( gemm_kernel )
GENTPROT( float, s, gemm_kernel )
GENTPROT( double, d, gemm_kernel )
GENTPROT( scomplex, c, gemm_kernel )
GENTPROT( dcomplex, z, gemm_kernel )

View File

@@ -0,0 +1,330 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define the typed functions that make sure a sufficiently large block from
// the packed block allocator is attached to the caller's mem_t before matrix
// A is packed.
//
//   m, k    - dimensions used to size the buffer; m is rounded up to a
//             multiple of mr.
//   mr      - micropanel dimension used for that rounding.
//   cntx    - not referenced by this function.
//   rntm    - handed to bli_pba_acquire_m() and bli_pba_release().
//   mem     - in/out: the calling thread's mem_t entry.
//   thread  - thrinfo_t node used for the barrier, the chief test and the
//             broadcast.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	/* Request blocks from the pool that is dedicated to blocks of A. */ \
	const packbuf_t buf_type = BLIS_BUFFER_FOR_A_BLOCK; \
\
	/* Round m up to a whole number of micropanels. The final micropanel
	   must be stored with the same leading dimension as all the others
	   because the microkernel assumes that neither the register nor the
	   storage blocksizes change from one micropanel to the next. */ \
	dim_t n_upanels = m / mr; \
	if ( m % mr ) n_upanels += 1; \
	const dim_t m_padded = n_upanels * mr; \
\
	/* Wait until every thread is ready to begin the packm stage. */ \
	bli_thread_barrier( thread ); \
\
	/* Number of bytes the packed block of A requires. */ \
	const siz_t bytes_needed = sizeof( ctype ) * m_padded * k; \
\
	/* Does the caller's mem_t already refer to a (cached) block? */ \
	const bool has_block = !bli_mem_is_unalloc( mem ); \
\
	/* A cached block that is already large enough is used as-is. */ \
	if ( has_block && bli_mem_size( mem ) >= bytes_needed ) return; \
\
	/* Otherwise the chief thread obtains a block, first giving back the
	   undersized one if there was one. The block is acquired directly into
	   the chief's passed-in mem_t rather than into a local temporary:
	   there is no barrier until after packing has finished, so a temporary
	   could go out of scope before the other threads had copied from it. */ \
	if ( bli_thread_am_ochief( thread ) ) \
	{ \
		if ( has_block ) bli_pba_release( rntm, mem ); \
\
		bli_pba_acquire_m( rntm, bytes_needed, buf_type, mem ); \
	} \
\
	/* Every thread learns the address of the chief's mem_t ... */ \
	mem_t* chief_mem = bli_thread_broadcast( thread, mem ); \
\
	/* ... and every thread other than the chief copies its contents into
	   its own passed-in mem_t. */ \
	if ( !bli_thread_am_ochief( thread ) ) *mem = *chief_mem; \
}

//INSERT_GENTFUNC_BASIC0( packm_init_mem_a )
GENTFUNC( float,    s, packm_init_mem_a )
GENTFUNC( double,   d, packm_init_mem_a )
GENTFUNC( scomplex, c, packm_init_mem_a )
GENTFUNC( dcomplex, z, packm_init_mem_a )
//
// Define the typed functions that hand the packing block for matrix A back
// to the packed block allocator.
//
//   rntm    - handed to bli_pba_release().
//   mem     - mem_t entry whose block is released, if it is allocated.
//   thread  - thrinfo_t node; may be NULL, in which case nothing is done.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	/* Without a thrinfo_t node there is nothing to do. */ \
	if ( thread == NULL ) return; \
\
	/* Only the chief thread performs the release. */ \
	if ( !bli_thread_am_ochief( thread ) ) return; \
\
	/* The entry is expected to be allocated at this point; release it only
	   if that is really the case. */ \
	if ( bli_mem_is_alloc( mem ) ) \
		bli_pba_release( rntm, mem ); \
}

//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a )
GENTFUNC( float,    s, packm_finalize_mem_a )
GENTFUNC( double,   d, packm_finalize_mem_a )
GENTFUNC( scomplex, c, packm_finalize_mem_a )
GENTFUNC( dcomplex, z, packm_finalize_mem_a )
//
// Define the typed functions that compute the packing parameters for matrix
// A and fetch the destination buffer from an already-initialized mem_t.
//
//   schema        - out: set to BLIS_PACKED_ROW_PANELS.
//   m, k, mr      - in:  dimensions of the block of A and the micropanel
//                        dimension.
//   m_max, k_max  - out: m rounded up to a multiple of mr, and k.
//   p             - out: buffer address stored in mem.
//   rs_p, cs_p    - out: strides within a micropanel (1 and mr).
//   pd_p, ps_p    - out: panel dimension (mr) and panel stride (mr * k).
//   mem           - in:  mem_t entry that holds the packing buffer.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       pack_t*    restrict schema, \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       dim_t*     restrict m_max, \
       dim_t*     restrict k_max, \
       ctype**             p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              dim_t* restrict pd_p, inc_t* restrict ps_p, \
       mem_t*     restrict mem  \
     ) \
{ \
	/* Count the micropanels needed to cover m rows, including a final
	   partial one. That last micropanel must use the same leading
	   dimension (cs_p) as the others because the microkernel assumes that
	   the register and storage blocksizes never change. */ \
	dim_t n_upanels = m / mr; \
	if ( m % mr ) n_upanels += 1; \
\
	/* Padded dimensions of the packed matrix. */ \
	*m_max = n_upanels * mr; \
	*k_max = k; \
\
	/* A is packed to column-stored row panels: unit row stride, mr
	   elements between columns, and mr * k elements between panels. */ \
	*rs_p = 1; \
	*cs_p = mr; \
	*pd_p = mr; \
	*ps_p = mr * k; \
\
	/* "Packed row panels" denotes conventional column-stored row panels. */ \
	*schema = BLIS_PACKED_ROW_PANELS; \
\
	/* Point the caller at the memory associated with the mem_t entry. */ \
	*p = bli_mem_buffer( mem ); \
}

//INSERT_GENTFUNC_BASIC0( packm_init_a )
GENTFUNC( float,    s, packm_init_a )
GENTFUNC( double,   d, packm_init_a )
GENTFUNC( scomplex, c, packm_init_a )
GENTFUNC( dcomplex, z, packm_init_a )
//
// Define the typed driver that packs matrix A: it prepares the buffer,
// computes the packing parameters, packs via packm_var1, and then
// synchronizes the threads.
//
//   conj              - conjugation to apply; forwarded to packm_var1.
//   m_alloc, k_alloc  - dimensions used to size the packing buffer.
//   m, k              - dimensions of the block of A actually packed.
//   mr                - micropanel dimension.
//   kappa, d, incd    - forwarded unchanged to packm_var1.
//   a, rs_a, cs_a     - source matrix and its strides.
//   p, rs_p, cs_p,
//   ps_p              - out: packed buffer, its strides and panel stride.
//   cntx, rntm, mem,
//   thread            - context, runtime, mem_t entry and thrinfo_t node.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       conj_t              conj, \
       dim_t               m_alloc, \
       dim_t               k_alloc, \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       ctype*     restrict kappa, \
       ctype*     restrict d, inc_t           incd, \
       ctype*     restrict a, inc_t           rs_a, inc_t           cs_a, \
       ctype**    restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              inc_t* restrict ps_p, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	pack_t pack_schema; \
	dim_t  m_padded; \
	dim_t  k_padded; \
	dim_t  panel_dim; \
\
	/* Step 1: make sure mem refers to a block large enough for an
	   m_alloc x k_alloc packed matrix. */ \
	PASTECH2(bao_,ch,packm_init_mem_a) \
	( m_alloc, k_alloc, mr, cntx, rntm, mem, thread ); \
\
	/* Step 2: obtain the buffer address, strides and schema for packing
	   the m x k block of A. */ \
	PASTECH2(bao_,ch,packm_init_a) \
	( \
	  &pack_schema, \
	  m, k, mr, \
	  &m_padded, &k_padded, \
	  p, rs_p, cs_p, \
	  &panel_dim, ps_p, \
	  mem \
	); \
\
	/* Step 3: pack A into column-stored MR x k micropanels. */ \
	PASTECH2(bao_,ch,packm_var1) \
	( \
	  conj, pack_schema, \
	  m, k, \
	  m_padded, k_padded, \
	  kappa, \
	  d, incd, \
	  a, rs_a, cs_a, \
	  *p, *rs_p, *cs_p, \
	  panel_dim, *ps_p, \
	  cntx, \
	  thread \
	); \
\
	/* All packing must be complete before any thread starts computing. */ \
	bli_thread_barrier( thread ); \
}

//INSERT_GENTFUNC_BASIC0( packm_a )
GENTFUNC( float,    s, packm_a )
GENTFUNC( double,   d, packm_a )
GENTFUNC( scomplex, c, packm_a )
GENTFUNC( dcomplex, z, packm_a )

View File

@@ -0,0 +1,123 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype the typed functions that make sure a sufficiently large block
// is attached to the caller's mem_t before matrix A is packed.
//
//   m, k    - dimensions used to size the buffer (m is rounded up to a
//             multiple of mr).
//   mr      - micropanel dimension.
//   cntx    - context.
//   rntm    - runtime passed to the packed block allocator.
//   mem     - in/out: the calling thread's mem_t entry.
//   thread  - thrinfo_t node of the calling thread.
//
// NOTE: The macro must end on the ");" line. A trailing backslash there
// would splice the next physical line into the macro body.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     );

//INSERT_GENTPROT_BASIC0( packm_init_mem_a )
GENTPROT( float,    s, packm_init_mem_a )
GENTPROT( double,   d, packm_init_mem_a )
GENTPROT( scomplex, c, packm_init_mem_a )
GENTPROT( dcomplex, z, packm_init_mem_a )
//
// Prototype the typed functions that release the packing block for matrix A.
//
//   rntm    - runtime passed to the packed block allocator.
//   mem     - mem_t entry whose block is released.
//   thread  - thrinfo_t node of the calling thread; may be NULL.
//
// NOTE: The macro must end on the ");" line. A trailing backslash there
// would splice the next physical line into the macro body.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     );

//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a )
GENTPROT( float,    s, packm_finalize_mem_a )
GENTPROT( double,   d, packm_finalize_mem_a )
GENTPROT( scomplex, c, packm_finalize_mem_a )
GENTPROT( dcomplex, z, packm_finalize_mem_a )
//
// Prototype the typed functions that compute the packing parameters for
// matrix A.
//
//   schema        - out: pack schema.
//   m, k, mr      - in:  dimensions of the block of A and the micropanel
//                        dimension.
//   m_max, k_max  - out: padded dimensions of the packed matrix.
//   p             - out: address of the packing buffer.
//   rs_p, cs_p    - out: strides within a micropanel.
//   pd_p, ps_p    - out: panel dimension and panel stride.
//   mem           - in:  mem_t entry that holds the packing buffer.
//
// NOTE: The macro must end on the ");" line. A trailing backslash there
// would splice the next physical line into the macro body.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       pack_t*    restrict schema, \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       dim_t*     restrict m_max, \
       dim_t*     restrict k_max, \
       ctype**             p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              dim_t* restrict pd_p, inc_t* restrict ps_p, \
       mem_t*     restrict mem  \
     );

//INSERT_GENTPROT_BASIC0( packm_init_a )
GENTPROT( float,    s, packm_init_a )
GENTPROT( double,   d, packm_init_a )
GENTPROT( scomplex, c, packm_init_a )
GENTPROT( dcomplex, z, packm_init_a )
//
// Prototype the typed driver that packs matrix A.
//
//   conj              - conjugation to apply while packing.
//   m_alloc, k_alloc  - dimensions used to size the packing buffer.
//   m, k              - dimensions of the block of A actually packed.
//   mr                - micropanel dimension.
//   kappa, d, incd    - values forwarded to the packing variant.
//   a, rs_a, cs_a     - source matrix and its strides.
//   p, rs_p, cs_p,
//   ps_p              - out: packed buffer, its strides and panel stride.
//   cntx, rntm, mem,
//   thread            - context, runtime, mem_t entry and thrinfo_t node.
//
// NOTE: The macro must end on the ");" line. A trailing backslash there
// would splice the next physical line into the macro body.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       conj_t              conj, \
       dim_t               m_alloc, \
       dim_t               k_alloc, \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       ctype*     restrict kappa, \
       ctype*     restrict d, inc_t           incd, \
       ctype*     restrict a, inc_t           rs_a, inc_t           cs_a, \
       ctype**    restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              inc_t* restrict ps_p, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     );

//INSERT_GENTPROT_BASIC0( packm_a )
GENTPROT( float,    s, packm_a )
GENTPROT( double,   d, packm_a )
GENTPROT( scomplex, c, packm_a )
GENTPROT( dcomplex, z, packm_a )

View File

@@ -0,0 +1,330 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define the typed functions that make sure a sufficiently large block from
// the packed block allocator is attached to the caller's mem_t before matrix
// B is packed.
//
//   k, n    - dimensions used to size the buffer; n is rounded up to a
//             multiple of nr.
//   nr      - micropanel dimension used for that rounding.
//   cntx    - not referenced by this function.
//   rntm    - handed to bli_pba_acquire_m() and bli_pba_release().
//   mem     - in/out: the calling thread's mem_t entry.
//   thread  - thrinfo_t node used for the barrier, the chief test and the
//             broadcast.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       dim_t               k, \
       dim_t               n, \
       dim_t               nr, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	/* Request blocks from the pool that is dedicated to panels of B. */ \
	const packbuf_t buf_type = BLIS_BUFFER_FOR_B_PANEL; \
\
	/* Round n up to a whole number of micropanels. The final micropanel
	   must be stored with the same leading dimension as all the others
	   because the microkernel assumes that neither the register nor the
	   storage blocksizes change from one micropanel to the next. */ \
	dim_t n_upanels = n / nr; \
	if ( n % nr ) n_upanels += 1; \
	const dim_t n_padded = n_upanels * nr; \
\
	/* Wait until every thread is ready to begin the packm stage. */ \
	bli_thread_barrier( thread ); \
\
	/* Number of bytes the packed panel of B requires. */ \
	const siz_t bytes_needed = sizeof( ctype ) * k * n_padded; \
\
	/* Does the caller's mem_t already refer to a (cached) block? */ \
	const bool has_block = !bli_mem_is_unalloc( mem ); \
\
	/* A cached block that is already large enough is used as-is. */ \
	if ( has_block && bli_mem_size( mem ) >= bytes_needed ) return; \
\
	/* Otherwise the chief thread obtains a block, first giving back the
	   undersized one if there was one. The block is acquired directly into
	   the chief's passed-in mem_t rather than into a local temporary:
	   there is no barrier until after packing has finished, so a temporary
	   could go out of scope before the other threads had copied from it. */ \
	if ( bli_thread_am_ochief( thread ) ) \
	{ \
		if ( has_block ) bli_pba_release( rntm, mem ); \
\
		bli_pba_acquire_m( rntm, bytes_needed, buf_type, mem ); \
	} \
\
	/* Every thread learns the address of the chief's mem_t ... */ \
	mem_t* chief_mem = bli_thread_broadcast( thread, mem ); \
\
	/* ... and every thread other than the chief copies its contents into
	   its own passed-in mem_t. */ \
	if ( !bli_thread_am_ochief( thread ) ) *mem = *chief_mem; \
}

//INSERT_GENTFUNC_BASIC0( packm_init_mem_b )
GENTFUNC( float,    s, packm_init_mem_b )
GENTFUNC( double,   d, packm_init_mem_b )
GENTFUNC( scomplex, c, packm_init_mem_b )
GENTFUNC( dcomplex, z, packm_init_mem_b )
//
// Define the typed functions that hand the packing block for matrix B back
// to the packed block allocator.
//
//   rntm    - handed to bli_pba_release().
//   mem     - mem_t entry whose block is released, if it is allocated.
//   thread  - thrinfo_t node; may be NULL, in which case nothing is done.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	/* Without a thrinfo_t node there is nothing to do. */ \
	if ( thread == NULL ) return; \
\
	/* Only the chief thread performs the release. */ \
	if ( !bli_thread_am_ochief( thread ) ) return; \
\
	/* The entry is expected to be allocated at this point; release it only
	   if that is really the case. */ \
	if ( bli_mem_is_alloc( mem ) ) \
		bli_pba_release( rntm, mem ); \
}

//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b )
GENTFUNC( float,    s, packm_finalize_mem_b )
GENTFUNC( double,   d, packm_finalize_mem_b )
GENTFUNC( scomplex, c, packm_finalize_mem_b )
GENTFUNC( dcomplex, z, packm_finalize_mem_b )
//
// Define the typed functions that compute the packing parameters for matrix
// B and fetch the destination buffer from an already-initialized mem_t.
//
//   schema        - out: set to BLIS_PACKED_COL_PANELS.
//   k, n, nr      - in:  dimensions of the panel of B and the micropanel
//                        dimension.
//   k_max, n_max  - out: k, and n rounded up to a multiple of nr.
//   p             - out: buffer address stored in mem.
//   rs_p, cs_p    - out: strides within a micropanel (nr and 1).
//   pd_p, ps_p    - out: panel dimension (nr) and panel stride (k * nr).
//   mem           - in:  mem_t entry that holds the packing buffer.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       pack_t*    restrict schema, \
       dim_t               k, \
       dim_t               n, \
       dim_t               nr, \
       dim_t*     restrict k_max, \
       dim_t*     restrict n_max, \
       ctype**             p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              dim_t* restrict pd_p, inc_t* restrict ps_p, \
       mem_t*     restrict mem  \
     ) \
{ \
	/* Count the micropanels needed to cover n columns, including a final
	   partial one. That last micropanel must use the same leading
	   dimension (rs_p) as the others because the microkernel assumes that
	   the register and storage blocksizes never change. */ \
	dim_t n_upanels = n / nr; \
	if ( n % nr ) n_upanels += 1; \
\
	/* Padded dimensions of the packed matrix. */ \
	*k_max = k; \
	*n_max = n_upanels * nr; \
\
	/* B is packed to row-stored column panels: nr elements between rows,
	   unit column stride, and k * nr elements between panels. */ \
	*rs_p = nr; \
	*cs_p = 1; \
	*pd_p = nr; \
	*ps_p = k * nr; \
\
	/* "Packed column panels" denotes conventional row-stored column
	   panels. */ \
	*schema = BLIS_PACKED_COL_PANELS; \
\
	/* Point the caller at the memory associated with the mem_t entry. */ \
	*p = bli_mem_buffer( mem ); \
}

//INSERT_GENTFUNC_BASIC0( packm_init_b )
GENTFUNC( float,    s, packm_init_b )
GENTFUNC( double,   d, packm_init_b )
GENTFUNC( scomplex, c, packm_init_b )
GENTFUNC( dcomplex, z, packm_init_b )
//
// Define the typed driver that packs matrix B: it prepares the buffer,
// computes the packing parameters, packs via packm_var1, and then
// synchronizes the threads.
//
//   conj              - conjugation to apply; forwarded to packm_var1.
//   k_alloc, n_alloc  - dimensions used to size the packing buffer.
//   k, n              - dimensions of the panel of B actually packed.
//   nr                - micropanel dimension.
//   kappa, d, incd    - forwarded unchanged to packm_var1.
//   b, rs_b, cs_b     - source matrix and its strides.
//   p, rs_p, cs_p,
//   ps_p              - out: packed buffer, its strides and panel stride.
//   cntx, rntm, mem,
//   thread            - context, runtime, mem_t entry and thrinfo_t node.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       conj_t              conj, \
       dim_t               k_alloc, \
       dim_t               n_alloc, \
       dim_t               k, \
       dim_t               n, \
       dim_t               nr, \
       ctype*     restrict kappa, \
       ctype*     restrict d, inc_t           incd, \
       ctype*     restrict b, inc_t           rs_b, inc_t           cs_b, \
       ctype**    restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              inc_t* restrict ps_p, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	pack_t pack_schema; \
	dim_t  k_padded; \
	dim_t  n_padded; \
	dim_t  panel_dim; \
\
	/* Step 1: make sure mem refers to a block large enough for a
	   k_alloc x n_alloc packed matrix. */ \
	PASTECH2(bao_,ch,packm_init_mem_b) \
	( k_alloc, n_alloc, nr, cntx, rntm, mem, thread ); \
\
	/* Step 2: obtain the buffer address, strides and schema for packing
	   the k x n panel of B. */ \
	PASTECH2(bao_,ch,packm_init_b) \
	( \
	  &pack_schema, \
	  k, n, nr, \
	  &k_padded, &n_padded, \
	  p, rs_p, cs_p, \
	  &panel_dim, ps_p, \
	  mem \
	); \
\
	/* Step 3: pack B into row-stored k x NR micropanels. */ \
	PASTECH2(bao_,ch,packm_var1) \
	( \
	  conj, pack_schema, \
	  k, n, \
	  k_padded, n_padded, \
	  kappa, \
	  d, incd, \
	  b, rs_b, cs_b, \
	  *p, *rs_p, *cs_p, \
	  panel_dim, *ps_p, \
	  cntx, \
	  thread \
	); \
\
	/* All packing must be complete before any thread starts computing. */ \
	bli_thread_barrier( thread ); \
}

//INSERT_GENTFUNC_BASIC0( packm_b )
GENTFUNC( float,    s, packm_b )
GENTFUNC( double,   d, packm_b )
GENTFUNC( scomplex, c, packm_b )
GENTFUNC( dcomplex, z, packm_b )

View File

@@ -0,0 +1,123 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype the typed functions that make sure a sufficiently large block
// is attached to the caller's mem_t before matrix B is packed.
//
//   k, n    - dimensions used to size the buffer (n is rounded up to a
//             multiple of nr).
//   nr      - micropanel dimension.
//   cntx    - context.
//   rntm    - runtime passed to the packed block allocator.
//   mem     - in/out: the calling thread's mem_t entry.
//   thread  - thrinfo_t node of the calling thread.
//
// NOTE: The macro must end on the ");" line. A trailing backslash there
// would splice the next physical line into the macro body.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       dim_t               k, \
       dim_t               n, \
       dim_t               nr, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     );

//INSERT_GENTPROT_BASIC0( packm_init_mem_b )
GENTPROT( float,    s, packm_init_mem_b )
GENTPROT( double,   d, packm_init_mem_b )
GENTPROT( scomplex, c, packm_init_mem_b )
GENTPROT( dcomplex, z, packm_init_mem_b )
//
// Prototype the typed functions that release the packing block for matrix B.
//
//   rntm    - runtime passed to the packed block allocator.
//   mem     - mem_t entry whose block is released.
//   thread  - thrinfo_t node of the calling thread; may be NULL.
//
// NOTE: The macro must end on the ");" line. A trailing backslash there
// would splice the next physical line into the macro body.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     );

//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b )
GENTPROT( float,    s, packm_finalize_mem_b )
GENTPROT( double,   d, packm_finalize_mem_b )
GENTPROT( scomplex, c, packm_finalize_mem_b )
GENTPROT( dcomplex, z, packm_finalize_mem_b )
//
// Prototype the typed functions that compute the packing parameters for
// matrix B.
//
//   schema        - out: pack schema.
//   k, n, nr      - in:  dimensions of the panel of B and the micropanel
//                        dimension.
//   k_max, n_max  - out: padded dimensions of the packed matrix.
//   p             - out: address of the packing buffer.
//   rs_p, cs_p    - out: strides within a micropanel.
//   pd_p, ps_p    - out: panel dimension and panel stride.
//   mem           - in:  mem_t entry that holds the packing buffer.
//
// NOTE: The macro must end on the ");" line. A trailing backslash there
// would splice the next physical line into the macro body.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       pack_t*    restrict schema, \
       dim_t               k, \
       dim_t               n, \
       dim_t               nr, \
       dim_t*     restrict k_max, \
       dim_t*     restrict n_max, \
       ctype**             p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              dim_t* restrict pd_p, inc_t* restrict ps_p, \
       mem_t*     restrict mem  \
     );

//INSERT_GENTPROT_BASIC0( packm_init_b )
GENTPROT( float,    s, packm_init_b )
GENTPROT( double,   d, packm_init_b )
GENTPROT( scomplex, c, packm_init_b )
GENTPROT( dcomplex, z, packm_init_b )
//
// Prototype the typed driver that packs matrix B.
//
//   conj              - conjugation to apply while packing.
//   k_alloc, n_alloc  - dimensions used to size the packing buffer.
//   k, n              - dimensions of the panel of B actually packed.
//   nr                - micropanel dimension.
//   kappa, d, incd    - values forwarded to the packing variant.
//   b, rs_b, cs_b     - source matrix and its strides.
//   p, rs_p, cs_p,
//   ps_p              - out: packed buffer, its strides and panel stride.
//   cntx, rntm, mem,
//   thread            - context, runtime, mem_t entry and thrinfo_t node.
//
// NOTE: The macro must end on the ");" line. A trailing backslash there
// would splice the next physical line into the macro body.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       conj_t              conj, \
       dim_t               k_alloc, \
       dim_t               n_alloc, \
       dim_t               k, \
       dim_t               n, \
       dim_t               nr, \
       ctype*     restrict kappa, \
       ctype*     restrict d, inc_t           incd, \
       ctype*     restrict b, inc_t           rs_b, inc_t           cs_b, \
       ctype**    restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              inc_t* restrict ps_p, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     );

//INSERT_GENTPROT_BASIC0( packm_b )
GENTPROT( float,    s, packm_b )
GENTPROT( double,   d, packm_b )
GENTPROT( scomplex, c, packm_b )
GENTPROT( dcomplex, z, packm_b )

View File

@@ -0,0 +1,69 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype the typed packing variants. packm_var1 and packm_var2 share a
// single signature:
//
//   transc        - only its conjugation bit is used by the variants.
//   schema        - pack schema selecting row or column panels.
//   m, n          - dimensions of the source matrix.
//   m_max, n_max  - padded dimensions of the packed matrix.
//   kappa         - scalar forwarded to the packing kernel.
//   d, incd       - vector and increment forwarded to the packing kernel.
//   c, rs_c, cs_c - source matrix and its strides.
//   p, rs_p, cs_p - destination buffer and its strides.
//   pd_p, ps_p    - panel dimension and panel stride of the destination.
//   cntx, thread  - context and thrinfo_t node of the calling thread.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTECH2(bao_,ch,varname) \
     ( \
       trans_t             transc, \
       pack_t              schema, \
       dim_t               m, \
       dim_t               n, \
       dim_t               m_max, \
       dim_t               n_max, \
       ctype*     restrict kappa, \
       ctype*     restrict d, \
       inc_t               incd, \
       ctype*     restrict c, \
       inc_t               rs_c, \
       inc_t               cs_c, \
       ctype*     restrict p, \
       inc_t               rs_p, \
       inc_t               cs_p, \
       dim_t               pd_p, \
       inc_t               ps_p, \
       cntx_t*    restrict cntx, \
       thrinfo_t* restrict thread  \
     );

//INSERT_GENTPROT_BASIC0( packm_var1 )
GENTPROT( float,    s, packm_var1 )
GENTPROT( double,   d, packm_var1 )
GENTPROT( scomplex, c, packm_var1 )
GENTPROT( dcomplex, z, packm_var1 )

//INSERT_GENTPROT_BASIC0( packm_var2 )
GENTPROT( float,    s, packm_var2 )
GENTPROT( double,   d, packm_var2 )
GENTPROT( scomplex, c, packm_var2 )
GENTPROT( dcomplex, z, packm_var2 )

View File

@@ -0,0 +1,195 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Variant 1 packs a matrix one micropanel at a time, calling packm_cxk() on
// every micropanel that belongs to the calling thread.
//
//   transc        - only its conjugation bit is used.
//   schema        - pack schema; selects row-stored column panels or
//                   column-stored row panels.
//   m, n          - dimensions of the source matrix c.
//   m_max, n_max  - padded dimensions of the packed matrix.
//   kappa         - scalar forwarded to packm_cxk().
//   d, incd       - vector and increment forwarded to packm_cxk().
//   c, rs_c, cs_c - source matrix and its strides.
//   p, rs_p, cs_p - destination buffer and its strides; the strides are used
//                   only to select the leading dimension of a micropanel.
//   pd_p, ps_p    - panel dimension and stride between consecutive panels.
//   cntx          - context forwarded to packm_cxk().
//   thread        - thrinfo_t node that determines which micropanels this
//                   thread packs.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bao_,ch,varname) \
     ( \
       trans_t             transc, \
       pack_t              schema, \
       dim_t               m, \
       dim_t               n, \
       dim_t               m_max, \
       dim_t               n_max, \
       ctype*     restrict kappa, \
       ctype*     restrict d, inc_t incd, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       ctype*     restrict p, inc_t rs_p, inc_t cs_p, \
                              dim_t pd_p, inc_t ps_p, \
       cntx_t*    restrict cntx, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	/* Only the conjugation bit of the transposition argument matters. */ \
	const conj_t conjc = bli_extract_conj( transc ); \
\
	/* The schema bit describes the shape of a micropanel rather than the
	   storage inside it, so "column packed" means row-stored column
	   panels and anything else means column-stored row panels. */ \
	const bool row_stored = bli_is_col_packed( schema ); \
\
	/* The width of a full micropanel is the same in both cases. */ \
	const dim_t panel_dim_max = pd_p; \
\
	/* Select the dimension that is partitioned into micropanels, the
	   length of each micropanel, and the matching strides of c and p. */ \
	const dim_t iter_dim      = ( row_stored ? n     : m     ); \
	const dim_t panel_len     = ( row_stored ? m     : n     ); \
	const dim_t panel_len_max = ( row_stored ? m_max : n_max ); \
	const inc_t incc          = ( row_stored ? cs_c  : rs_c  ); \
	const inc_t ldc           = ( row_stored ? rs_c  : cs_c  ); \
	const inc_t ldp           = ( row_stored ? rs_p  : cs_p  ); \
\
	/* Number of micropanels, counting a final partial one. */ \
	dim_t n_iter = iter_dim / panel_dim_max; \
	if ( iter_dim % panel_dim_max ) n_iter += 1; \
\
	/* Thread count and work id from this thread's packm thrinfo_t node. */ \
	const dim_t nt  = bli_thread_n_way( thread ); \
	const dim_t tid = bli_thread_work_id( thread ); \
\
	/* Silence warnings when the partitioning scheme ignores them (as slab
	   partitioning does). */ \
	( void )nt; \
	( void )tid; \
\
	/* Range of micropanels assigned to this thread. The definition of
	   bli_thread_range_jrir() depends on whether slab or round-robin
	   partitioning was requested at configure-time. */ \
	dim_t it_start, it_end, it_inc; \
	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
	/* Visit every logical micropanel of the source matrix. */ \
	for ( dim_t it = 0; it < n_iter; ++it ) \
	{ \
		/* bli_packm_my_iter() likewise depends on the partitioning scheme
		   chosen at configure-time (slab by default). */ \
		if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
		{ \
			/* Offset of this micropanel along the partitioned dimension,
			   and its width (smaller for a final partial micropanel). */ \
			const dim_t ic        = it * panel_dim_max; \
			const dim_t panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \
\
			/* Micropanel it begins ic elements into c and it panel strides
			   into p. */ \
			ctype* restrict c_use = c + ic * incc; \
			ctype* restrict p_use = p + it * ps_p; \
\
			PASTECH2(bao_,ch,packm_cxk) \
			( \
			  conjc, \
			  schema, \
			  panel_dim, \
			  panel_dim_max, \
			  panel_len, \
			  panel_len_max, \
			  kappa, \
			  d, incd, \
			  c_use, incc, ldc, \
			  p_use,       ldp, \
			  cntx  \
			); \
\
			/* Debugging aid: the packed micropanel at p_use may be printed
			   here with PASTEMAC(ch,fprintm)(). */ \
		} \
	} \
}

//INSERT_GENTFUNC_BASIC0( packm_var1 )
GENTFUNC( float,    s, packm_var1 )
GENTFUNC( double,   d, packm_var1 )
GENTFUNC( scomplex, c, packm_var1 )
GENTFUNC( dcomplex, z, packm_var1 )

View File

@@ -0,0 +1,245 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Variant 2 is similar to variant 1, but inlines the contents of packm_cxk().
//
//
// bao_?packm_var2(): pack the m x n matrix C into a sequence of micropanels
// stored in P, with the element-copy loops written inline rather than
// delegated to bao_?packm_cxk().
//
//   transc        Only the conjugation bit is used; when set, elements of C
//                 are conjugated as they are copied.
//   schema        If bli_is_col_packed( schema ), micropanels are taken along
//                 n (panel length m); otherwise along m (panel length n).
//   m, n          Dimensions of the part of C that is actually copied.
//   m_max, n_max  Padded dimensions; the region beyond m/n up to these values
//                 is zero-filled in each micropanel.
//   kappa         Must be equal to one; any other value calls bli_abort().
//   d, incd       Optional vector. If d is NULL, elements are copied
//                 unscaled. Otherwise the element at offset l along the panel
//                 length is multiplied by d[ l*incd ], matching what
//                 bao_?packm_cxk() does for a non-NULL d.
//   c, rs_c, cs_c Source matrix and its row/column strides.
//   p, rs_p, cs_p Destination buffer; rs_p or cs_p (depending on schema) is
//                 used as the stride between consecutive panel columns, and
//                 unit stride is used within a panel column.
//   pd_p          Micropanel dimension (the maximum panel_dim).
//   ps_p          Stride, in elements, between consecutive micropanels in P.
//   cntx          Not referenced by this variant.
//   thread        thrinfo_t node used to decide which micropanels this thread
//                 packs.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTECH2(bao_,ch,varname) \
     ( \
       trans_t             transc, \
       pack_t              schema, \
       dim_t               m, \
       dim_t               n, \
       dim_t               m_max, \
       dim_t               n_max, \
       ctype*     restrict kappa, \
       ctype*     restrict d, inc_t incd, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       ctype*     restrict p, inc_t rs_p, inc_t cs_p, \
                           dim_t pd_p, inc_t ps_p, \
       cntx_t*    restrict cntx, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	ctype* restrict kappa_cast = kappa; \
	ctype* restrict c_cast     = c; \
	ctype* restrict p_cast     = p; \
\
	dim_t           iter_dim; \
	dim_t           n_iter; \
	dim_t           it, ic; \
	dim_t           ic0; \
	doff_t          ic_inc; \
	dim_t           panel_len; \
	dim_t           panel_len_max; \
	dim_t           panel_dim; \
	dim_t           panel_dim_max; \
	inc_t           incc; \
	inc_t           ldc; \
	inc_t           ldp; \
	conj_t          conjc; \
\
	/* Extract the conjugation bit from the transposition argument. */ \
	conjc = bli_extract_conj( transc ); \
\
	/* Create flags to indicate row or column storage. Note that the
	   schema bit that encodes row or column is describing the form of
	   micro-panel, not the storage in the micro-panel. Hence the
	   mismatch in "row" and "column" semantics. */ \
	bool row_stored = bli_is_col_packed( schema ); \
	/*bool col_stored = bli_is_row_packed( schema );*/ \
\
	/* If the row storage flag indicates row storage, then we are packing
	   to column panels; otherwise we are packing to row panels. */ \
	if ( row_stored ) \
	{ \
		/* Prepare to pack to row-stored column panels. */ \
		iter_dim      = n; \
		panel_len     = m; \
		panel_len_max = m_max; \
		panel_dim_max = pd_p; \
		incc          = cs_c; \
		ldc           = rs_c; \
		ldp           = rs_p; \
	} \
	else /* if ( col_stored ) */ \
	{ \
		/* Prepare to pack to column-stored row panels. */ \
		iter_dim      = m; \
		panel_len     = n; \
		panel_len_max = n_max; \
		panel_dim_max = pd_p; \
		incc          = rs_c; \
		ldc           = cs_c; \
		ldp           = cs_p; \
	} \
\
	/* Compute the total number of iterations we'll need (ceiling of
	   iter_dim / panel_dim_max). */ \
	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
	/* Micropanels are always visited in forward order, starting at offset
	   zero and advancing by one full panel dimension per iteration. */ \
	ic0    = 0; \
	ic_inc = panel_dim_max; \
\
	ctype* restrict p_begin = p_cast; \
\
	/* Query the number of threads and thread ids from the current thread's
	   packm thrinfo_t node. */ \
	const dim_t nt  = bli_thread_n_way( thread ); \
	const dim_t tid = bli_thread_work_id( thread ); \
\
	/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
	( void )nt; \
	( void )tid; \
\
	dim_t it_start, it_end, it_inc; \
\
	/* Determine the thread range and increment using the current thread's
	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
	   will depend on whether slab or round-robin partitioning was requested
	   at configure-time. */ \
	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
	/* Iterate over every logical micropanel in the source matrix. */ \
	for ( ic  = ic0,    it  = 0; it < n_iter; \
	      ic += ic_inc, it += 1 ) \
	{ \
		panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \
\
		ctype* restrict c_begin = c_cast + (ic  )*incc; \
\
		ctype* restrict c_use = c_begin; \
		ctype* restrict p_use = p_begin; \
\
		/* The definition of bli_packm_my_iter() will depend on whether slab
		   or round-robin partitioning was requested at configure-time. (The
		   default is slab.) */ \
		if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
		{ \
			/* NOTE: We assume here that kappa = 1 and therefore ignore it. If
			   we're wrong, this will get someone's attention. */ \
			if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) \
				bli_abort(); \
\
			if ( d == NULL ) \
			{ \
				/* No scaling vector: plain (possibly conjugated) copy. */ \
				if ( bli_is_conj( conjc ) ) \
				{ \
					for ( dim_t l = 0; l < panel_len; ++l ) \
					{ \
						for ( dim_t i = 0; i < panel_dim; ++i ) \
						{ \
							ctype* cli = c_use + (l  )*ldc + (i  )*incc; \
							ctype* pli = p_use + (l  )*ldp + (i  )*1; \
\
							PASTEMAC(ch,copyjs)( *cli, *pli ); \
						} \
					} \
				} \
				else \
				{ \
					for ( dim_t l = 0; l < panel_len; ++l ) \
					{ \
						for ( dim_t i = 0; i < panel_dim; ++i ) \
						{ \
							ctype* cli = c_use + (l  )*ldc + (i  )*incc; \
							ctype* pli = p_use + (l  )*ldp + (i  )*1; \
\
							PASTEMAC(ch,copys)( *cli, *pli ); \
						} \
					} \
				} \
			} \
			else /* if ( d != NULL ) */ \
			{ \
				/* Scale each element by the entry of d that corresponds to
				   its position along the panel length, exactly as
				   bao_?packm_cxk() does. */ \
				if ( bli_is_conj( conjc ) ) \
				{ \
					for ( dim_t l = 0; l < panel_len; ++l ) \
					{ \
						for ( dim_t i = 0; i < panel_dim; ++i ) \
						{ \
							ctype* cli = c_use + (l  )*ldc + (i  )*incc; \
							ctype* dl  = d     + (l  )*incd; \
							ctype* pli = p_use + (l  )*ldp + (i  )*1; \
\
							/* Note that cli must be the second operand here
							   since that is what is conjugated by scal2js. */ \
							PASTEMAC(ch,scal2js)( *dl, *cli, *pli ); \
						} \
					} \
				} \
				else \
				{ \
					for ( dim_t l = 0; l < panel_len; ++l ) \
					{ \
						for ( dim_t i = 0; i < panel_dim; ++i ) \
						{ \
							ctype* cli = c_use + (l  )*ldc + (i  )*incc; \
							ctype* dl  = d     + (l  )*incd; \
							ctype* pli = p_use + (l  )*ldp + (i  )*1; \
\
							PASTEMAC(ch,scal2s)( *cli, *dl, *pli ); \
						} \
					} \
				} \
			} \
\
			/* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
			if ( panel_dim < panel_dim_max ) \
			{ \
				const dim_t     i      = panel_dim; \
				const dim_t     m_edge = panel_dim_max - panel_dim; \
				const dim_t     n_edge = panel_len_max; \
				ctype* restrict p_edge = p_use + (i  )*1; \
\
				PASTEMAC(ch,set0s_mxn) \
				( \
				  m_edge, \
				  n_edge, \
				  p_edge, 1, ldp  \
				); \
			} \
\
			/* If panel_len < panel_len_max, then we zero those unused columns. */ \
			if ( panel_len < panel_len_max ) \
			{ \
				const dim_t     j      = panel_len; \
				const dim_t     m_edge = panel_dim_max; \
				const dim_t     n_edge = panel_len_max - panel_len; \
				ctype* restrict p_edge = p_use + (j  )*ldp; \
\
				PASTEMAC(ch,set0s_mxn) \
				( \
				  m_edge, \
				  n_edge, \
				  p_edge, 1, ldp  \
				); \
			} \
		} \
\
/*
		if ( !row_stored ) \
		PASTEMAC(ch,fprintm)( stdout, "packm_var2: a packed", panel_dim_max, panel_len_max, \
		                      p_use, rs_p, cs_p, "%5.2f", "" ); \
		else \
		PASTEMAC(ch,fprintm)( stdout, "packm_var2: b packed", panel_len_max, panel_dim_max, \
		                      p_use, rs_p, cs_p, "%5.2f", "" ); \
*/ \
\
		/* Advance to the next micropanel in P whether or not this thread
		   packed the current one, so that all threads agree on addresses. */ \
		p_begin += ps_p; \
	} \
}
//INSERT_GENTFUNC_BASIC0( packm_var2 )
GENTFUNC( float, s, packm_var2 )
GENTFUNC( double, d, packm_var2 )
GENTFUNC( scomplex, c, packm_var2 )
GENTFUNC( dcomplex, z, packm_var2 )

199
addon/gemmd/bao_packm_cxk.c Normal file
View File

@@ -0,0 +1,199 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// bao_?packm_cxk(): pack a single micropanel.
//
// Copies a panel_dim x panel_len block of A (strides inca, lda) into P, where
// consecutive panel columns are ldp elements apart and elements within a
// column are contiguous. If d is non-NULL, the column at offset l is scaled by
// d[ l*incd ] as it is copied. If conja requests conjugation, the elements of
// A are conjugated. Afterwards the rows from panel_dim up to panel_dim_max and
// the columns from panel_len up to panel_len_max are set to zero.
//
// kappa must be equal to one; otherwise bli_abort() is called.
//
// A packm kernel is looked up in the context, but the branch that would call
// it is compiled out via "&& FALSE", so the reference loops below always run.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       conj_t  conja, \
       pack_t  schema, \
       dim_t   panel_dim, \
       dim_t   panel_dim_max, \
       dim_t   panel_len, \
       dim_t   panel_len_max, \
       ctype*  kappa, \
       ctype*  d, inc_t incd, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  p,             inc_t ldp, \
       cntx_t* cntx  \
     ) \
{ \
	/* The kernel id comes from panel_dim_max rather than panel_dim, so the
	   same kernel id is used for full-size and edge-case micropanels. */ \
	num_t   dt     = PASTEMAC(ch,type); \
	l1mkr_t ker_id = panel_dim_max; \
\
	/* Look up the packm kernel registered for this datatype and kernel id.
	   Per the original note, an invalid id yields NULL. */ \
	PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
\
	/* NOTE: Context-provided packm kernels are disabled in this
	   implementation. To re-enable them, change FALSE to TRUE below. */ \
	if ( f != NULL && FALSE ) \
	{ \
		f \
		( \
		  conja, \
		  schema, \
		  panel_dim, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  a, inca, lda, \
		  p,       ldp, \
		  cntx  \
		); \
		return; \
	} \
\
	/* Reference path. kappa is assumed to be one and is otherwise ignored;
	   abort loudly if that assumption is violated. */ \
	if ( !PASTEMAC(ch,eq1)( *kappa ) ) \
		bli_abort(); \
\
	const bool has_diag = ( d != NULL ); \
	const bool conj_a   = bli_is_conj( conja ); \
\
	if ( !has_diag && conj_a ) \
	{ \
		/* Conjugated copy, no scaling. */ \
		for ( dim_t col = 0; col < panel_len; ++col ) \
		for ( dim_t row = 0; row < panel_dim; ++row ) \
		{ \
			ctype* src = a + (col  )*lda + (row  )*inca; \
			ctype* dst = p + (col  )*ldp + (row  )*1; \
\
			PASTEMAC(ch,copyjs)( *src, *dst ); \
		} \
	} \
	else if ( !has_diag ) \
	{ \
		/* Plain copy, no scaling. */ \
		for ( dim_t col = 0; col < panel_len; ++col ) \
		for ( dim_t row = 0; row < panel_dim; ++row ) \
		{ \
			ctype* src = a + (col  )*lda + (row  )*inca; \
			ctype* dst = p + (col  )*ldp + (row  )*1; \
\
			PASTEMAC(ch,copys)( *src, *dst ); \
		} \
	} \
	else if ( conj_a ) \
	{ \
		/* Conjugated copy scaled by d. The element of A must be the second
		   operand, since that is the one scal2js conjugates. */ \
		for ( dim_t col = 0; col < panel_len; ++col ) \
		for ( dim_t row = 0; row < panel_dim; ++row ) \
		{ \
			ctype* src   = a + (col  )*lda + (row  )*inca; \
			ctype* scale = d + (col  )*incd; \
			ctype* dst   = p + (col  )*ldp + (row  )*1; \
\
			PASTEMAC(ch,scal2js)( *scale, *src, *dst ); \
		} \
	} \
	else \
	{ \
		/* Plain copy scaled by d. */ \
		for ( dim_t col = 0; col < panel_len; ++col ) \
		for ( dim_t row = 0; row < panel_dim; ++row ) \
		{ \
			ctype* src   = a + (col  )*lda + (row  )*inca; \
			ctype* scale = d + (col  )*incd; \
			ctype* dst   = p + (col  )*ldp + (row  )*1; \
\
			PASTEMAC(ch,scal2s)( *src, *scale, *dst ); \
		} \
	} \
\
	/* Zero the unused rows at the bottom of the micropanel, if any. */ \
	if ( panel_dim < panel_dim_max ) \
	{ \
		const dim_t     pad_rows = panel_dim_max - panel_dim; \
		const dim_t     pad_cols = panel_len_max; \
		ctype* restrict pad_ptr  = p + (panel_dim  )*1; \
\
		PASTEMAC(ch,set0s_mxn) \
		( \
		  pad_rows, \
		  pad_cols, \
		  pad_ptr, 1, ldp  \
		); \
	} \
\
	/* Zero the unused columns at the end of the micropanel, if any. */ \
	if ( panel_len < panel_len_max ) \
	{ \
		const dim_t     pad_rows = panel_dim_max; \
		const dim_t     pad_cols = panel_len_max - panel_len; \
		ctype* restrict pad_ptr  = p + (panel_len  )*ldp; \
\
		PASTEMAC(ch,set0s_mxn) \
		( \
		  pad_rows, \
		  pad_cols, \
		  pad_ptr, 1, ldp  \
		); \
	} \
}
//INSERT_GENTFUNC_BASIC0( packm_cxk )
GENTFUNC( float, s, packm_cxk )
GENTFUNC( double, d, packm_cxk )
GENTFUNC( scomplex, c, packm_cxk )
GENTFUNC( dcomplex, z, packm_cxk )

View File

@@ -0,0 +1,59 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototypes for the typed micropanel packing functions bao_?packm_cxk(),
// generated once per datatype (s, d, c, z) by the GENTPROT lines below.
//
//   conja          Whether elements of a are conjugated while being packed.
//   schema         Pack schema.
//   panel_dim      Number of rows of the micropanel actually copied.
//   panel_dim_max  Padded micropanel dimension; rows beyond panel_dim are
//                  zero-filled.
//   panel_len      Number of columns of the micropanel actually copied.
//   panel_len_max  Padded micropanel length; columns beyond panel_len are
//                  zero-filled.
//   kappa          Scalar that the definition requires to be equal to one.
//   d, incd        Optional scaling vector and its stride (d may be NULL).
//   a, inca, lda   Source panel and its strides.
//   p, ldp         Destination micropanel and its leading dimension.
//   cntx           Context that is queried for a packm kernel.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTECH2(bao_,ch,varname) \
( \
conj_t conja, \
pack_t schema, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* kappa, \
ctype* d, inc_t incd, \
ctype* a, inc_t inca, inc_t lda, \
ctype* p, inc_t ldp, \
cntx_t* cntx \
);
//INSERT_GENTPROT_BASIC0( packm_cxk )
GENTPROT( float, s, packm_cxk )
GENTPROT( double, d, packm_cxk )
GENTPROT( scomplex, c, packm_cxk )
GENTPROT( dcomplex, z, packm_cxk )

54
addon/gemmd/gemmd.h Normal file
View File

@@ -0,0 +1,54 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef GEMMD_H
#define GEMMD_H
// This header should contain (or #include) any definitions that must be
// folded into blis.h.
#include "bao_gemmd.h"
#include "bao_gemmd_check.h"
#include "bao_gemmd_var.h"
#include "bao_l3_packm_a.h"
#include "bao_l3_packm_b.h"
#include "bao_l3_packm_var.h"
#include "bao_packm_cxk.h"
#include "bao_l3_decor.h"
#endif

View File

@@ -0,0 +1,75 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SBX_L3_DECOR_H
#define BLIS_SBX_L3_DECOR_H
// -- sup definitions ----------------------------------------------------------
// Function-pointer type for the per-thread worker that
// bao_l3_thread_decorator() runs. The decorator calls it once in every
// thread, forwarding the operand objects (alpha, a, d, b, beta, c) and the
// context unchanged, and supplying a thread-local copy of the rntm_t plus the
// root thrinfo_t node it created for that thread.
// NOTE(review): the "sbx" in this type name (and in this header's include
// guard) looks inherited from sandbox code -- confirm it cannot clash with a
// sandbox that declares the same identifiers when both are enabled.
typedef void (*l3sbxint_t)
(
obj_t* alpha,
obj_t* a,
obj_t* d,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Thread decorator prototype for the gemmd addon. One definition is provided
// per threading backend (see the bao_l3_decor_*.c files).
//
//   func    Worker invoked in each thread.
//   family  Operation family id; accepted for interface uniformity.
//   alpha, a, d, b, beta, c
//           Operand objects passed through to func unchanged.
//   cntx    Context passed through to func.
//   rntm    Runtime object; it supplies the thread count, and each thread
//           receives its own local copy of it.
void bao_l3_thread_decorator
(
l3sbxint_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* d,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);
// Include definitions specific to the method of multithreading.
#include "bao_l3_decor_single.h"
#include "bao_l3_decor_openmp.h"
#include "bao_l3_decor_pthreads.h"
#endif

View File

@@ -0,0 +1,140 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_OPENMP
// Stub thread entry function. The real implementation lives in the pthreads
// version; this do-nothing definition exists so that builds without pthreads
// (e.g. Windows DLLs built with OpenMP or with no multithreading) do not end
// up with an unresolved symbol. The argument is ignored.
void* bao_l3_thread_entry( void* data_void )
{
	( void )data_void;

	return NULL;
}
//#define PRINT_THRINFO
// OpenMP implementation of the gemmd thread decorator. It opens a parallel
// region with the number of threads recorded in rntm and, in each thread,
// builds a thread-local rntm_t and a root thrinfo_t, calls func with the
// operands, and then frees the thrinfo_t. The family argument is not used by
// this implementation.
void bao_l3_thread_decorator
(
l3sbxint_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* d,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
)
{
// Query the total number of threads from the rntm_t object.
const dim_t n_threads = bli_rntm_num_threads( rntm );
// NOTE: The sba was initialized in bli_init().
// Check out an array_t from the small block allocator. This is done
// with an internal lock to ensure only one application thread accesses
// the sba at a time. bli_sba_checkout_array() will also automatically
// resize the array_t, if necessary.
array_t* restrict array = bli_sba_checkout_array( n_threads );
// Access the pool_t* for thread 0 and embed it into the rntm. We do
// this up-front only so that we have the rntm_t.sba_pool field
// initialized and ready for the global communicator creation below.
bli_sba_rntm_set_pool( 0, array, rntm );
// Set the packing block allocator field of the rntm. This will be
// inherited by all of the child threads when they make local copies of
// the rntm below.
bli_pba_rntm_set_pba( rntm );
// Allocate a global communicator for the root thrinfo_t structures.
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
// Everything inside the following block is executed by each of the
// n_threads OpenMP threads.
_Pragma( "omp parallel num_threads(n_threads)" )
{
// Create a thread-local copy of the master thread's rntm_t. This is
// necessary since we want each thread to be able to track its own
// small block pool_t as it executes down the function stack.
rntm_t rntm_l = *rntm;
rntm_t* restrict rntm_p = &rntm_l;
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mismatch issue.
// NOTE: This calls the same function used for the conventional/large
// code path.
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
bli_sba_rntm_set_pool( tid, array, rntm_p );
thrinfo_t* thread = NULL;
// Create the root node of the thread's thrinfo_t structure.
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
// Run the worker with the caller's operands, this thread's private rntm_t
// copy, and this thread's root thrinfo_t node.
func
(
alpha,
a,
d,
b,
beta,
c,
cntx,
rntm_p,
thread
);
// Free the current thread's thrinfo_t structure.
bli_l3_sup_thrinfo_free( rntm_p, thread );
}
// We shouldn't free the global communicator here: per the original note it
// has already been freed by the global communicator's chief thread when the
// thrinfo_t structures were released inside the parallel region.
// NOTE(review): the original comment named bli_l3_thrinfo_free(), while the
// code above calls bli_l3_sup_thrinfo_free() -- confirm which one releases
// gl_comm.
// Check the array_t back into the small block allocator. Similar to the
// check-out, this is done using a lock embedded within the sba to ensure
// mutual exclusion.
bli_sba_checkin_array( array );
}
#endif

View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SBX_L3_DECOR_OPENMP_H
#define BLIS_SBX_L3_DECOR_OPENMP_H
// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP
#endif
#endif

View File

@@ -0,0 +1,220 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_PTHREADS
// A data structure to assist in passing operands to additional threads. One
// instance is filled in per thread by bao_l3_thread_decorator() and handed to
// bao_l3_thread_entry() as its void* argument.
typedef struct thread_data
{
// Worker to run in the thread.
l3sbxint_t func;
// Operation family id (carried along; the entry function does not use it).
opid_t family;
// Operand objects forwarded to func unchanged.
obj_t* alpha;
obj_t* a;
obj_t* d;
obj_t* b;
obj_t* beta;
obj_t* c;
// Context forwarded to func.
cntx_t* cntx;
// The caller's rntm_t; each thread makes its own local copy of it.
rntm_t* rntm;
// Id of the thread this record belongs to.
dim_t tid;
// Global communicator shared by the root thrinfo_t nodes of all threads.
thrcomm_t* gl_comm;
// array_t checked out from the small block allocator; indexed by tid to
// obtain the thread's pool.
array_t* array;
} thread_data_t;
// Entry point function for additional threads.
void* bao_l3_thread_entry( void* data_void )
{
thread_data_t* data = data_void;
l3sbxint_t func = data->func;
opid_t family = data->family;
obj_t* alpha = data->alpha;
obj_t* a = data->a;
obj_t* d = data->d;
obj_t* b = data->b;
obj_t* beta = data->beta;
obj_t* c = data->c;
cntx_t* cntx = data->cntx;
rntm_t* rntm = data->rntm;
dim_t tid = data->tid;
array_t* array = data->array;
thrcomm_t* gl_comm = data->gl_comm;
( void )family;
// Create a thread-local copy of the master thread's rntm_t. This is
// necessary since we want each thread to be able to track its own
// small block pool_t as it executes down the function stack.
rntm_t rntm_l = *rntm;
rntm_t* restrict rntm_p = &rntm_l;
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
bli_sba_rntm_set_pool( tid, array, rntm_p );
thrinfo_t* thread = NULL;
// Create the root node of the current thread's thrinfo_t structure.
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
func
(
alpha,
a,
d,
b,
beta,
c,
cntx,
rntm_p,
thread
);
// Free the current thread's thrinfo_t structure.
bli_l3_sup_thrinfo_free( rntm_p, thread );
return NULL;
}
// POSIX-threads implementation of the gemmd thread decorator. It fills in one
// thread_data_t record per thread, spawns threads n_threads-1 down to 1 with
// bao_l3_thread_entry() as their entry point, runs the entry function for
// thread 0 on the calling thread, and then joins the spawned threads before
// releasing its resources.
void bao_l3_thread_decorator
(
l3sbxint_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* d,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
)
{
// NOTE(review): r_val is written by the bli_malloc_intl() calls below but is
// never inspected in this function -- confirm that allocation failures are
// handled inside bli_malloc_intl().
err_t r_val;
// Query the total number of threads from the rntm_t object.
const dim_t n_threads = bli_rntm_num_threads( rntm );
// NOTE: The sba was initialized in bli_init().
// Check out an array_t from the small block allocator. This is done
// with an internal lock to ensure only one application thread accesses
// the sba at a time. bli_sba_checkout_array() will also automatically
// resize the array_t, if necessary.
array_t* restrict array = bli_sba_checkout_array( n_threads );
// Access the pool_t* for thread 0 and embed it into the rntm. We do
// this up-front only so that we have the rntm_t.sba_pool field
// initialized and ready for the global communicator creation below.
bli_sba_rntm_set_pool( 0, array, rntm );
// Set the packing block allocator field of the rntm. This will be
// inherited by all of the child threads when they make local copies of
// the rntm below.
bli_pba_rntm_set_pba( rntm );
// Allocate a global communicator for the root thrinfo_t structures.
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
// Allocate an array of pthread objects and auxiliary data structs to pass
// to the thread entry functions.
// NOTE(review): the memory-tracing labels in this function name
// bli_l3_thread_decorator() rather than bao_l3_thread_decorator() --
// presumably inherited from the framework code; confirm whether intended.
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_l3_thread_decorator().pth: " );
#endif
bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val );
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_l3_thread_decorator().pth: " );
#endif
thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val );
// NOTE: We must iterate backwards so that the chief thread (thread id 0)
// can spawn all other threads before proceeding with its own computation.
for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
{
// Set up the thread data record for this thread id (including thread 0,
// whose record is passed to the entry function directly below).
datas[tid].func = func;
datas[tid].family = family;
datas[tid].alpha = alpha;
datas[tid].a = a;
datas[tid].d = d;
datas[tid].b = b;
datas[tid].beta = beta;
datas[tid].c = c;
datas[tid].cntx = cntx;
datas[tid].rntm = rntm;
datas[tid].tid = tid;
datas[tid].gl_comm = gl_comm;
datas[tid].array = array;
// Spawn additional threads for ids greater than 0; thread 0's work is
// executed by the calling thread itself.
if ( tid != 0 )
bli_pthread_create( &pthreads[tid], NULL, &bao_l3_thread_entry, &datas[tid] );
else
bao_l3_thread_entry( ( void* )(&datas[0]) );
}
// We shouldn't free the global communicator here: per the original note it
// has already been freed by the global communicator's chief thread when the
// thrinfo_t structures were released in the thread entry function.
// NOTE(review): the original comment named bli_l3_thrinfo_free(), while the
// entry function calls bli_l3_sup_thrinfo_free() -- confirm which one
// releases gl_comm.
// Thread 0 waits for additional threads to finish.
for ( dim_t tid = 1; tid < n_threads; tid++ )
{
bli_pthread_join( pthreads[tid], NULL );
}
// Check the array_t back into the small block allocator. Similar to the
// check-out, this is done using a lock embedded within the sba to ensure
// mutual exclusion.
bli_sba_checkin_array( array );
// Release the per-thread bookkeeping arrays allocated above.
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_l3_thread_decorator().pth: " );
#endif
bli_free_intl( pthreads );
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_l3_thread_decorator().pth: " );
#endif
bli_free_intl( datas );
}
#endif

View File

@@ -0,0 +1,47 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Include guard.
// NOTE(review): the guard carries an SBX prefix even though this header
// declares a bao_-prefixed (addon) symbol -- confirm it cannot collide with
// the guard of a sandbox header that is included in the same build.
#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H
#define BLIS_SBX_L3_DECOR_PTHREADS_H
// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS
// Thread entry point prototype. The pthreads decorator passes this function
// to bli_pthread_create() for every thread id other than 0 and calls it
// directly for thread 0; data_void points to that thread's entry in the
// decorator's per-thread data array.
void* bao_l3_thread_entry( void* data_void );
#endif
#endif

View File

@@ -0,0 +1,143 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifndef BLIS_ENABLE_MULTITHREADING
#define SKIP_THRINFO_TREE
// Single-threaded thread decorator for the addon's level-3 operation.
//
// Checks an array_t out of the small block allocator (sba), embeds the pool
// for thread 0 and the packing block allocator into the rntm, invokes func
// exactly once with a single-threaded thrinfo_t, and finally checks the
// array_t back in.
//
// Parameters:
// - func:   the function to invoke; it receives alpha, a, d, b, beta, c,
//           cntx, the rntm, and the thrinfo_t selected below.
// - family: operation family id. It is not referenced in this
//           single-threaded implementation.
// - alpha, a, d, b, beta, c: operands forwarded unchanged to func.
// - cntx:   context forwarded unchanged to func.
// - rntm:   runtime object; modified in place (sba pool and pba fields are
//           set) before being forwarded to func.
void bao_l3_thread_decorator
(
l3sbxint_t func,
opid_t family,
//pack_t schema_a,
//pack_t schema_b,
obj_t* alpha,
obj_t* a,
obj_t* d,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
)
{
// For sequential execution, we use only one thread.
const dim_t n_threads = 1;
// NOTE: The sba was initialized in bli_init().
// Check out an array_t from the small block allocator. This is done
// with an internal lock to ensure only one application thread accesses
// the sba at a time. bli_sba_checkout_array() will also automatically
// resize the array_t, if necessary.
array_t* restrict array = bli_sba_checkout_array( n_threads );
// Access the pool_t* for thread 0 and embed it into the rntm.
bli_sba_rntm_set_pool( 0, array, rntm );
// Set the packing block allocator field of the rntm.
bli_pba_rntm_set_pba( rntm );
#ifndef SKIP_THRINFO_TREE
// Allocate a global communicator for the root thrinfo_t structures.
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
#endif
{
// NOTE: We don't need to create another copy of the rntm_t since
// it was already copied in one of the high-level oapi functions.
rntm_t* restrict rntm_p = rntm;
// There is only one thread id (for the chief thread).
const dim_t tid = 0;
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
// NOTE: This is commented out because, in the single-threaded case,
// this is redundant since it's already been done above.
//bli_sba_rntm_set_pool( tid, array, rntm_p );
#ifndef SKIP_THRINFO_TREE
thrinfo_t* thread = NULL;
// Create the root node of the thread's thrinfo_t structure.
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
#else
// This optimization allows us to use one of the global thrinfo_t
// objects for single-threaded execution rather than grow one from
// scratch. The key is that bli_thrinfo_sup_grow(), which is called
// from within the variants, will immediately return if it detects
// that the thrinfo_t* passed into it is either
// &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED.
thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED;
// tid is only consumed by the thrinfo-tree branch above; reference it here
// so that it is not reported as unused.
( void )tid;
#endif
// Invoke the operation once, on the calling thread.
func
(
alpha,
a,
d,
b,
beta,
c,
cntx,
rntm_p,
thread
);
#ifndef SKIP_THRINFO_TREE
// Free the current thread's thrinfo_t structure.
bli_l3_sup_thrinfo_free( rntm_p, thread );
#endif
}
// We shouldn't free the global communicator since it was already freed
// by the global communicator's chief thread in bli_l3_thrinfo_free()
// (called above).
// NOTE(review): the comment above only describes the thrinfo-tree branch;
// when SKIP_THRINFO_TREE is defined, no global communicator is created in
// this function, so there is nothing to free.
// Check the array_t back into the small block allocator. Similar to the
// check-out, this is done using a lock embedded within the sba to ensure
// mutual exclusion.
bli_sba_checkin_array( array );
}
#endif

View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Include guard.
// NOTE(review): the guard carries an SBX prefix although this header belongs
// to the addon -- confirm it cannot collide with a sandbox header's guard.
#ifndef BLIS_SBX_L3_DECOR_SINGLE_H
#define BLIS_SBX_L3_DECOR_SINGLE_H
// Definitions specific to situations when multithreading is disabled.
// (Intentionally empty: no declarations are currently needed for the
// single-threaded case.)
#ifndef BLIS_ENABLE_MULTITHREADING
#endif
#endif

47
build/bli_addon.h.in Normal file
View File

@@ -0,0 +1,47 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Template for bli_addon.h. configure instantiates this file by textually
// substituting the two at-sign-delimited placeholders below. The
// substitution is global, so the placeholders must not be spelled out
// anywhere else in this file, including in comments.
#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H
// The placeholder in the following #if becomes 1 when configure was asked to
// enable at least one addon, and 0 otherwise.
#if @enable_addons@
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif
// Enabled addons
// The placeholder on the next line becomes one #include directive per addon
// in configure's addon list; each directive names a header called
// <addon>.h.
@addon_list_includes@
#endif

View File

@@ -183,6 +183,10 @@ MK_ENABLE_CBLAS := @enable_cblas@
# Whether libblis will depend on libmemkind for certain memory allocations.
MK_ENABLE_MEMKIND := @enable_memkind@
# The names of the addons to include when building BLIS. If empty, no addons
# will be included.
ADDON_LIST := @addon_list@
# The name of a sandbox defining an alternative gemm implementation. If empty,
# no sandbox will be used and the conventional gemm implementation will remain
# enabled.

126
common.mk
View File

@@ -161,18 +161,35 @@ get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
# When compiling sandboxes, we use flags similar to those of general framework
# source. This ensures that the same code can be linked and run across various
# sub-configurations. (If we switch to using refkern/kernel flags, we should
# prevent enabling sandboxes for umbrella families by verifying that
# config_list == config_name if --enable-sandbox is given.)
# sub-configurations.
# Return the flags used to compile C99 addon source for the sub-configuration
# named in $(1): that configuration's COPTFLAGS, its non-optimization C flags,
# the addon header search paths (CADDONINCFLAGS), and the common
# BUILD_CPPFLAGS and BUILD_SYMFLAGS. The result is whitespace-normalized
# via strip.
get-addon-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(CADDONINCFLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
# Return the flags used to compile C++ addon source for the sub-configuration
# named in $(1). This mirrors the C99 variant except that the
# non-optimization flags come from get-noopt-cxxflags-for; the optimization
# flags are still read from COPTFLAGS.
get-addon-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cxxflags-for,$(1)) \
$(CADDONINCFLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
# When compiling sandboxes, we use flags similar to those of general framework
# source. This ensures that the same code can be linked and run across various
# sub-configurations. (NOTE: If we ever switch to using refkernel or kernel
# flags, we should prevent enabling sandboxes for umbrella families by verifying
# that config_list == config_name if --enable-sandbox is given. THIS ALSO
# APPLIES TO ADDONS ABOVE.)
get-sandbox-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(CSBOXINCFLAGS) \
$(CSANDINCFLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
get-sandbox-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
$(call get-noopt-cxxflags-for,$(1)) \
$(CSBOXINCFLAGS) \
$(CSANDINCFLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)
@@ -198,6 +215,8 @@ get-config-text-for = "('$(1)' CFLAGS for config code)"
get-frame-text-for = "('$(1)' CFLAGS for framework code)"
get-aocldtl-text-for = "('$(1)' CFLAGS for AOCL debug and trace code)"
get-kernel-text-for = "('$(1)' CFLAGS for kernels)"
get-addon-c99text-for = "('$(1)' CFLAGS for addons)"
get-addon-cxxtext-for = "('$(1)' CXXFLAGS for addons)"
get-sandbox-c99text-for = "('$(1)' CFLAGS for sandboxes)"
get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)"
@@ -212,6 +231,10 @@ get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)"
files-that-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),$(f),)))
files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),,$(f))))
# Define a function that removes duplicate words from the list in $(1)
# *without* using the sort function, so that the relative order of the first
# occurrences is preserved. It emits the first word, filters every copy of
# that word out of the list, and recurses on what remains; the recursion
# ends when the list is empty.
rm-dups = $(if $1,$(firstword $1) $(call rm-dups,$(filter-out $(firstword $1),$1)))
#
# --- Include makefile configuration file --------------------------------------
@@ -297,6 +320,7 @@ FRAME_DIR := frame
AOCLDTL_DIR := aocl_dtl
REFKERN_DIR := ref_kernels
KERNELS_DIR := kernels
ADDON_DIR := addon
SANDBOX_DIR := sandbox
OBJ_DIR := obj
LIB_DIR := lib
@@ -313,12 +337,13 @@ REFNM := ref
# Source suffixes.
CONFIG_SRC_SUFS := c
KERNELS_SRC_SUFS := c s S
FRAME_SRC_SUFS := c
AOCLDTL_SRC_SUFS := c
ADDON_C99_SUFS := c
ADDON_CXX_SUFS := cc cpp cxx
ADDON_SRC_SUFS := $(ADDON_C99_SUFS) $(ADDON_CXX_SUFS)
SANDBOX_C99_SUFS := c
SANDBOX_CXX_SUFS := cc cpp cxx
@@ -328,6 +353,9 @@ SANDBOX_SRC_SUFS := $(SANDBOX_C99_SUFS) $(SANDBOX_CXX_SUFS)
FRAME_HDR_SUFS := h
AOCLDTL_HDR_SUFS := h
ADDON_H99_SUFS := h
ADDON_HXX_SUFS := hh hpp hxx
ADDON_HDR_SUFS := $(ADDON_H99_SUFS) $(ADDON_HXX_SUFS)
SANDBOX_H99_SUFS := h
SANDBOX_HXX_SUFS := hh hpp hxx
@@ -335,10 +363,12 @@ SANDBOX_HDR_SUFS := $(SANDBOX_H99_SUFS) $(SANDBOX_HXX_SUFS)
# Combine all header suffixes and remove duplicates via sort().
ALL_HDR_SUFS := $(sort $(FRAME_HDR_SUFS) \
$(ADDON_HDR_SUFS) \
$(SANDBOX_HDR_SUFS) \
$(AOCLDTL_HDR_SUFS))
# Combine the C99-only header suffixes and remove duplicates via sort().
# Only the C99 subsets of the addon and sandbox suffix lists are used here
# (ADDON_H99_SUFS, SANDBOX_H99_SUFS) so that C++ header suffixes
# (hh, hpp, hxx) are not treated as C99 headers.
ALL_H99_SUFS := $(sort $(FRAME_HDR_SUFS) \
                       $(ADDON_H99_SUFS) \
                       $(SANDBOX_H99_SUFS) \
                       $(AOCLDTL_HDR_SUFS))
@@ -366,12 +396,14 @@ SHELL := bash
# Construct paths to the four primary directories of source code:
# the config directory, general framework code, reference kernel code,
# and optimized kernel code.
# and optimized kernel code. Also process paths for addon and sandbox
# directories.
CONFIG_PATH := $(DIST_PATH)/$(CONFIG_DIR)
FRAME_PATH := $(DIST_PATH)/$(FRAME_DIR)
AOCLDTL_PATH := $(DIST_PATH)/$(AOCLDTL_DIR)
REFKERN_PATH := $(DIST_PATH)/$(REFKERN_DIR)
KERNELS_PATH := $(DIST_PATH)/$(KERNELS_DIR)
ADDON_PATH := $(DIST_PATH)/$(ADDON_DIR)
SANDBOX_PATH := $(DIST_PATH)/$(SANDBOX_DIR)
# Construct paths to some optional C++ template headers contributed by AMD.
@@ -386,6 +418,7 @@ FRAME_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(FRAME_DIR)
AOCLDTL_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(AOCLDTL_DIR)
REFKERN_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(REFKERN_DIR)
KERNELS_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(KERNELS_DIR)
ADDON_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(ADDON_DIR)
SANDBOX_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(SANDBOX_DIR)
@@ -863,6 +896,7 @@ MK_KERNELS_SRC :=
MK_REFKERN_SRC :=
MK_FRAME_SRC :=
MK_AOCLDTL_SRC :=
MK_ADDON_SRC :=
MK_SANDBOX_SRC :=
# -- config --
@@ -914,6 +948,24 @@ PARENT_PATH := $(OBJ_DIR)/$(CONFIG_NAME)
-include $(addsuffix /$(FRAGMENT_MK), $(FRAME_FRAG_PATH))
-include $(addsuffix /$(FRAGMENT_MK), $(AOCLDTL_FRAG_PATH))
# -- addon --
# Construct paths to each addon.
# NOTE: If $(ADDON_LIST) is empty (because no addon was enabled at configure-
# time) then $(ADDON_PATHS) will also be empty, which will cause no fragments
# to be included.
ADDON_PATHS := $(addprefix $(ADDON_FRAG_PATH)/, $(ADDON_LIST))
# This variable is used by the include statements as they recursively include
# one another. For the 'addon' directory, we initialize it to that directory
# in preparation to include the fragments in each addon sub-directory.
PARENT_SRC_PATH := $(ADDON_PATH)
PARENT_PATH := $(ADDON_FRAG_PATH)
# Recursively include the makefile fragments in each of the addons sub-
# directories.
-include $(addsuffix /$(FRAGMENT_MK), $(ADDON_PATHS))
# -- sandbox --
# Construct paths to each sandbox. (At present, there can be only one.)
@@ -931,6 +983,8 @@ PARENT_PATH := $(SANDBOX_FRAG_PATH)
# Recursively include the makefile fragments in the sandbox sub-directory.
-include $(addsuffix /$(FRAGMENT_MK), $(SANDBOX_PATHS))
# -- post-processing --
# Create a list of the makefile fragments using the variable into which each
# of the above include statements accumulated their directory paths.
MAKEFILE_FRAGMENTS := $(addsuffix /$(FRAGMENT_MK), $(FRAGMENT_DIR_PATHS))
@@ -949,14 +1003,14 @@ endif
#
# Define a function that will expand all of the directory paths given in $(1)
# to actual filepaths using the list of suffixes provided $(2).
# to actual filepaths using the list of suffixes provided in $(2).
get-filepaths = $(strip $(foreach path, $(1), \
$(foreach suf, $(2), \
$(wildcard $(path)/*.$(suf)) \
) ) )
# Define a function that will expand all of the directory paths given in $(1)
# to actual filepaths using the list of suffixes provided $(2), taking only
# to actual filepaths using the list of suffixes provided in $(2), taking only
# the first expansion from each directory with at least one file matching
# the current suffix. Finally, strip the filenames from all resulting files,
# returning only the directory paths.
@@ -966,20 +1020,29 @@ get-dirpaths = $(dir $(foreach path, $(1), \
$(wildcard $(path)/*.$(suf)) \
) ) ) )
# We'll use two directory lists. The first is a list of all of the directories
# in which makefile fragments were generated (plus the current directory). The
# second is the subset of the first that begins with the sandbox root path.
# We'll use three directory lists. The first is a list of all of the directories
# in which makefile fragments were generated, plus the current directory. (The
# current directory is needed so we include bli_config.h and bli_addon.h in the
# processing of header files.) The second and third are subsets of the first
# that begin with the addon and sandbox root paths, respectively.
ALLFRAG_DIR_PATHS := . $(FRAGMENT_DIR_PATHS)
ADDON_DIR_PATHS := $(filter $(ADDON_PATH)/%,$(ALLFRAG_DIR_PATHS))
SANDBOX_DIR_PATHS := $(filter $(SANDBOX_PATH)/%,$(ALLFRAG_DIR_PATHS))
ALL_H99_FILES := $(call get-filepaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS))
FRAME_H99_FILES := $(filter-out $(SANDBOX_PATH)/%,$(ALL_H99_FILES))
FRAME_H99_FILES := $(filter-out $(ADDON_PATH)/%, \
$(filter-out $(SANDBOX_PATH)/%, \
$(ALL_H99_FILES) \
) )
ALL_H99_DIRPATHS := $(call get-dirpaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS))
ALL_H99_DIRPATHS := $(call get-dirpaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS))
SANDBOX_H99_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_H99_SUFS))
SANDBOX_HXX_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_HXX_SUFS))
ADDON_H99_FILES := $(call get-filepaths,$(ADDON_DIR_PATHS),$(ADDON_H99_SUFS))
ADDON_HXX_FILES := $(call get-filepaths,$(ADDON_DIR_PATHS),$(ADDON_HXX_SUFS))
ADDON_HDR_DIRPATHS := $(call get-dirpaths,$(ADDON_DIR_PATHS),$(ALL_HDR_SUFS))
SANDBOX_H99_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_H99_SUFS))
SANDBOX_HXX_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_HXX_SUFS))
SANDBOX_HDR_DIRPATHS := $(call get-dirpaths,$(SANDBOX_DIR_PATHS),$(ALL_HDR_SUFS))
@@ -1032,8 +1095,8 @@ CBLAS_H_FLAT := $(BASE_INC_PATH)/$(CBLAS_H)
#
# Obtain a list of header files #included inside of the bli_cntx_ref.c file.
# Paths to these files will be needed when compiling with the monolithic
# header.
# Due to the way that bli_cntx_ref.c uses headers and macros, paths to these
# files will be needed when compiling bli_cntx_ref.c with the monolithic header.
ifeq ($(strip $(SHARE_PATH)),.)
REF_KER_SRC := $(DIST_PATH)/$(REFKERN_DIR)/bli_cntx_ref.c
REF_KER_HEADERS := $(shell $(GREP) "\#include" $(REF_KER_SRC) | sed -e "s/\#include [\"<]\([a-zA-Z0-9\_\.\/\-]*\)[\">].*/\1/g" | $(GREP) -v $(BLIS_H))
@@ -1041,9 +1104,10 @@ endif
# Match each header found above with the path to that header, and then strip
# leading, trailing, and internal whitespace.
REF_KER_H_PATHS := $(strip $(foreach header, $(REF_KER_HEADERS), \
$(dir $(filter %/$(header), \
$(FRAME_H99_FILES)))))
REF_KER_H_PATHS := $(call rm-dups,$(strip \
$(foreach header, $(REF_KER_HEADERS), \
$(dir $(filter %/$(header), \
$(FRAME_H99_FILES))))))
# Add -I to each header path so we can specify our include search paths to the
# C compiler. Then add frame/include since it's needed for bli_oapi_w[o]_cntx.h.
@@ -1055,17 +1119,29 @@ REF_KER_I_PATHS += -I$(DIST_PATH)/frame/include
# now #include the monolithic/flattened blis.h instead.
CINCFLAGS := -I$(BASE_INC_PATH) $(REF_KER_I_PATHS)
# If CBLAS is enabled, we also include the path to the cblas.h directory so
# that the compiler will be able to find cblas.h as the CBLAS source code is
# being compiled.
ifeq ($(MK_ENABLE_CBLAS),yes)
CINCFLAGS += -I$(CBLAS_H_DIRPATH)
endif
# Obtain a list of header paths in the configured addons. Then add -I to each
# header path.
CADDONINCFLAGS := $(strip $(patsubst %, -I%, $(ADDON_HDR_DIRPATHS)))
# Obtain a list of header paths in the configured sandbox. Then add -I to each
# header path.
CSBOXINCFLAGS := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS)))
CSANDINCFLAGS := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS)))
#
# --- BLIS configuration header definitions ------------------------------------
#
# This file was created by configure, but we need to define it here so we can
# remove it as part of the clean targets.
# These files were created by configure, but we need to define them here so we
# can remove them as part of the clean targets.
BLIS_ADDON_H := ./bli_addon.h
BLIS_CONFIG_H := ./bli_config.h

151
configure vendored
View File

@@ -264,6 +264,15 @@ print_usage()
echo " \"small\" depends on thresholds that may vary by sub-"
echo " configuration."
echo " "
echo " -a NAME --enable-addon=NAME"
echo " "
echo " Enable the code provided by an addon. An addon consists"
echo " of a separate directory of code that provides additional"
echo " APIs, implementations, and/or operations that would"
echo " otherwise not be present within a build of BLIS. This"
echo " option may be used multiple times to specify the inclusion"
echo " of multiple addons. By default, no addons are enabled."
echo " "
echo " -s NAME --enable-sandbox=NAME"
echo " "
echo " Enable a separate sandbox implementation of gemm. This"
@@ -940,6 +949,18 @@ canonicalize_ws()
echo "${str}"
}
#
# rm_duplicate_words_simple STRING
#
# Echo the whitespace-separated words of STRING with duplicates removed,
# keeping the first occurrence of each word and preserving the original
# order. Every emitted word is followed by a single space (awk's default
# field separator), so non-empty output ends with a trailing space.
#
rm_duplicate_words_simple()
{
	local str res

	str="$1"

	# Remove duplicates, keeping the first occurrence. The awk array 'a'
	# counts how many times each word has been seen; a word is printed only
	# the first time it is encountered.
	res=$(echo "${str}" | awk '{for (i=1;i<=NF;i++) if (!a[$i]++) printf("%s%s",$i,FS)}{printf("\n")}')

	echo "${res}"
}
rm_duplicate_words()
{
local str revstr revres res
@@ -1915,6 +1936,13 @@ main()
bli_config_h_in_path="${build_dirpath}/${bli_config_h_in}"
bli_config_h_out_path="${cur_dirpath}/${bli_config_h_out}"
# The names/paths for the template bli_addon.h.in and its instantiated
# counterpart.
bli_addon_h_in='bli_addon.h.in'
bli_addon_h_out='bli_addon.h'
bli_addon_h_in_path="${build_dirpath}/${bli_addon_h_in}"
bli_addon_h_out_path="${cur_dirpath}/${bli_addon_h_out}"
# Path to 'mirror-tree.sh' script.
mirror_tree_sh="${build_dirpath}/mirror-tree.sh"
@@ -1941,6 +1969,9 @@ main()
# The root directory of the BLIS framework.
aocldtl_dir='aocl_dtl'
aocldtl_dirpath="${dist_path}/${aocldtl_dir}"
# The name of the addon directory.
addon_dir='addon'
addon_dirpath="${dist_path}/${addon_dir}"
# The name of the sandbox directory.
sandbox_dir='sandbox'
@@ -2049,6 +2080,10 @@ main()
force_version='no'
complex_return='default'
# The addon flag and names.
addon_flag=''
addon_list=''
# The sandbox flag and name.
sandbox_flag=''
sandbox=''
@@ -2093,7 +2128,7 @@ main()
# Process our command line options.
unset OPTIND
while getopts ":hp:d:e:s:t:r:qci:b:-:" opt; do
while getopts ":hp:d:e:a:s:t:r:qci:b:-:" opt; do
case $opt in
-)
case "$OPTARG" in
@@ -2194,12 +2229,21 @@ main()
disable-mem-tracing)
enable_mem_tracing='no'
;;
enable-addon=*)
addon_flag=1
addon_name=${OPTARG#*=}
# Append the addon name to the list.
addon_list="${addon_list} ${addon_name}"
;;
disable-addon)
	# Disable addons and forget any addon names collected from earlier
	# options. The list must be cleared as well as the flag because
	# addon_list is later substituted into config.mk and used to
	# generate the #include directives in bli_addon.h without
	# consulting addon_flag.
	addon_flag=''
	addon_list=''
	;;
enable-sandbox=*)
sandbox_flag=1
sandbox=${OPTARG#*=}
;;
disable-sandbox)
sandbox_flag=0
sandbox_flag=''
;;
int-size=*)
int_type_size=${OPTARG#*=}
@@ -2282,6 +2326,12 @@ main()
e)
export_shared=$OPTARG
;;
a)
addon_flag=1
addon_name=$OPTARG
# Append the addon name to the list.
addon_list="${addon_list} ${addon_name}"
;;
s)
sandbox_flag=1
sandbox=$OPTARG
@@ -3141,6 +3191,34 @@ main()
exit 1
fi
# Check if addons were given.
if [ -n "${addon_flag}" ]; then
# Remove duplicates in the addon list, if they exist.
addon_list=$(rm_duplicate_words_simple "${addon_list}")
echo "${script_name}: configuring with addons:"
for addon in ${addon_list}; do
echo "${script_name}: ${addon_dir}/${addon}"
addon_fullpath="${addon_dirpath}/${addon}"
if [ ! -d "${addon_fullpath}" ]; then
echo "${script_name}: requested addon sub-directory does not exist! Cannot continue."
echo "${script_name}: *** Please verify addon existence and name."
exit 1
fi
done
enable_addons_01=1
else
echo "${script_name}: configuring with no addons."
enable_addons_01=0
fi
# Check if a sandbox was given.
if [ -n "${sandbox_flag}" ]; then
@@ -3292,6 +3370,15 @@ main()
kernel_list_defines="${kernel_list_defines}#define ${kernel_define}\n"
done
# Create a list of #includes, one for each addon in addon_list.
addon_list_includes=""
for addon in ${addon_list}; do
# Create an #include directive and add it to the running list.
addon_header="\"${addon}.h\""
addon_list_includes="${addon_list_includes}#include ${addon_header}\n"
done
# -- Determine whether we are performing an out-of-tree build --------------
@@ -3319,7 +3406,7 @@ main()
fi
# -- Instantiate config.mk, bli_config.h files from templates --------------
# -- Instantiate config.mk file from template ------------------------------
# Begin substituting information into the config_mk_in file, outputting
# to config_mk_out.
@@ -3365,6 +3452,7 @@ main()
| sed -e "s/@enable_cblas@/${enable_cblas}/g" \
| sed -e "s/@enable_memkind@/${enable_memkind}/g" \
| sed -e "s/@pragma_omp_simd@/${pragma_omp_simd}/g" \
| sed -e "s/@addon_list@/${addon_list}/g" \
| sed -e "s/@sandbox@/${sandbox}/g" \
| sed -e "s/@enable_trsm_preinversion@/${enable_trsm_preinversion}/g" \
| sed -e "s/@enable_aocl_dynamic@/${enable_aocl_dynamic}/g" \
@@ -3373,6 +3461,7 @@ main()
| sed -e "s/\@enable_aocl_zen\@/${enable_aocl_zen}/g" \
> "${config_mk_out_path}"
# -- Instantiate bli_config.h file from template ---------------------------
# Begin substituting information into the bli_config_h_in file, outputting
# to bli_config_h_out. NOTE: We use perl instead of sed because the version
@@ -3409,6 +3498,17 @@ main()
| sed -e "s/@complex_return_intel@/${complex_return_intel01}/g" \
> "${bli_config_h_out_path}"
# -- Instantiate bli_addon.h file from template ----------------------------
# Begin substituting information into the bli_addon_h_in file, outputting
# to bli_addon_h_out. NOTE: We use perl instead of sed because the version
# of sed used on OS X is old and does not handle the '\n' character
# intuitively, which was used when constructing ${addon_list_includes}.
echo "${script_name}: creating ${bli_addon_h_out_path} from ${bli_addon_h_in_path}"
cat "${bli_addon_h_in_path}" \
| perl -pe "s/\@addon_list_includes\@/${addon_list_includes}/g" \
| sed -e "s/@enable_addons@/${enable_addons_01}/g" \
> "${bli_addon_h_out_path}"
# -- Create top-level object directories -----------------------------------
@@ -3421,7 +3521,6 @@ main()
obj_config_dirpath="${base_obj_dirpath}/${config_dir}"
#echo "${script_name}: creating ${obj_config_dirpath}"
mkdir -p ${obj_config_dirpath}
for conf in ${config_list}; do
echo "${script_name}: creating ${obj_config_dirpath}/${conf}"
@@ -3431,7 +3530,6 @@ main()
obj_kernels_dirpath="${base_obj_dirpath}/${kernels_dir}"
#echo "${script_name}: creating ${obj_kernels_dirpath}"
mkdir -p ${obj_kernels_dirpath}
for kern in ${kernel_list}; do
echo "${script_name}: creating ${obj_kernels_dirpath}/${kern}"
@@ -3441,7 +3539,6 @@ main()
obj_refkern_dirpath="${base_obj_dirpath}/${refkern_dir}"
#echo "${script_name}: creating ${obj_refkern_dirpath}"
mkdir -p ${obj_refkern_dirpath}
for conf in ${config_list}; do
echo "${script_name}: creating ${obj_refkern_dirpath}/${conf}"
@@ -3460,6 +3557,18 @@ main()
echo "${script_name}: creating ${obj_frame_dirpath}"
mkdir -p ${obj_frame_dirpath}
if [ -n "${addon_flag}" ]; then
obj_addon_dirpath="${base_obj_dirpath}/${addon_dir}"
for addon in ${addon_list}; do
echo "${script_name}: creating ${obj_addon_dirpath}/${addon}"
mkdir -p ${obj_addon_dirpath}/${addon}
done
fi
if [ -n "${sandbox_flag}" ]; then
obj_sandbox_dirpath="${base_obj_dirpath}/${sandbox_dir}"
@@ -3487,6 +3596,7 @@ main()
echo "${script_name}: creating ${base_lib_dirpath}"
mkdir -p ${base_lib_dirpath}
# Create include directory (if it does not already exist).
base_include_dirpath="${include_dirpath}/${config_name}"
@@ -3545,6 +3655,16 @@ main()
echo "${script_name}: mirroring ${aocldtl_dirpath} to ${obj_aocldtl_dirpath}"
${mirror_tree_sh} ${aocldtl_dirpath} ${obj_aocldtl_dirpath}
# Mirror the chosen addon source tree to its object sub-directory.
if [ -n "${addon_flag}" ]; then
for addon in ${addon_list}; do
echo "${script_name}: mirroring ${addon_dirpath}/${addon} to ${obj_addon_dirpath}/${addon}"
${mirror_tree_sh} "${addon_dirpath}/${addon}" "${obj_addon_dirpath}/${addon}"
done
fi
# Mirror the chosen sandbox source tree to its object sub-directory.
if [ -n "${sandbox_flag}" ]; then
@@ -3643,6 +3763,25 @@ main()
${gen_make_frags_dirpath}/suffix_list \
${gen_make_frags_dirpath}/ignore_list
# Generate makefile fragments in the addon sub-directory.
if [ -n "${addon_flag}" ]; then
for addon in ${addon_list}; do
echo "${script_name}: creating makefile fragments in ${obj_addon_dirpath}/${addon}"
${gen_make_frags_sh} \
-h -r -v0 \
-o ${script_name} \
-p 'ADDON' \
${addon_dirpath}/${addon} \
${obj_addon_dirpath}/${addon} \
${gen_make_frags_dirpath}/fragment.mk \
${gen_make_frags_dirpath}/suffix_list \
${gen_make_frags_dirpath}/ignore_list
done
fi
# Generate makefile fragments in the sandbox sub-directory.
if [ -n "${sandbox_flag}" ]; then

231
docs/Addons.md Normal file
View File

@@ -0,0 +1,231 @@
## Contents
* **[Introduction](Addons.md#introduction)**
* **[Enabling an addon](Addons.md#enabling-an-addon)**
* **[Addon rules](Addons.md#addon-rules)**
* **[Caveats](Addons.md#caveats)**
* **[Known issues](Addons.md#known-issues)**
* **[Conclusion](Addons.md#conclusion)**
## Introduction
This file briefly describes the requirements for building a custom BLIS
*addon*.
Simply put, an addon in BLIS provides additional APIs, operations, and/or
implementations that may be useful to certain users. An addon can be
thought of as a standalone extension of BLIS that does not depend on any
other addon, although addons may utilize existing functionality or kernels
within the core framework.
By definition, an addon should *never* provide APIs that conflict with
the interfaces that belong to either the [typed API](BLISTypedAPI.md) or the
[object API](BLISObjectAPI.md). Thus, you'll never have to worry about a
properly constructed (and properly functioning) addon interfering with or
otherwise changing core BLIS functionality.
How does an addon differ from a [sandbox](Sandboxes.md)? Great question!
Sometimes you want to include additional BLIS-like functionality that does
not relate directly to `gemm` or any other BLIS operation.
(By contrast, a sandbox requires you to implement `gemm` whether you want
to or not.)
Furthermore, you may wish to enable multiple addons simultaneously.
(By contrast, only one sandbox may be enabled at a time.)
Thus, the addon feature provides additional flexibility to some
users in a way that sandboxes cannot, while still providing many of the
conveniences of sandboxes.
## Enabling an addon
To enable an existing addon at configure-time, you simply specify it as an
option to `configure`. Either of the following usages is accepted:
```
$ ./configure --enable-addon=foobar auto
$ ./configure -a foobar auto
```
Here, we tell `configure` that we want to use the `foobar` addon, which
corresponds to a subdirectory of the `addon` directory named `foobar`.
(Reminder: the `auto` argument is the configuration target and
unrelated to addons.)
You may also enable multiple addons within the same build of BLIS:
```
$ ./configure -a foobar -a thing1 -a thing2 auto
```
Note that the default behavior of `configure` is that no addons are enabled.
As `configure` runs, you should get output that includes lines
similar to:
```
configure: configuring with addons:
configure: addon/foobar
configure: addon/thing1
configure: addon/thing2
```
And when you build BLIS, the addon source code will be among the last files to
be compiled:
```
Compiling obj/haswell/addon/foobar/foobar.o ('haswell' CFLAGS for addons)
Compiling obj/haswell/addon/thing1/thing1.o ('haswell' CFLAGS for addons)
Compiling obj/haswell/addon/thing1/thing1_api.o ('haswell' CFLAGS for addons)
Compiling obj/haswell/addon/thing2/thing2_api.o ('haswell' CFLAGS for addons)
...
```
That's it! After the BLIS library is built, it will contain your chosen
addons. You can always verify this by using `nm` to check for the presence
of your API symbols:
```
$ nm lib/haswell/libblis.a | grep foobar
foobar.o:
0000000000000000 T foobar
```
## Addon rules
Please follow these guidelines for the best developer experience when
creating addons.
1. As with sandboxes, you don't need to worry about creating makefiles. The
BLIS build system will take care of this for you. :) By configuring BLIS with
an addon enabled, `make` will scan your addon subdirectory and compile
all of its source code using similar compilation rules as were used for the rest
of the framework. In addition, the compilation command line will automatically
contain one `-I<includepath>` option for every subdirectory in your addon,
so it doesn't matter where in your addon directory hierarchy you place your
header files -- they will be found!
2. We recommend that you write your addon in C99. While you *may* use C++11
to implement your addon, you should provide a C99 wrapper API to your
implementation so that others can interface with it. There is no guarantee
that the end-user will be using a C++11 compiler, and therefore you should
limit the definitions in your addon header to those that are C99 compliant.
If you write your addon in C++11, you must use one of the BLIS-approved file
extensions for your source files (`.cc`, `.cpp`, `.cxx`) and your local
header files (`.hh`, `.hpp`, `.hxx`).
Note that `blis.h` already contains all of its definitions inside of an
`extern "C"` block, so you should be able to `#include "blis.h"` from your
C++11 source code without any issues.
3. All of your code related to the addon should reside within the named
addon directory, or some subdirectory therein. If your addon requires
new kernels, you should add kernel source code to an appropriate
microarchitecture-specific subdirectory within the top-level `kernels`
directory so that they are compiled with the correct
microarchitecture-specific optimization flags.
4. If your addon is named `foobar`, the BLIS build system will expect to
find a header called `foobar.h` somewhere in the `addon/foobar` directory
(or one of its subdirectories). This `foobar.h` header will automatically
be inlined into the monolithic `blis.h` header that is produced by the
BLIS build system. `foobar.h` may `#include` other local headers, each of
which will also (recursively) get inlined into `blis.h`. However, you may
choose to omit some local addon headers from `foobar.h`. You might do this,
for example, because those headers define things that are not needed in
order for the end user to call your addon code.
5. Your addon APIs will always be available within static library builds of
BLIS, but if you want your addon APIs to be exported as public APIs within
*shared* library builds of BLIS, you'll need to annotate the prototypes
accordingly. (BLIS makes its shared library symbols private by default; this
allows us to export only those functions that we consider to be part of the
public APIs.) This annotation can be done by prefixing function prototypes
with the `BLIS_EXPORT_ADDON` macro as follows:
```c
BLIS_EXPORT_ADDON void foobar_calc( void* a, void* b );
```
6. Do not define any symbols in your addon that conflict with any symbols within
the core framework. For example, don't define a function called `bli_copym()`
in your addon since that function is already defined within BLIS.
7. Do not define any symbols in your addon that conflict with any symbols within
the C99 standard libraries/headers. For example, don't define a function called
`printf()` since that function is already defined within the C99 standard library.
8. *Try* to not define any symbols in your addon that conflict with symbols in any
other addon, unless your addon is meant to serve as an alternative to the
conflicting addon, in which case conflicting symbol names are okay (since you
will presumably never build with both addons enabled).
9. When choosing names for your addon files, avoid source filenames that already
exist within BLIS. For example, don't name one of your files `bli_obj.c`
since that file would compile into `bli_obj.o`, which will have already been
placed into the library by the build system.
10. Similarly, avoid header filenames that already exist within BLIS or C99.
For example, don't name one of your header files `bli_obj.h` since that file
already exists in BLIS. Also, don't name one of your header files `math.h`
since that name would conflict with the `math.h` defined by C99. (This also
means you shouldn't name your addon `math` since normally that name would
require that you provide a `math.h` header inside the addon directory.)
If you follow these rules, you will be much more likely to have a pleasant
experience integrating your BLIS addon into the larger framework.
## Caveats
Notice that the BLIS addons are limited in what they can accomplish. Generally
speaking, addons cannot change existing implementations within BLIS. Instead,
addons aim to provide a way to quickly augment BLIS with additional bundles of
code that extend BLIS's set of functionality in some interesting way. If you
want to define new BLAS-like functions, but don't know where to start, creating
a new addon is an appropriate place to start experimenting. If you want to
change or refactor existing BLIS code, an addon is probably not suited for your
needs.
Another important limitation is the fact that the build system currently uses
"framework `CFLAGS`" when compiling the addon source files. These are the same
`CFLAGS` used when compiling general framework source code,
```
# Example framework CFLAGS used by 'haswell' sub-configuration
-O2 -Wall -Wno-unused-function -Wfatal-errors -fPIC -std=c99
-D_POSIX_C_SOURCE=200112L -Iinclude/haswell -I./frame/3/
-I./frame/1m/ -I./frame/1f/ -I./frame/1/ -I./frame/include
-DBLIS_VERSION_STRING=\"0.8.1-195\" -fvisibility=hidden
```
which are likely more general-purpose than the `CFLAGS` used for, say,
optimized kernels or even reference kernels:
```
# Example optimized kernel CFLAGS used by 'haswell' sub-configuration
-O3 -fomit-frame-pointer -mavx2 -mfma -mfpmath=sse -march=haswell -Wall
-Wno-unused-function -Wfatal-errors -fPIC -std=c99 -D_POSIX_C_SOURCE=200112L
-Iinclude/haswell -I./frame/3/ -I./frame/1m/ -I./frame/1f/ -I./frame/1/
-I./frame/include -DBLIS_VERSION_STRING=\"0.8.1-195\" -fvisibility=hidden
```
(To see precisely which flags are being employed for any given file, enable
verbosity at compile-time via `make V=1`.) Compiling addons with these more
versatile `CFLAGS` compiler options means that we only need to compile one
instance of each addon source file, even when targeting multiple
configurations (for example, via `./configure x86_64`). However, it also means
that addons are not ideal for microkernels, as they sometimes need additional
compiler flags in order to
yield the highest performance. If you have a new microkernel you would like to
use within an addon, you can always develop it within that addon. However,
once it is stable and ready for use by others, it's best to move the kernel(s)
to the appropriate microarchitecture-specific subdirectory of the `kernels`
directory. This will allow the kernel(s) to be compiled with the
appropriate microarchitecture-specific compiler flags.
Please see the
[Configuration Guide](ConfigurationHowTo.md)
for more details, and when in doubt, please don't be shy about seeking
guidance from BLIS developers by opening a
[new issue](https://github.com/flame/blis/issues) or sending a message to the
[blis-devel](https://groups.google.com/d/forum/blis-devel) mailing list.
Notwithstanding these limitations, hopefully you still find BLIS addons
useful!
## Known issues
* None yet.
## Conclusion
If you encounter any problems, please open
a new [issue on GitHub](https://github.com/flame/blis/issues).
If you are unsure about how something works, you can still open an issue. Or, you
can send a message to the
[blis-devel](https://groups.google.com/d/forum/blis-devel) mailing list.

View File

@@ -40,7 +40,7 @@
// This string gets defined via -D on the command line when BLIS is compiled.
// This string is (or rather, should be) only used here.
static char* bli_version_str = BLIS_VERSION_STRING;
static char* bli_version_str = "4.0"; //BLIS_VERSION_STRING;
static char* bli_int_type_size_str = STRINGIFY_INT( BLIS_INT_TYPE_SIZE );
char* bli_info_get_version_str( void ) { return bli_version_str; }

View File

@@ -241,8 +241,9 @@
#endif
#endif
#define BLIS_EXPORT_BLIS BLIS_EXPORT
#define BLIS_EXPORT_BLAS BLIS_EXPORT
#define BLIS_EXPORT_BLIS BLIS_EXPORT
#define BLIS_EXPORT_BLAS BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT
// -- STATIC INLINE FUNCTIONS --------------------------------------------------

View File

@@ -186,6 +186,14 @@ extern "C" {
#include "bli_util.h"
// -- addon definitions --
// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
#include "bli_addon.h"
// -- sandbox implementation --
#include "bli_sbox.h"