mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Added support for addons.
Details: - Implemented a new feature called addons, which are similar to sandboxes except that there is no requirement to define gemm or any other particular operation. - Updated configure to accept --enable-addon=<name> or -a <name> syntax for requesting an addon be included within a BLIS build. configure now outputs the list of enabled addons into config.mk. It also outputs the corresponding #include directives for the addons' headers to a new companion to the bli_config.h header file named bli_addon.h. Because addons may wish to make use of existing BLIS types within their own definitions, the addons' headers must be included sometime after that of bli_config.h (which currently is #included before bli_type_defs.h). This is why the #include directives needed to go into a new top-level header file rather than the existing bli_config.h file. - Added a markdown document, docs/Addons.md, to explain addons, how to build with them, and what assumptions their authors should keep in mind as they create them. - Added a gemmlike-like implementation of sandwich gemm called 'gemmd' as an addon in addon/gemmd. The code uses a 'bao_' prefix for local functions, including the user-level object and typed APIs. - Updated .gitignore so that git ignores bli_addon.h files. Change-Id: Ie7efdea366481ce25075cb2459bdbcfd52309717
This commit is contained in:
committed by
mkadavil
parent
0792eb8608
commit
7a0ba4194f
1
.gitignore
vendored
1
.gitignore
vendored
@@ -31,6 +31,7 @@
|
||||
|
||||
config.mk
|
||||
bli_config.h
|
||||
bli_addon.h
|
||||
|
||||
# -- monolithic headers --
|
||||
|
||||
|
||||
47
Makefile
47
Makefile
@@ -116,6 +116,7 @@ BASE_OBJ_FRAME_PATH := $(BASE_OBJ_PATH)/$(FRAME_DIR)
|
||||
BASE_OBJ_AOCLDTL_PATH := $(BASE_OBJ_PATH)/$(AOCLDTL_DIR)
|
||||
BASE_OBJ_REFKERN_PATH := $(BASE_OBJ_PATH)/$(REFKERN_DIR)
|
||||
BASE_OBJ_KERNELS_PATH := $(BASE_OBJ_PATH)/$(KERNELS_DIR)
|
||||
BASE_OBJ_ADDON_PATH := $(BASE_OBJ_PATH)/$(ADDON_DIR)
|
||||
BASE_OBJ_SANDBOX_PATH := $(BASE_OBJ_PATH)/$(SANDBOX_DIR)
|
||||
|
||||
# --- Define install target names for static libraries ---
|
||||
@@ -237,6 +238,9 @@ endif
|
||||
MK_AOCLDTL_OBJS := $(call gen-obj-paths-from-src,$(AOCLDTL_SRC_SUFS),$(MK_AOCLDTL_SRC),$(AOCLDTL_PATH),$(BASE_OBJ_AOCLDTL_PATH))
|
||||
|
||||
|
||||
# Generate object file paths for the addon source code. If one or more addons
|
||||
# were not enabled at configure-time, this variable will be empty.
|
||||
MK_ADDON_OBJS := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH))
|
||||
|
||||
# Generate object file paths for the sandbox source code. If a sandbox was not
|
||||
# enabled at configure-time, this variable will be empty.
|
||||
@@ -248,6 +252,7 @@ MK_BLIS_OBJS := $(MK_CONFIG_OBJS) \
|
||||
$(MK_REFKERN_OBJS) \
|
||||
$(MK_FRAME_OBJS) \
|
||||
$(MK_AOCLDTL_OBJS) \
|
||||
$(MK_ADDON_OBJS) \
|
||||
$(MK_SANDBOX_OBJS)
|
||||
|
||||
# Optionally filter out the BLAS and CBLAS compatibility layer object files.
|
||||
@@ -588,6 +593,28 @@ else
|
||||
endif
|
||||
endef
|
||||
|
||||
# Template for the pattern rule that compiles C99 addon source files into
# object files under $(BASE_OBJ_ADDON_PATH). Intended to be instantiated via
# $(eval $(call ...)).
# first argument: a configuration name from the union of config_list and
#   config_name, used to look up the CFLAGS to use during compilation.
# second argument: the source file suffix used in the rule's prerequisite
#   pattern, $(ADDON_PATH)/%.$(2).
define make-c99-addon-rule
$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS)
ifneq ($(ENABLE_VERBOSE),yes)
	@echo "Compiling $$@" $(call get-addon-c99text-for,$(1))
	@$(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@
else
	$(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@
endif
endef
|
||||
|
||||
# Template for the pattern rule that compiles C++ addon source files into
# object files under $(BASE_OBJ_ADDON_PATH). Intended to be instantiated via
# $(eval $(call ...)).
# first argument: a configuration name, used to look up the C++ flags to use
#   during compilation.
# second argument: the source file suffix used in the rule's prerequisite
#   pattern, $(ADDON_PATH)/%.$(2).
define make-cxx-addon-rule
$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_HXX_FILES) $(MAKE_DEFS_MK_PATHS)
ifneq ($(ENABLE_VERBOSE),yes)
	@echo "Compiling $$@" $(call get-addon-cxxtext-for,$(1))
	@$(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@
else
	$(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@
endif
endef
|
||||
|
||||
# first argument: a configuration name from the union of config_list and
|
||||
# config_name, used to look up the CFLAGS to use during compilation.
|
||||
define make-c99-sandbox-rule
|
||||
@@ -648,6 +675,16 @@ $(foreach conf, $(CONFIG_LIST), $(eval $(call make-refkern-rule,$(conf))))
|
||||
$(foreach suf, $(KERNELS_SRC_SUFS), \
|
||||
$(foreach kset, $(KERNEL_LIST), $(eval $(call make-kernels-rule,$(kset),$(call get-config-for-kset,$(kset)),$(suf)))))
|
||||
|
||||
# Instantiate the build rule for C addon files. Use the CFLAGS for the
|
||||
# configuration family.
|
||||
$(foreach suf, $(ADDON_C99_SUFS), \
|
||||
$(foreach conf, $(CONFIG_NAME), $(eval $(call make-c99-addon-rule,$(conf),$(suf)))))
|
||||
|
||||
# Instantiate the build rule for C++ addon files. Use the CXXFLAGS for the
|
||||
# configuration family.
|
||||
$(foreach suf, $(ADDON_CXX_SUFS), \
|
||||
$(foreach conf, $(CONFIG_NAME), $(eval $(call make-cxx-addon-rule,$(conf),$(suf)))))
|
||||
|
||||
# Instantiate the build rule for C sandbox files. Use the CFLAGS for the
|
||||
# configuration family.
|
||||
$(foreach suf, $(SANDBOX_C99_SUFS), \
|
||||
@@ -1141,6 +1178,9 @@ ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(FIND) $(AOCLDTL_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
ifneq ($(ADDON_LIST),)
|
||||
- $(FIND) $(ADDON_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
endif
|
||||
ifneq ($(SANDBOX),)
|
||||
- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
endif
|
||||
@@ -1155,6 +1195,10 @@ else
|
||||
@- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
@echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)"
|
||||
@- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
ifneq ($(ADDON_LIST),)
|
||||
@echo "Removing makefile fragments from $(ADDON_FRAG_PATH)"
|
||||
@- $(FIND) $(ADDON_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
endif
|
||||
ifneq ($(SANDBOX),)
|
||||
@echo "Removing makefile fragments from $(SANDBOX_FRAG_PATH)"
|
||||
@- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F)
|
||||
@@ -1275,6 +1319,7 @@ endif # IS_CONFIGURED
|
||||
distclean: cleanmk cleanh cleanlib cleantest
|
||||
ifeq ($(IS_CONFIGURED),yes)
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_F) $(BLIS_ADDON_H)
|
||||
- $(RM_F) $(BLIS_CONFIG_H)
|
||||
- $(RM_F) $(CONFIG_MK_FILE)
|
||||
- $(RM_F) $(PC_OUT_FILE)
|
||||
@@ -1282,6 +1327,8 @@ ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_RF) $(LIB_DIR)
|
||||
- $(RM_RF) $(INCLUDE_DIR)
|
||||
else
|
||||
@echo "Removing $(BLIS_ADDON_H)"
|
||||
@$(RM_F) $(BLIS_ADDON_H)
|
||||
@echo "Removing $(BLIS_CONFIG_H)"
|
||||
@$(RM_F) $(BLIS_CONFIG_H)
|
||||
@echo "Removing $(CONFIG_MK_FILE)"
|
||||
|
||||
88
addon/gemmd/attic/bli_gemm_ex.c
Normal file
88
addon/gemmd/attic/bli_gemm_ex.c
Normal file
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_gemm_ex
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
bli_init_once();
|
||||
|
||||
// A switch to easily toggle whether we use the addon implementation
|
||||
// of bao_gemmd() as the implementation for bli_gemm(). (This allows for
|
||||
// easy testing of bao_gemmd() via the testsuite.)
|
||||
if ( 1 )
|
||||
{
|
||||
const dim_t k = bli_obj_width_after_trans( a );
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
obj_t d;
|
||||
|
||||
bli_obj_create( dt, k, 1, 1, k, &d );
|
||||
bli_setv( &BLIS_ONE, &d );
|
||||
//bli_randv( &d );
|
||||
|
||||
bao_gemmd_ex( alpha, a, &d, b, beta, c, cntx, rntm );
|
||||
|
||||
bli_obj_free( &d );
|
||||
return;
|
||||
}
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_l;
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; }
|
||||
|
||||
// Obtain a valid (native) context from the gks if necessary.
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
|
||||
// Check the operands.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// Invoke the operation's front end.
|
||||
bli_gemm_front
|
||||
(
|
||||
alpha, a, b, beta, c, cntx, rntm, NULL
|
||||
);
|
||||
}
|
||||
|
||||
305
addon/gemmd/bao_gemmd.c
Normal file
305
addon/gemmd/bao_gemmd.c
Normal file
@@ -0,0 +1,305 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
//
|
||||
// -- Define the gemmd operation's object API ----------------------------------
|
||||
//
|
||||
|
||||
void bao_gemmd
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c
|
||||
)
|
||||
{
|
||||
bao_gemmd_ex
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
d,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
void bao_gemmd_ex
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
bli_init_once();
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_l;
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; }
|
||||
|
||||
// Obtain a valid (native) context from the gks if necessary.
|
||||
// NOTE: This must be done before calling the _check() function, since
|
||||
// that function assumes the context pointer is valid.
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bao_gemmd_check( alpha, a, d, b, beta, c, cntx );
|
||||
|
||||
// -- bli_gemmd_front() ----------------------------------------------------
|
||||
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( c ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// If alpha is zero, or if A or B has a zero dimension, scale C by beta
|
||||
// and return early.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
|
||||
bli_obj_has_zero_dim( a ) ||
|
||||
bli_obj_has_zero_dim( b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
// Induce a transposition of A if it has its transposition property set.
|
||||
// Then clear the transposition bit in the object.
|
||||
if ( bli_obj_has_trans( &a_local ) )
|
||||
{
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
|
||||
}
|
||||
|
||||
// Induce a transposition of B if it has its transposition property set.
|
||||
// Then clear the transposition bit in the object.
|
||||
if ( bli_obj_has_trans( &b_local ) )
|
||||
{
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &b_local );
|
||||
}
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
bli_rntm_set_ways_for_op
|
||||
(
|
||||
BLIS_GEMM,
|
||||
BLIS_LEFT, // ignored for gemm/hemm/symm
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
rntm
|
||||
);
|
||||
|
||||
// Spawn threads (if applicable), where bao_gemmd_int() is the thread entry
|
||||
// point function for each thread. This also begins the process of creating
|
||||
// the thrinfo_t tree, which contains thread communicators.
|
||||
bao_l3_thread_decorator
|
||||
(
|
||||
bao_gemmd_int,
|
||||
BLIS_GEMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
d,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntx,
|
||||
rntm
|
||||
);
|
||||
}
|
||||
|
||||
//
|
||||
// -- Define the gemmd operation's thread entry point --------------------------
|
||||
//
|
||||
|
||||
void bao_gemmd_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
// In this function, we choose the gemmd implementation that is executed
|
||||
// on each thread.
|
||||
|
||||
#if 1
|
||||
// Call the block-panel algorithm that calls the kernel directly, which
|
||||
// exposes edge-case handling.
|
||||
bao_gemmd_bp_var1
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
d,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread
|
||||
);
|
||||
#else
|
||||
// Call the block-panel algorithm that calls the kernel indirectly via a
|
||||
// wrapper function, which hides edge-case handling.
|
||||
bao_gemmd_bp_var2
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
d,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// -- Define the gemmd operation's typed API -----------------------------------
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* d, inc_t incd, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
bli_init_once(); \
|
||||
\
|
||||
/* Determine the datatype (e.g. BLIS_FLOAT, BLIS_DOUBLE, etc.) based on
|
||||
the macro parameter 'ch' (e.g. s, d, etc). */ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, dd, bo, betao, co; \
|
||||
\
|
||||
dim_t m_a, n_a; \
|
||||
dim_t m_b, n_b; \
|
||||
\
|
||||
/* Adjust the dimensions of matrices A and B according to the transa and
|
||||
transb parameters. */ \
|
||||
bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
|
||||
bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \
|
||||
\
|
||||
/* Create bufferless scalar objects and attach the provided scalar pointers
|
||||
to those scalar objects. */ \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
|
||||
\
|
||||
/* Create bufferless matrix objects and attach the provided matrix pointers
|
||||
to those matrix objects. */ \
|
||||
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, k, 1, d, incd, k, &dd ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
/* Set the transposition/conjugation properties of the objects for matrices
|
||||
A and B. */ \
|
||||
bli_obj_set_conjtrans( transa, &ao ); \
|
||||
bli_obj_set_conjtrans( transb, &bo ); \
|
||||
\
|
||||
/* Call the object interface. */ \
|
||||
PASTECH(bao_,opname) \
|
||||
( \
|
||||
&alphao, \
|
||||
&ao, \
|
||||
&dd, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co \
|
||||
); \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( gemmd )
|
||||
GENTFUNC( float, s, gemmd )
|
||||
GENTFUNC( double, d, gemmd )
|
||||
GENTFUNC( scomplex, c, gemmd )
|
||||
GENTFUNC( dcomplex, z, gemmd )
|
||||
|
||||
105
addon/gemmd/bao_gemmd.h
Normal file
105
addon/gemmd/bao_gemmd.h
Normal file
@@ -0,0 +1,105 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
//
|
||||
// -- Prototype the gemmd operation's object API -------------------------------
|
||||
//
|
||||
|
||||
BLIS_EXPORT_ADDON void bao_gemmd
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c
|
||||
);
|
||||
|
||||
BLIS_EXPORT_ADDON void bao_gemmd_ex
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
//
|
||||
// -- Prototype the gemmd operation's thread entry point -----------------------
|
||||
//
|
||||
|
||||
void bao_gemmd_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
//
|
||||
// -- Prototype the gemmd operation's typed API --------------------------------
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
BLIS_EXPORT_ADDON void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* d, inc_t incd, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( gemmd )
|
||||
GENTPROT( float, s, gemmd )
|
||||
GENTPROT( double, d, gemmd )
|
||||
GENTPROT( scomplex, c, gemmd )
|
||||
GENTPROT( dcomplex, z, gemmd )
|
||||
|
||||
530
addon/gemmd/bao_gemmd_bp_var1.c
Normal file
530
addon/gemmd/bao_gemmd_bp_var1.c
Normal file
@@ -0,0 +1,530 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemmd_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* restrict alpha,
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a,
|
||||
void* restrict d, inc_t incd,
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b,
|
||||
void* restrict beta,
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* restrict cntx,
|
||||
rntm_t* restrict rntm,
|
||||
thrinfo_t* restrict thread
|
||||
);
|
||||
|
||||
//
|
||||
// -- gemmd-like block-panel algorithm (object interface) ----------------------
|
||||
//
|
||||
|
||||
// Define a function pointer array named ftypes and initialize its contents with
|
||||
// the addresses of the typed functions defined below, bao_?gemmd_bp_var1().
|
||||
static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var1);
|
||||
|
||||
void bao_gemmd_bp_var1
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t k = bli_obj_width( a );
|
||||
|
||||
void* restrict buf_a = bli_obj_buffer_at_off( a );
|
||||
const inc_t rs_a = bli_obj_row_stride( a );
|
||||
const inc_t cs_a = bli_obj_col_stride( a );
|
||||
|
||||
void* restrict buf_d = bli_obj_buffer_at_off( d );
|
||||
const inc_t incd = bli_obj_vector_inc( d );
|
||||
|
||||
void* restrict buf_b = bli_obj_buffer_at_off( b );
|
||||
const inc_t rs_b = bli_obj_row_stride( b );
|
||||
const inc_t cs_b = bli_obj_col_stride( b );
|
||||
|
||||
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||
const inc_t rs_c = bli_obj_row_stride( c );
|
||||
const inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
|
||||
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta );
|
||||
|
||||
// Index into the function pointer array to extract the correct
|
||||
// typed function pointer based on the chosen datatype.
|
||||
FUNCPTR_T f = ftypes[dt];
|
||||
|
||||
// Invoke the function.
|
||||
f
|
||||
(
|
||||
conja,
|
||||
conjb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_d, incd,
|
||||
buf_b, rs_b, cs_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread
|
||||
);
|
||||
}
|
||||
|
||||
//
|
||||
// -- gemmd-like block-panel algorithm (typed interface) -----------------------
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,varname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* restrict alpha, \
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
void* restrict d, inc_t incd, \
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
void* restrict beta, \
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Query the context for various blocksizes. */ \
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
|
||||
const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
|
||||
const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
|
||||
\
|
||||
/* Query the context for the microkernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
/* Compute partitioning step values for each matrix of each loop. */ \
|
||||
const inc_t jcstep_c = cs_c; \
|
||||
const inc_t jcstep_b = cs_b; \
|
||||
\
|
||||
const inc_t pcstep_a = cs_a; \
|
||||
const inc_t pcstep_d = incd; \
|
||||
const inc_t pcstep_b = rs_b; \
|
||||
\
|
||||
const inc_t icstep_c = rs_c; \
|
||||
const inc_t icstep_a = rs_a; \
|
||||
\
|
||||
const inc_t jrstep_c = cs_c * NR; \
|
||||
\
|
||||
const inc_t irstep_c = rs_c * MR; \
|
||||
\
|
||||
ctype* restrict a_00 = a; \
|
||||
ctype* restrict d_00 = d; \
|
||||
ctype* restrict b_00 = b; \
|
||||
ctype* restrict c_00 = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
\
|
||||
/* Make local copies of the scalars to prevent any unnecessary sharing of
|
||||
cache lines between the cores' caches. */ \
|
||||
ctype alpha_local = *alpha_cast; \
|
||||
ctype beta_local = *beta_cast; \
|
||||
ctype one_local = *PASTEMAC(ch,1); \
|
||||
ctype zero_local = *PASTEMAC(ch,0); \
|
||||
\
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
|
||||
needed for the matrix we will be packing (if any), but we do it
|
||||
unconditionally to be safe. */ \
|
||||
mem_t mem_a = BLIS_MEM_INITIALIZER; \
|
||||
mem_t mem_b = BLIS_MEM_INITIALIZER; \
|
||||
\
|
||||
/* Define an array of bszid_t ids, which will act as our substitute for
|
||||
the cntl_t tree. */ \
|
||||
bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \
|
||||
BLIS_KC, /* 4th loop */ \
|
||||
BLIS_NO_PART, /* pack B */ \
|
||||
BLIS_MC, /* 3rd loop */ \
|
||||
BLIS_NO_PART, /* pack A */ \
|
||||
BLIS_NR, /* 2nd loop */ \
|
||||
BLIS_MR, /* 1st loop */ \
|
||||
BLIS_KR }; /* microkernel loop */ \
|
||||
\
|
||||
bszid_t* restrict bszids_jc = &bszids[0]; \
|
||||
bszid_t* restrict bszids_pc = &bszids[1]; \
|
||||
/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
|
||||
bszid_t* restrict bszids_ic = &bszids[3]; \
|
||||
/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
|
||||
bszid_t* restrict bszids_jr = &bszids[5]; \
|
||||
/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
|
||||
\
|
||||
thrinfo_t* restrict thread_jc = NULL; \
|
||||
thrinfo_t* restrict thread_pc = NULL; \
|
||||
thrinfo_t* restrict thread_pb = NULL; \
|
||||
thrinfo_t* restrict thread_ic = NULL; \
|
||||
thrinfo_t* restrict thread_pa = NULL; \
|
||||
thrinfo_t* restrict thread_jr = NULL; \
|
||||
thrinfo_t* restrict thread_ir = NULL; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_jc = thread; \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
|
||||
\
|
||||
/* Compute the JC loop thread range for the current thread. */ \
|
||||
dim_t jc_start, jc_end; \
|
||||
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
|
||||
const dim_t n_local = jc_end - jc_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JC loop. */ \
|
||||
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
|
||||
const dim_t jc_left = n_local % NC; \
|
||||
\
|
||||
/* Loop over the n dimension (NC rows/columns at a time). */ \
|
||||
for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
|
||||
{ \
|
||||
/* Calculate the thread's current JC block dimension. */ \
|
||||
const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
|
||||
\
|
||||
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
|
||||
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_pc = bli_thrinfo_sub_node( thread_jc ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
|
||||
\
|
||||
/* Compute the PC loop thread range for the current thread. */ \
|
||||
const dim_t pc_start = 0, pc_end = k; \
|
||||
const dim_t k_local = k; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the PC loop. */ \
|
||||
/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
|
||||
const dim_t pc_left = k_local % KC; \
|
||||
\
|
||||
/* Loop over the k dimension (KC rows/columns at a time). */ \
|
||||
for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
|
||||
{ \
|
||||
/* Calculate the thread's current PC block dimension. */ \
|
||||
const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
|
||||
\
|
||||
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
|
||||
ctype* restrict d_pc = d_00 + pp * pcstep_d; \
|
||||
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
|
||||
\
|
||||
/* Only apply beta to the first iteration of the pc loop. */ \
|
||||
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
|
||||
\
|
||||
ctype* b_use; \
|
||||
inc_t rs_b_use, cs_b_use, ps_b_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node. Note that the thrinfo_t
|
||||
node will have already been created by a previous call to
|
||||
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
|
||||
cause the tree to grow by two (e.g. to the next bszid that is
|
||||
a normal bszid_t value). */ \
|
||||
thread_pb = bli_thrinfo_sub_node( thread_pc ); \
|
||||
/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
B. Then call the packm implementation. */ \
|
||||
PASTECH2(bao_,ch,packm_b) \
|
||||
( \
|
||||
conjb, \
|
||||
KC, NC, \
|
||||
kc_cur, nc_cur, NR, \
|
||||
&one_local, \
|
||||
d_pc, incd, \
|
||||
b_pc, rs_b, cs_b, \
|
||||
&b_use, &rs_b_use, &cs_b_use, \
|
||||
&ps_b_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread_pb \
|
||||
); \
|
||||
\
|
||||
/* Alias b_use so that it's clear this is our current block of
|
||||
matrix B. */ \
|
||||
ctype* restrict b_pc_use = b_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
|
||||
\
|
||||
/* Compute the IC loop thread range for the current thread. */ \
|
||||
dim_t ic_start, ic_end; \
|
||||
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
|
||||
const dim_t m_local = ic_end - ic_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the IC loop. */ \
|
||||
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
|
||||
const dim_t ic_left = m_local % MC; \
|
||||
\
|
||||
/* Loop over the m dimension (MC rows at a time). */ \
|
||||
for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
|
||||
{ \
|
||||
/* Calculate the thread's current IC block dimension. */ \
|
||||
const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
|
||||
\
|
||||
ctype* restrict a_ic = a_pc + ii * icstep_a; \
|
||||
ctype* restrict c_ic = c_jc + ii * icstep_c; \
|
||||
\
|
||||
ctype* a_use; \
|
||||
inc_t rs_a_use, cs_a_use, ps_a_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node. Note that the thrinfo_t
|
||||
node will have already been created by a previous call to
|
||||
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
|
||||
cause the tree to grow by two (e.g. to the next bszid that is
|
||||
a normal bszid_t value). */ \
|
||||
thread_pa = bli_thrinfo_sub_node( thread_ic ); \
|
||||
/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
A. Then call the packm implementation. */ \
|
||||
PASTECH2(bao_,ch,packm_a) \
|
||||
( \
|
||||
conja, \
|
||||
MC, KC, \
|
||||
mc_cur, kc_cur, MR, \
|
||||
&one_local, \
|
||||
d_pc, incd, \
|
||||
a_ic, rs_a, cs_a, \
|
||||
&a_use, &rs_a_use, &cs_a_use, \
|
||||
&ps_a_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread_pa \
|
||||
); \
|
||||
\
|
||||
/* Alias a_use so that it's clear this is our current block of
|
||||
matrix A. */ \
|
||||
ctype* restrict a_ic_use = a_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for the JR loop.
|
||||
NOTE: These values are only needed when computing the next
|
||||
micropanel of B. */ \
|
||||
const dim_t jr_nt = bli_thread_n_way( thread_jr ); \
|
||||
const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JR loop. */ \
|
||||
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
|
||||
dim_t jr_left = nc_cur % NR; \
|
||||
\
|
||||
/* Compute the JR loop thread range for the current thread. */ \
|
||||
dim_t jr_start, jr_end; \
|
||||
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
|
||||
{ \
|
||||
const dim_t nr_cur \
|
||||
= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
|
||||
\
|
||||
ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
|
||||
ctype* restrict c_jr = c_ic + j * jrstep_c; \
|
||||
\
|
||||
/* Assume for now that our next panel of B to be the current panel
|
||||
of B. */ \
|
||||
ctype* restrict b2 = b_jr; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node. */ \
|
||||
thread_ir = bli_thrinfo_sub_node( thread_jr ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for the IR loop.
|
||||
NOTE: These values are only needed when computing the next
|
||||
micropanel of A. */ \
|
||||
const dim_t ir_nt = bli_thread_n_way( thread_ir ); \
|
||||
const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the IR loop. */ \
|
||||
dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
|
||||
dim_t ir_left = mc_cur % MR; \
|
||||
\
|
||||
/* Compute the IR loop thread range for the current thread. */ \
|
||||
dim_t ir_start, ir_end; \
|
||||
bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
|
||||
{ \
|
||||
const dim_t mr_cur \
|
||||
= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
|
||||
\
|
||||
ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
|
||||
ctype* restrict c_ir = c_jr + i * irstep_c; \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next micropanels of A and B. */ \
|
||||
a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
|
||||
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_ic_use; \
|
||||
b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
|
||||
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
|
||||
b2 = b_pc_use; \
|
||||
} \
|
||||
\
|
||||
/* Save the addresses of next micropanels of A and B to the
|
||||
auxinfo_t object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( mr_cur == MR && nr_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm microkernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
kc_cur, \
|
||||
&alpha_local, \
|
||||
a_ir, \
|
||||
b_jr, \
|
||||
beta_use, \
|
||||
c_ir, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm microkernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
kc_cur, \
|
||||
&alpha_local, \
|
||||
a_ir, \
|
||||
b_jr, \
|
||||
&zero_local, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the bottom edge of C and add the result from above. */ \
|
||||
PASTEMAC(ch,xpbys_mxn) \
|
||||
( \
|
||||
mr_cur, \
|
||||
nr_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_use, \
|
||||
c_ir, rs_c, cs_c \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* This barrier is needed to prevent threads from starting to pack
|
||||
the next row panel of B before the current row panel is fully
|
||||
computed upon. */ \
|
||||
bli_thread_barrier( thread_pb ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Release any memory that was acquired for packing matrices A and B. */ \
|
||||
PASTECH2(bao_,ch,packm_finalize_mem_a) \
|
||||
( \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread_pa \
|
||||
); \
|
||||
PASTECH2(bao_,ch,packm_finalize_mem_b) \
|
||||
( \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread_pb \
|
||||
); \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
// Expand GENTFUNC with varname gemmd_bp_var1 once for each of the four
// (ctype, ch) pairs listed below: (float, s), (double, d), (scomplex, c),
// and (dcomplex, z). The INSERT_GENTFUNC_BASIC0 form is left commented out
// and the expansions are spelled out explicitly instead.
//INSERT_GENTFUNC_BASIC0( gemmd_bp_var1 )
|
||||
GENTFUNC( float, s, gemmd_bp_var1 )
|
||||
GENTFUNC( double, d, gemmd_bp_var1 )
|
||||
GENTFUNC( scomplex, c, gemmd_bp_var1 )
|
||||
GENTFUNC( dcomplex, z, gemmd_bp_var1 )
|
||||
|
||||
602
addon/gemmd/bao_gemmd_bp_var2.c
Normal file
602
addon/gemmd/bao_gemmd_bp_var2.c
Normal file
@@ -0,0 +1,602 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemmd_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* restrict alpha,
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a,
|
||||
void* restrict d, inc_t incd,
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b,
|
||||
void* restrict beta,
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* restrict cntx,
|
||||
rntm_t* restrict rntm,
|
||||
thrinfo_t* restrict thread
|
||||
);
|
||||
|
||||
//
|
||||
// -- gemmd-like block-panel algorithm (object interface) ----------------------
|
||||
//
|
||||
|
||||
// Define a function pointer array named ftypes and initialize its contents with
|
||||
// the addresses of the typed functions defined below, bao_?gemmd_bp_var2().
|
||||
static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var2);
|
||||
|
||||
void bao_gemmd_bp_var2
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t k = bli_obj_width( a );
|
||||
|
||||
void* restrict buf_a = bli_obj_buffer_at_off( a );
|
||||
const inc_t rs_a = bli_obj_row_stride( a );
|
||||
const inc_t cs_a = bli_obj_col_stride( a );
|
||||
|
||||
void* restrict buf_d = bli_obj_buffer_at_off( d );
|
||||
const inc_t incd = bli_obj_vector_inc( d );
|
||||
|
||||
void* restrict buf_b = bli_obj_buffer_at_off( b );
|
||||
const inc_t rs_b = bli_obj_row_stride( b );
|
||||
const inc_t cs_b = bli_obj_col_stride( b );
|
||||
|
||||
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||
const inc_t rs_c = bli_obj_row_stride( c );
|
||||
const inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
|
||||
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta );
|
||||
|
||||
// Index into the function pointer array to extract the correct
|
||||
// typed function pointer based on the chosen datatype.
|
||||
FUNCPTR_T f = ftypes[dt];
|
||||
|
||||
// Invoke the function.
|
||||
f
|
||||
(
|
||||
conja,
|
||||
conjb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_d, incd,
|
||||
buf_b, rs_b, cs_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread
|
||||
);
|
||||
}
|
||||
|
||||
//
|
||||
// -- gemmd-like block-panel algorithm (typed interface) -----------------------
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,varname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* restrict alpha, \
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
void* restrict d, inc_t incd, \
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
void* restrict beta, \
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Query the context for various blocksizes. */ \
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
|
||||
const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
|
||||
const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
|
||||
\
|
||||
/* Query the context for the microkernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
/*
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
*/ \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
/*
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
*/ \
|
||||
\
|
||||
/* Compute partitioning step values for each matrix of each loop. */ \
|
||||
const inc_t jcstep_c = cs_c; \
|
||||
const inc_t jcstep_b = cs_b; \
|
||||
\
|
||||
const inc_t pcstep_a = cs_a; \
|
||||
const inc_t pcstep_d = incd; \
|
||||
const inc_t pcstep_b = rs_b; \
|
||||
\
|
||||
const inc_t icstep_c = rs_c; \
|
||||
const inc_t icstep_a = rs_a; \
|
||||
\
|
||||
const inc_t jrstep_c = cs_c * NR; \
|
||||
\
|
||||
const inc_t irstep_c = rs_c * MR; \
|
||||
\
|
||||
ctype* restrict a_00 = a; \
|
||||
ctype* restrict d_00 = d; \
|
||||
ctype* restrict b_00 = b; \
|
||||
ctype* restrict c_00 = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
\
|
||||
/* Make local copies of the scalars to prevent any unnecessary sharing of
|
||||
cache lines between the cores' caches. */ \
|
||||
ctype alpha_local = *alpha_cast; \
|
||||
ctype beta_local = *beta_cast; \
|
||||
ctype one_local = *PASTEMAC(ch,1); \
|
||||
/*ctype zero_local = *PASTEMAC(ch,0);*/ \
|
||||
\
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
|
||||
needed for the matrix we will be packing (if any), but we do it
|
||||
unconditionally to be safe. */ \
|
||||
mem_t mem_a = BLIS_MEM_INITIALIZER; \
|
||||
mem_t mem_b = BLIS_MEM_INITIALIZER; \
|
||||
\
|
||||
/* Define an array of bszid_t ids, which will act as our substitute for
|
||||
the cntl_t tree. */ \
|
||||
bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \
|
||||
BLIS_KC, /* 4th loop */ \
|
||||
BLIS_NO_PART, /* pack B */ \
|
||||
BLIS_MC, /* 3rd loop */ \
|
||||
BLIS_NO_PART, /* pack A */ \
|
||||
BLIS_NR, /* 2nd loop */ \
|
||||
BLIS_MR, /* 1st loop */ \
|
||||
BLIS_KR }; /* microkernel loop */ \
|
||||
\
|
||||
bszid_t* restrict bszids_jc = &bszids[0]; \
|
||||
bszid_t* restrict bszids_pc = &bszids[1]; \
|
||||
/*bszid_t* restrict bszids_pb = &bszids[2];*/ \
|
||||
bszid_t* restrict bszids_ic = &bszids[3]; \
|
||||
/*bszid_t* restrict bszids_pa = &bszids[4];*/ \
|
||||
bszid_t* restrict bszids_jr = &bszids[5]; \
|
||||
/*bszid_t* restrict bszids_ir = &bszids[6];*/ \
|
||||
\
|
||||
thrinfo_t* restrict thread_jc = NULL; \
|
||||
thrinfo_t* restrict thread_pc = NULL; \
|
||||
thrinfo_t* restrict thread_pb = NULL; \
|
||||
thrinfo_t* restrict thread_ic = NULL; \
|
||||
thrinfo_t* restrict thread_pa = NULL; \
|
||||
thrinfo_t* restrict thread_jr = NULL; \
|
||||
thrinfo_t* restrict thread_ir = NULL; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_jc = thread; \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
|
||||
\
|
||||
/* Compute the JC loop thread range for the current thread. */ \
|
||||
dim_t jc_start, jc_end; \
|
||||
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
|
||||
const dim_t n_local = jc_end - jc_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JC loop. */ \
|
||||
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
|
||||
const dim_t jc_left = n_local % NC; \
|
||||
\
|
||||
/* Loop over the n dimension (NC rows/columns at a time). */ \
|
||||
for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
|
||||
{ \
|
||||
/* Calculate the thread's current JC block dimension. */ \
|
||||
const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
|
||||
\
|
||||
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
|
||||
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_pc = bli_thrinfo_sub_node( thread_jc ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
|
||||
\
|
||||
/* Compute the PC loop thread range for the current thread. */ \
|
||||
const dim_t pc_start = 0, pc_end = k; \
|
||||
const dim_t k_local = k; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the PC loop. */ \
|
||||
/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
|
||||
const dim_t pc_left = k_local % KC; \
|
||||
\
|
||||
/* Loop over the k dimension (KC rows/columns at a time). */ \
|
||||
for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
|
||||
{ \
|
||||
/* Calculate the thread's current PC block dimension. */ \
|
||||
const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
|
||||
\
|
||||
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
|
||||
ctype* restrict d_pc = d_00 + pp * pcstep_d; \
|
||||
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
|
||||
\
|
||||
/* Only apply beta to the first iteration of the pc loop. */ \
|
||||
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
|
||||
\
|
||||
ctype* b_use; \
|
||||
inc_t rs_b_use, cs_b_use, ps_b_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node. Note that the thrinfo_t
|
||||
node will have already been created by a previous call to
|
||||
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
|
||||
cause the tree to grow by two (e.g. to the next bszid that is
|
||||
a normal bszid_t value). */ \
|
||||
thread_pb = bli_thrinfo_sub_node( thread_pc ); \
|
||||
/*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
B. Then call the packm implementation. */ \
|
||||
PASTECH2(bao_,ch,packm_b) \
|
||||
( \
|
||||
conjb, \
|
||||
KC, NC, \
|
||||
kc_cur, nc_cur, NR, \
|
||||
&one_local, \
|
||||
d_pc, incd, \
|
||||
b_pc, rs_b, cs_b, \
|
||||
&b_use, &rs_b_use, &cs_b_use, \
|
||||
&ps_b_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread_pb \
|
||||
); \
|
||||
\
|
||||
/* Alias b_use so that it's clear this is our current block of
|
||||
matrix B. */ \
|
||||
ctype* restrict b_pc_use = b_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
|
||||
\
|
||||
/* Compute the IC loop thread range for the current thread. */ \
|
||||
dim_t ic_start, ic_end; \
|
||||
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
|
||||
const dim_t m_local = ic_end - ic_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the IC loop. */ \
|
||||
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
|
||||
const dim_t ic_left = m_local % MC; \
|
||||
\
|
||||
/* Loop over the m dimension (MC rows at a time). */ \
|
||||
for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
|
||||
{ \
|
||||
/* Calculate the thread's current IC block dimension. */ \
|
||||
const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
|
||||
\
|
||||
ctype* restrict a_ic = a_pc + ii * icstep_a; \
|
||||
ctype* restrict c_ic = c_jc + ii * icstep_c; \
|
||||
\
|
||||
ctype* a_use; \
|
||||
inc_t rs_a_use, cs_a_use, ps_a_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node. Note that the thrinfo_t
|
||||
node will have already been created by a previous call to
|
||||
bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART
|
||||
cause the tree to grow by two (e.g. to the next bszid that is
|
||||
a normal bszid_t value). */ \
|
||||
thread_pa = bli_thrinfo_sub_node( thread_ic ); \
|
||||
/*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
A. Then call the packm implementation. */ \
|
||||
PASTECH2(bao_,ch,packm_a) \
|
||||
( \
|
||||
conja, \
|
||||
MC, KC, \
|
||||
mc_cur, kc_cur, MR, \
|
||||
&one_local, \
|
||||
d_pc, incd, \
|
||||
a_ic, rs_a, cs_a, \
|
||||
&a_use, &rs_a_use, &cs_a_use, \
|
||||
&ps_a_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread_pa \
|
||||
); \
|
||||
\
|
||||
/* Alias a_use so that it's clear this is our current block of
|
||||
matrix A. */ \
|
||||
ctype* restrict a_ic_use = a_use; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for the JR loop.
|
||||
NOTE: These values are only needed when computing the next
|
||||
micropanel of B. */ \
|
||||
const dim_t jr_nt = bli_thread_n_way( thread_jr ); \
|
||||
const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JR loop. */ \
|
||||
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
|
||||
dim_t jr_left = nc_cur % NR; \
|
||||
\
|
||||
/* Compute the JR loop thread range for the current thread. */ \
|
||||
dim_t jr_start, jr_end; \
|
||||
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
|
||||
{ \
|
||||
const dim_t nr_cur \
|
||||
= ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
|
||||
\
|
||||
ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
|
||||
ctype* restrict c_jr = c_ic + j * jrstep_c; \
|
||||
\
|
||||
/* Assume for now that our next panel of B to be the current panel
|
||||
of B. */ \
|
||||
ctype* restrict b2 = b_jr; \
|
||||
\
|
||||
/* Identify the current thrinfo_t node. */ \
|
||||
thread_ir = bli_thrinfo_sub_node( thread_jr ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for the IR loop.
|
||||
NOTE: These values are only needed when computing the next
|
||||
micropanel of A. */ \
|
||||
const dim_t ir_nt = bli_thread_n_way( thread_ir ); \
|
||||
const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the IR loop. */ \
|
||||
dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
|
||||
dim_t ir_left = mc_cur % MR; \
|
||||
\
|
||||
/* Compute the IR loop thread range for the current thread. */ \
|
||||
dim_t ir_start, ir_end; \
|
||||
bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( dim_t i = ir_start; i < ir_end; i += 1 ) \
|
||||
{ \
|
||||
const dim_t mr_cur \
|
||||
= ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
|
||||
\
|
||||
ctype* restrict a_ir = a_ic_use + i * ps_a_use; \
|
||||
ctype* restrict c_ir = c_jr + i * irstep_c; \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next micropanels of A and B. */ \
|
||||
a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \
|
||||
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_ic_use; \
|
||||
b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \
|
||||
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
|
||||
b2 = b_pc_use; \
|
||||
} \
|
||||
\
|
||||
/* Save the addresses of next micropanels of A and B to the
|
||||
auxinfo_t object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Call a wrapper to the kernel (which handles edge cases). */ \
|
||||
PASTECH2(bao_,ch,gemm_kernel) \
|
||||
( \
|
||||
MR, \
|
||||
NR, \
|
||||
mr_cur, \
|
||||
nr_cur, \
|
||||
kc_cur, \
|
||||
&alpha_local, \
|
||||
a_ir, rs_a_use, cs_a_use, \
|
||||
b_jr, rs_b_use, cs_b_use, \
|
||||
beta_use, \
|
||||
c_ir, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* This barrier is needed to prevent threads from starting to pack
|
||||
the next row panel of B before the current row panel is fully
|
||||
computed upon. */ \
|
||||
bli_thread_barrier( thread_pb ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Release any memory that was acquired for packing matrices A and B. */ \
|
||||
PASTECH2(bao_,ch,packm_finalize_mem_a) \
|
||||
( \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread_pa \
|
||||
); \
|
||||
PASTECH2(bao_,ch,packm_finalize_mem_b) \
|
||||
( \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread_pb \
|
||||
); \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( gemmd_bp_var2 )
|
||||
GENTFUNC( float, s, gemmd_bp_var2 )
|
||||
GENTFUNC( double, d, gemmd_bp_var2 )
|
||||
GENTFUNC( scomplex, c, gemmd_bp_var2 )
|
||||
GENTFUNC( dcomplex, z, gemmd_bp_var2 )
|
||||
|
||||
//
|
||||
// -- gemm-like microkernel wrapper --------------------------------------------
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,varname) \
|
||||
( \
|
||||
const dim_t MR, \
|
||||
const dim_t NR, \
|
||||
dim_t mr_cur, \
|
||||
dim_t nr_cur, \
|
||||
dim_t kc_cur, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict aux, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
/* Infer the datatype from the ctype. */ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Query the context for the microkernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype zero = *PASTEMAC(ch,0); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( mr_cur == MR && nr_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm microkernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
kc_cur, \
|
||||
alpha, \
|
||||
a, \
|
||||
b, \
|
||||
beta, \
|
||||
c, rs_c, cs_c, \
|
||||
aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm microkernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
kc_cur, \
|
||||
alpha, \
|
||||
a, \
|
||||
b, \
|
||||
&zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the bottom edge of C and add the result from above. */ \
|
||||
PASTEMAC(ch,xpbys_mxn) \
|
||||
( \
|
||||
mr_cur, \
|
||||
nr_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta, \
|
||||
c, rs_c, cs_c \
|
||||
); \
|
||||
} \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( gemm_kernel )
|
||||
GENTFUNC( float, s, gemm_kernel )
|
||||
GENTFUNC( double, d, gemm_kernel )
|
||||
GENTFUNC( scomplex, c, gemm_kernel )
|
||||
GENTFUNC( dcomplex, z, gemm_kernel )
|
||||
|
||||
131
addon/gemmd/bao_gemmd_check.c
Normal file
131
addon/gemmd/bao_gemmd_check.c
Normal file
@@ -0,0 +1,131 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bao_gemmd_check
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
err_t e_val;
|
||||
|
||||
// Check object datatypes.
|
||||
|
||||
e_val = bli_check_noninteger_object( alpha );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_noninteger_object( beta );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_floating_object( a );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_floating_object( d );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_floating_object( b );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_floating_object( c );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
// Check scalar/vector/matrix type.
|
||||
|
||||
e_val = bli_check_scalar_object( alpha );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_scalar_object( beta );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_matrix_object( a );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_vector_object( d );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_matrix_object( b );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_matrix_object( c );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
// Check object buffers (for non-NULLness).
|
||||
|
||||
e_val = bli_check_object_buffer( alpha );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_object_buffer( a );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_object_buffer( d );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_object_buffer( b );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_object_buffer( beta );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_object_buffer( c );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
// Check object dimensions.
|
||||
|
||||
e_val = bli_check_level3_dims( a, b, c );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_vector_dim_equals( d, bli_obj_width_after_trans( a ) );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
// Check for consistent datatypes.
|
||||
// NOTE: We only perform these tests when mixed datatype support is
|
||||
// disabled.
|
||||
|
||||
e_val = bli_check_consistent_object_datatypes( c, a );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_consistent_object_datatypes( c, d );
|
||||
bli_check_error_code( e_val );
|
||||
|
||||
e_val = bli_check_consistent_object_datatypes( c, b );
|
||||
bli_check_error_code( e_val );
|
||||
}
|
||||
|
||||
50
addon/gemmd/bao_gemmd_check.h
Normal file
50
addon/gemmd/bao_gemmd_check.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based check functions.
|
||||
//
|
||||
|
||||
void bao_gemmd_check
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
126
addon/gemmd/bao_gemmd_var.h
Normal file
126
addon/gemmd/bao_gemmd_var.h
Normal file
@@ -0,0 +1,126 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype the object-based variant interfaces.
|
||||
//
|
||||
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTECH(bao_,opname) \
|
||||
( \
|
||||
obj_t* alpha, \
|
||||
obj_t* a, \
|
||||
obj_t* d, \
|
||||
obj_t* b, \
|
||||
obj_t* beta, \
|
||||
obj_t* c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
GENPROT( gemmd_bp_var1 )
|
||||
GENPROT( gemmd_bp_var2 )
|
||||
|
||||
|
||||
//
|
||||
// Prototype the typed variant interfaces.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,varname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* restrict alpha, \
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
void* restrict d, inc_t incd, \
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
void* restrict beta, \
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
thrinfo_t* restrict thread \
|
||||
);
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( gemmd_bp_var1 )
|
||||
GENTPROT( float, s, gemmd_bp_var1 )
|
||||
GENTPROT( double, d, gemmd_bp_var1 )
|
||||
GENTPROT( scomplex, c, gemmd_bp_var1 )
|
||||
GENTPROT( dcomplex, z, gemmd_bp_var1 )
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( gemmd_bp_var2 )
|
||||
GENTPROT( float, s, gemmd_bp_var2 )
|
||||
GENTPROT( double, d, gemmd_bp_var2 )
|
||||
GENTPROT( scomplex, c, gemmd_bp_var2 )
|
||||
GENTPROT( dcomplex, z, gemmd_bp_var2 )
|
||||
|
||||
|
||||
//
|
||||
// Prototype the typed kernel interfaces.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,varname) \
|
||||
( \
|
||||
const dim_t MR, \
|
||||
const dim_t NR, \
|
||||
dim_t mr_cur, \
|
||||
dim_t nr_cur, \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict aux, \
|
||||
cntx_t* restrict cntx \
|
||||
);
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( gemm_kernel )
|
||||
GENTPROT( float, s, gemm_kernel )
|
||||
GENTPROT( double, d, gemm_kernel )
|
||||
GENTPROT( scomplex, c, gemm_kernel )
|
||||
GENTPROT( dcomplex, z, gemm_kernel )
|
||||
|
||||
330
addon/gemmd/bao_l3_packm_a.c
Normal file
330
addon/gemmd/bao_l3_packm_a.c
Normal file
@@ -0,0 +1,330 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
dim_t mr, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
/* Set the pack buffer type so that we are obtaining memory blocks from
|
||||
the pool dedicated to blocks of A. */ \
|
||||
const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; \
|
||||
\
|
||||
/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
|
||||
we NEED that last micropanel to have the same ldim (cs_p) as the other
|
||||
micropanels. Why? Because the microkernel assumes that the register (MR,
|
||||
NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
|
||||
const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
|
||||
const dim_t k_pack = k; \
|
||||
\
|
||||
/* Barrier to make sure all threads are caught up and ready to begin the
|
||||
packm stage. */ \
|
||||
bli_thread_barrier( thread ); \
|
||||
\
|
||||
/* Compute the size of the memory block eneded. */ \
|
||||
siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
|
||||
\
|
||||
/* Check the mem_t entry provided by the caller. If it is unallocated,
|
||||
then we need to acquire a block from the packed block allocator. */ \
|
||||
if ( bli_mem_is_unalloc( mem ) ) \
|
||||
{ \
|
||||
if ( bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
/* Acquire directly to the chief thread's mem_t that was passed in.
|
||||
It needs to be that mem_t struct, and not a local (temporary)
|
||||
mem_t, since there is no barrier until after packing is finished,
|
||||
which could allow a race condition whereby the chief thread exits
|
||||
the current function before the other threads have a chance to
|
||||
copy from it. (A barrier would fix that race condition, but then
|
||||
again, I prefer to keep barriers to a minimum.) */ \
|
||||
bli_pba_acquire_m \
|
||||
( \
|
||||
rntm, \
|
||||
size_needed, \
|
||||
pack_buf_type, \
|
||||
mem \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
/* Broadcast the address of the chief thread's passed-in mem_t to all
|
||||
threads. */ \
|
||||
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
|
||||
\
|
||||
/* Non-chief threads: Copy the contents of the chief thread's
|
||||
passed-in mem_t to the passed-in mem_t for this thread. (The
|
||||
chief thread already has the mem_t, so it does not need to
|
||||
perform any copy.) */ \
|
||||
if ( !bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
*mem = *mem_p; \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_mem_is_alloc( mem ) ) */ \
|
||||
{ \
|
||||
/* If the mem_t entry provided by the caller does NOT contain a NULL
|
||||
buffer, then a block has already been acquired from the packed
|
||||
block allocator and cached by the caller. */ \
|
||||
\
|
||||
/* As a sanity check, we should make sure that the mem_t object isn't
|
||||
associated with a block that is too small compared to the size of
|
||||
the packed matrix buffer that is needed, according to the value
|
||||
computed above. */ \
|
||||
siz_t mem_size = bli_mem_size( mem ); \
|
||||
\
|
||||
if ( mem_size < size_needed ) \
|
||||
{ \
|
||||
if ( bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
/* The chief thread releases the existing block associated
|
||||
with the mem_t, and then re-acquires a new block, saving
|
||||
the associated mem_t to its passed-in mem_t. (See coment
|
||||
above for why the acquisition needs to be directly to
|
||||
the chief thread's passed-in mem_t and not a local
|
||||
(temporary) mem_t. */ \
|
||||
bli_pba_release \
|
||||
( \
|
||||
rntm, \
|
||||
mem \
|
||||
); \
|
||||
bli_pba_acquire_m \
|
||||
( \
|
||||
rntm, \
|
||||
size_needed, \
|
||||
pack_buf_type, \
|
||||
mem \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
/* Broadcast the address of the chief thread's passed-in mem_t
|
||||
to all threads. */ \
|
||||
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
|
||||
\
|
||||
/* Non-chief threads: Copy the contents of the chief thread's
|
||||
passed-in mem_t to the passed-in mem_t for this thread. (The
|
||||
chief thread already has the mem_t, so it does not need to
|
||||
perform any copy.) */ \
|
||||
if ( !bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
*mem = *mem_p; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* If the mem_t entry is already allocated and sufficiently large,
|
||||
then we use it as-is. No action is needed. */ \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_init_mem_a )
|
||||
GENTFUNC( float, s, packm_init_mem_a )
|
||||
GENTFUNC( double, d, packm_init_mem_a )
|
||||
GENTFUNC( scomplex, c, packm_init_mem_a )
|
||||
GENTFUNC( dcomplex, z, packm_init_mem_a )
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
if ( thread != NULL ) \
|
||||
if ( bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
/* Check the mem_t entry provided by the caller. Only proceed if it
|
||||
is allocated, which it should be. */ \
|
||||
if ( bli_mem_is_alloc( mem ) ) \
|
||||
{ \
|
||||
bli_pba_release \
|
||||
( \
|
||||
rntm, \
|
||||
mem \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a )
|
||||
GENTFUNC( float, s, packm_finalize_mem_a )
|
||||
GENTFUNC( double, d, packm_finalize_mem_a )
|
||||
GENTFUNC( scomplex, c, packm_finalize_mem_a )
|
||||
GENTFUNC( dcomplex, z, packm_finalize_mem_a )
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
pack_t* restrict schema, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
dim_t mr, \
|
||||
dim_t* restrict m_max, \
|
||||
dim_t* restrict k_max, \
|
||||
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||
mem_t* restrict mem \
|
||||
) \
|
||||
{ \
|
||||
/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
|
||||
we NEED that last micropanel to have the same ldim (cs_p) as the other
|
||||
micropanels. Why? Because the microkernel assumes that the register (MR,
|
||||
NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
|
||||
*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
|
||||
*k_max = k; \
|
||||
\
|
||||
/* Determine the dimensions and strides for the packed matrix A. */ \
|
||||
{ \
|
||||
/* Pack A to column-stored row-panels. */ \
|
||||
*rs_p = 1; \
|
||||
*cs_p = mr; \
|
||||
\
|
||||
*pd_p = mr; \
|
||||
*ps_p = mr * k; \
|
||||
\
|
||||
/* Set the schema to "packed row panels" to indicate packing to
|
||||
conventional column-stored row panels. */ \
|
||||
*schema = BLIS_PACKED_ROW_PANELS; \
|
||||
} \
|
||||
\
|
||||
/* Set the buffer address provided by the caller to point to the memory
|
||||
associated with the mem_t entry acquired from the memory pool. */ \
|
||||
*p = bli_mem_buffer( mem ); \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_init_a )
|
||||
GENTFUNC( float, s, packm_init_a )
|
||||
GENTFUNC( double, d, packm_init_a )
|
||||
GENTFUNC( scomplex, c, packm_init_a )
|
||||
GENTFUNC( dcomplex, z, packm_init_a )
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces to the variant chooser.
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
conj_t conj, \
|
||||
dim_t m_alloc, \
|
||||
dim_t k_alloc, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
dim_t mr, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict d, inc_t incd, \
|
||||
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
pack_t schema; \
|
||||
dim_t m_max; \
|
||||
dim_t k_max; \
|
||||
dim_t pd_p; \
|
||||
\
|
||||
/* Prepare the packing destination buffer. */ \
|
||||
PASTECH2(bao_,ch,packm_init_mem_a) \
|
||||
( \
|
||||
m_alloc, k_alloc, mr, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
mem, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix A. */ \
|
||||
PASTECH2(bao_,ch,packm_init_a) \
|
||||
( \
|
||||
&schema, \
|
||||
m, k, mr, \
|
||||
&m_max, &k_max, \
|
||||
p, rs_p, cs_p, \
|
||||
&pd_p, ps_p, \
|
||||
mem \
|
||||
); \
|
||||
\
|
||||
/* Pack matrix A to the destination buffer chosen above. Here, the packed
|
||||
matrix is stored to column-stored MR x k micropanels. */ \
|
||||
PASTECH2(bao_,ch,packm_var1) \
|
||||
( \
|
||||
conj, \
|
||||
schema, \
|
||||
m, \
|
||||
k, \
|
||||
m_max, \
|
||||
k_max, \
|
||||
kappa, \
|
||||
d, incd, \
|
||||
a, rs_a, cs_a, \
|
||||
*p, *rs_p, *cs_p, \
|
||||
pd_p, *ps_p, \
|
||||
cntx, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Barrier so that packing is done before computation. */ \
|
||||
bli_thread_barrier( thread ); \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_a )
|
||||
GENTFUNC( float, s, packm_a )
|
||||
GENTFUNC( double, d, packm_a )
|
||||
GENTFUNC( scomplex, c, packm_a )
|
||||
GENTFUNC( dcomplex, z, packm_a )
|
||||
|
||||
123
addon/gemmd/bao_l3_packm_a.h
Normal file
123
addon/gemmd/bao_l3_packm_a.h
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
dim_t mr, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_init_mem_a )
|
||||
GENTPROT( float, s, packm_init_mem_a )
|
||||
GENTPROT( double, d, packm_init_mem_a )
|
||||
GENTPROT( scomplex, c, packm_init_mem_a )
|
||||
GENTPROT( dcomplex, z, packm_init_mem_a )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a )
|
||||
GENTPROT( float, s, packm_finalize_mem_a )
|
||||
GENTPROT( double, d, packm_finalize_mem_a )
|
||||
GENTPROT( scomplex, c, packm_finalize_mem_a )
|
||||
GENTPROT( dcomplex, z, packm_finalize_mem_a )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
pack_t* restrict schema, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
dim_t mr, \
|
||||
dim_t* restrict m_max, \
|
||||
dim_t* restrict k_max, \
|
||||
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||
mem_t* restrict mem \
|
||||
); \
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_init_a )
|
||||
GENTPROT( float, s, packm_init_a )
|
||||
GENTPROT( double, d, packm_init_a )
|
||||
GENTPROT( scomplex, c, packm_init_a )
|
||||
GENTPROT( dcomplex, z, packm_init_a )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
conj_t conj, \
|
||||
dim_t m_alloc, \
|
||||
dim_t k_alloc, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
dim_t mr, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict d, inc_t incd, \
|
||||
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_a )
|
||||
GENTPROT( float, s, packm_a )
|
||||
GENTPROT( double, d, packm_a )
|
||||
GENTPROT( scomplex, c, packm_a )
|
||||
GENTPROT( dcomplex, z, packm_a )
|
||||
|
||||
330
addon/gemmd/bao_l3_packm_b.c
Normal file
330
addon/gemmd/bao_l3_packm_b.c
Normal file
@@ -0,0 +1,330 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
/* Set the pack buffer type so that we are obtaining memory blocks from
|
||||
the pool dedicated to panels of B. */ \
|
||||
const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_B_PANEL; \
|
||||
\
|
||||
/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
|
||||
we NEED that last micropanel to have the same ldim (cs_p) as the other
|
||||
micropanels. Why? Because the microkernel assumes that the register (MR,
|
||||
NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
|
||||
const dim_t k_pack = k; \
|
||||
const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
|
||||
\
|
||||
/* Barrier to make sure all threads are caught up and ready to begin the
|
||||
packm stage. */ \
|
||||
bli_thread_barrier( thread ); \
|
||||
\
|
||||
/* Compute the size of the memory block eneded. */ \
|
||||
siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
|
||||
\
|
||||
/* Check the mem_t entry provided by the caller. If it is unallocated,
|
||||
then we need to acquire a block from the packed block allocator. */ \
|
||||
if ( bli_mem_is_unalloc( mem ) ) \
|
||||
{ \
|
||||
if ( bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
/* Acquire directly to the chief thread's mem_t that was passed in.
|
||||
It needs to be that mem_t struct, and not a local (temporary)
|
||||
mem_t, since there is no barrier until after packing is finished,
|
||||
which could allow a race condition whereby the chief thread exits
|
||||
the current function before the other threads have a chance to
|
||||
copy from it. (A barrier would fix that race condition, but then
|
||||
again, I prefer to keep barriers to a minimum.) */ \
|
||||
bli_pba_acquire_m \
|
||||
( \
|
||||
rntm, \
|
||||
size_needed, \
|
||||
pack_buf_type, \
|
||||
mem \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
/* Broadcast the address of the chief thread's passed-in mem_t to all
|
||||
threads. */ \
|
||||
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
|
||||
\
|
||||
/* Non-chief threads: Copy the contents of the chief thread's
|
||||
passed-in mem_t to the passed-in mem_t for this thread. (The
|
||||
chief thread already has the mem_t, so it does not need to
|
||||
perform any copy.) */ \
|
||||
if ( !bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
*mem = *mem_p; \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_mem_is_alloc( mem ) ) */ \
|
||||
{ \
|
||||
/* If the mem_t entry provided by the caller does NOT contain a NULL
|
||||
buffer, then a block has already been acquired from the packed
|
||||
block allocator and cached by the caller. */ \
|
||||
\
|
||||
/* As a sanity check, we should make sure that the mem_t object isn't
|
||||
associated with a block that is too small compared to the size of
|
||||
the packed matrix buffer that is needed, according to the value
|
||||
computed above. */ \
|
||||
siz_t mem_size = bli_mem_size( mem ); \
|
||||
\
|
||||
if ( mem_size < size_needed ) \
|
||||
{ \
|
||||
if ( bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
/* The chief thread releases the existing block associated
|
||||
with the mem_t, and then re-acquires a new block, saving
|
||||
the associated mem_t to its passed-in mem_t. (See coment
|
||||
above for why the acquisition needs to be directly to
|
||||
the chief thread's passed-in mem_t and not a local
|
||||
(temporary) mem_t. */ \
|
||||
bli_pba_release \
|
||||
( \
|
||||
rntm, \
|
||||
mem \
|
||||
); \
|
||||
bli_pba_acquire_m \
|
||||
( \
|
||||
rntm, \
|
||||
size_needed, \
|
||||
pack_buf_type, \
|
||||
mem \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
/* Broadcast the address of the chief thread's passed-in mem_t
|
||||
to all threads. */ \
|
||||
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
|
||||
\
|
||||
/* Non-chief threads: Copy the contents of the chief thread's
|
||||
passed-in mem_t to the passed-in mem_t for this thread. (The
|
||||
chief thread already has the mem_t, so it does not need to
|
||||
perform any copy.) */ \
|
||||
if ( !bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
*mem = *mem_p; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* If the mem_t entry is already allocated and sufficiently large,
|
||||
then we use it as-is. No action is needed. */ \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_init_mem_b )
|
||||
GENTFUNC( float, s, packm_init_mem_b )
|
||||
GENTFUNC( double, d, packm_init_mem_b )
|
||||
GENTFUNC( scomplex, c, packm_init_mem_b )
|
||||
GENTFUNC( dcomplex, z, packm_init_mem_b )
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
if ( thread != NULL ) \
|
||||
if ( bli_thread_am_ochief( thread ) ) \
|
||||
{ \
|
||||
/* Check the mem_t entry provided by the caller. Only proceed if it
|
||||
is allocated, which it should be. */ \
|
||||
if ( bli_mem_is_alloc( mem ) ) \
|
||||
{ \
|
||||
bli_pba_release \
|
||||
( \
|
||||
rntm, \
|
||||
mem \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b )
|
||||
GENTFUNC( float, s, packm_finalize_mem_b )
|
||||
GENTFUNC( double, d, packm_finalize_mem_b )
|
||||
GENTFUNC( scomplex, c, packm_finalize_mem_b )
|
||||
GENTFUNC( dcomplex, z, packm_finalize_mem_b )
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
pack_t* restrict schema, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
dim_t* restrict k_max, \
|
||||
dim_t* restrict n_max, \
|
||||
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||
mem_t* restrict mem \
|
||||
) \
|
||||
{ \
|
||||
/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
|
||||
we NEED that last micropanel to have the same ldim (cs_p) as the other
|
||||
micropanels. Why? Because the microkernel assumes that the register (MR,
|
||||
NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
|
||||
*k_max = k; \
|
||||
*n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
|
||||
\
|
||||
/* Determine the dimensions and strides for the packed matrix B. */ \
|
||||
{ \
|
||||
/* Pack B to row-stored column-panels. */ \
|
||||
*rs_p = nr; \
|
||||
*cs_p = 1; \
|
||||
\
|
||||
*pd_p = nr; \
|
||||
*ps_p = k * nr; \
|
||||
\
|
||||
/* Set the schema to "packed column panels" to indicate packing to
|
||||
conventional row-stored column panels. */ \
|
||||
*schema = BLIS_PACKED_COL_PANELS; \
|
||||
} \
|
||||
\
|
||||
/* Set the buffer address provided by the caller to point to the memory
|
||||
associated with the mem_t entry acquired from the memory pool. */ \
|
||||
*p = bli_mem_buffer( mem ); \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_init_b )
|
||||
GENTFUNC( float, s, packm_init_b )
|
||||
GENTFUNC( double, d, packm_init_b )
|
||||
GENTFUNC( scomplex, c, packm_init_b )
|
||||
GENTFUNC( dcomplex, z, packm_init_b )
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces to the variant chooser.
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
conj_t conj, \
|
||||
dim_t k_alloc, \
|
||||
dim_t n_alloc, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict d, inc_t incd, \
|
||||
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
pack_t schema; \
|
||||
dim_t k_max; \
|
||||
dim_t n_max; \
|
||||
dim_t pd_p; \
|
||||
\
|
||||
/* Prepare the packing destination buffer. */ \
|
||||
PASTECH2(bao_,ch,packm_init_mem_b) \
|
||||
( \
|
||||
k_alloc, n_alloc, nr, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
mem, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix B. */ \
|
||||
PASTECH2(bao_,ch,packm_init_b) \
|
||||
( \
|
||||
&schema, \
|
||||
k, n, nr, \
|
||||
&k_max, &n_max, \
|
||||
p, rs_p, cs_p, \
|
||||
&pd_p, ps_p, \
|
||||
mem \
|
||||
); \
|
||||
\
|
||||
/* Pack matrix B to the destination buffer chosen above. Here, the packed
|
||||
matrix is stored to row-stored k x NR micropanels. */ \
|
||||
PASTECH2(bao_,ch,packm_var1) \
|
||||
( \
|
||||
conj, \
|
||||
schema, \
|
||||
k, \
|
||||
n, \
|
||||
k_max, \
|
||||
n_max, \
|
||||
kappa, \
|
||||
d, incd, \
|
||||
b, rs_b, cs_b, \
|
||||
*p, *rs_p, *cs_p, \
|
||||
pd_p, *ps_p, \
|
||||
cntx, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Barrier so that packing is done before computation. */ \
|
||||
bli_thread_barrier( thread ); \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_b )
|
||||
GENTFUNC( float, s, packm_b )
|
||||
GENTFUNC( double, d, packm_b )
|
||||
GENTFUNC( scomplex, c, packm_b )
|
||||
GENTFUNC( dcomplex, z, packm_b )
|
||||
|
||||
123
addon/gemmd/bao_l3_packm_b.h
Normal file
123
addon/gemmd/bao_l3_packm_b.h
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_init_mem_b )
|
||||
GENTPROT( float, s, packm_init_mem_b )
|
||||
GENTPROT( double, d, packm_init_mem_b )
|
||||
GENTPROT( scomplex, c, packm_init_mem_b )
|
||||
GENTPROT( dcomplex, z, packm_init_mem_b )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b )
|
||||
GENTPROT( float, s, packm_finalize_mem_b )
|
||||
GENTPROT( double, d, packm_finalize_mem_b )
|
||||
GENTPROT( scomplex, c, packm_finalize_mem_b )
|
||||
GENTPROT( dcomplex, z, packm_finalize_mem_b )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
pack_t* restrict schema, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
dim_t* restrict k_max, \
|
||||
dim_t* restrict n_max, \
|
||||
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||
mem_t* restrict mem \
|
||||
); \
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_init_b )
|
||||
GENTPROT( float, s, packm_init_b )
|
||||
GENTPROT( double, d, packm_init_b )
|
||||
GENTPROT( scomplex, c, packm_init_b )
|
||||
GENTPROT( dcomplex, z, packm_init_b )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
conj_t conj, \
|
||||
dim_t k_alloc, \
|
||||
dim_t n_alloc, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict d, inc_t incd, \
|
||||
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_b )
|
||||
GENTPROT( float, s, packm_b )
|
||||
GENTPROT( double, d, packm_b )
|
||||
GENTPROT( scomplex, c, packm_b )
|
||||
GENTPROT( dcomplex, z, packm_b )
|
||||
|
||||
69
addon/gemmd/bao_l3_packm_var.h
Normal file
69
addon/gemmd/bao_l3_packm_var.h
Normal file
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces to the variants.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,varname) \
|
||||
( \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict d, inc_t incd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
thrinfo_t* restrict thread \
|
||||
);
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_var1 )
|
||||
GENTPROT( float, s, packm_var1 )
|
||||
GENTPROT( double, d, packm_var1 )
|
||||
GENTPROT( scomplex, c, packm_var1 )
|
||||
GENTPROT( dcomplex, z, packm_var1 )
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_var2 )
|
||||
GENTPROT( float, s, packm_var2 )
|
||||
GENTPROT( double, d, packm_var2 )
|
||||
GENTPROT( scomplex, c, packm_var2 )
|
||||
GENTPROT( dcomplex, z, packm_var2 )
|
||||
195
addon/gemmd/bao_l3_packm_var1.c
Normal file
195
addon/gemmd/bao_l3_packm_var1.c
Normal file
@@ -0,0 +1,195 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
//
|
||||
// Variant 1 provides basic support for packing by calling packm_cxk().
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,varname) \
|
||||
( \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict d, inc_t incd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
\
|
||||
dim_t iter_dim; \
|
||||
dim_t n_iter; \
|
||||
dim_t it, ic; \
|
||||
dim_t ic0; \
|
||||
doff_t ic_inc; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_dim_max; \
|
||||
inc_t incc; \
|
||||
inc_t ldc; \
|
||||
inc_t ldp; \
|
||||
conj_t conjc; \
|
||||
\
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
bool row_stored = bli_is_col_packed( schema ); \
|
||||
/*bool col_stored = bli_is_row_packed( schema );*/ \
|
||||
\
|
||||
/* If the row storage flag indicates row storage, then we are packing
|
||||
to column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( row_stored ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( col_stored ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
\
|
||||
/* Set the initial values and increments for indices related to C and P
|
||||
based on whether reverse iteration was requested. */ \
|
||||
{ \
|
||||
ic0 = 0; \
|
||||
ic_inc = panel_dim_max; \
|
||||
} \
|
||||
\
|
||||
ctype* restrict p_begin = p_cast; \
|
||||
\
|
||||
/* Query the number of threads and thread ids from the current thread's
|
||||
packm thrinfo_t node. */ \
|
||||
const dim_t nt = bli_thread_n_way( thread ); \
|
||||
const dim_t tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
|
||||
( void )nt; \
|
||||
( void )tid; \
|
||||
\
|
||||
dim_t it_start, it_end, it_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment using the current thread's
|
||||
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
|
||||
will depend on whether slab or round-robin partitioning was requested
|
||||
at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
|
||||
\
|
||||
/* Iterate over every logical micropanel in the source matrix. */ \
|
||||
for ( ic = ic0, it = 0; it < n_iter; \
|
||||
ic += ic_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
ctype* restrict c_begin = c_cast + (ic )*incc; \
|
||||
\
|
||||
ctype* restrict c_use = c_begin; \
|
||||
ctype* restrict p_use = p_begin; \
|
||||
\
|
||||
/* The definition of bli_packm_my_iter() will depend on whether slab
|
||||
or round-robin partitioning was requested at configure-time. (The
|
||||
default is slab.) */ \
|
||||
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
PASTECH2(bao_,ch,packm_cxk) \
|
||||
( \
|
||||
conjc, \
|
||||
schema, \
|
||||
panel_dim, \
|
||||
panel_dim_max, \
|
||||
panel_len, \
|
||||
panel_len_max, \
|
||||
kappa_cast, \
|
||||
d, incd, \
|
||||
c_use, incc, ldc, \
|
||||
p_use, ldp, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
if ( !row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
else \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
p_begin += ps_p; \
|
||||
} \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_var1 )
|
||||
GENTFUNC( float, s, packm_var1 )
|
||||
GENTFUNC( double, d, packm_var1 )
|
||||
GENTFUNC( scomplex, c, packm_var1 )
|
||||
GENTFUNC( dcomplex, z, packm_var1 )
|
||||
|
||||
245
addon/gemmd/bao_l3_packm_var2.c
Normal file
245
addon/gemmd/bao_l3_packm_var2.c
Normal file
@@ -0,0 +1,245 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
//
|
||||
// Variant 2 is similar to variant 1, but inlines the contents of packm_cxk().
|
||||
//
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,varname) \
|
||||
( \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict d, inc_t incd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
\
|
||||
dim_t iter_dim; \
|
||||
dim_t n_iter; \
|
||||
dim_t it, ic; \
|
||||
dim_t ic0; \
|
||||
doff_t ic_inc; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_dim_max; \
|
||||
inc_t incc; \
|
||||
inc_t ldc; \
|
||||
inc_t ldp; \
|
||||
conj_t conjc; \
|
||||
\
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
bool row_stored = bli_is_col_packed( schema ); \
|
||||
/*bool col_stored = bli_is_row_packed( schema );*/ \
|
||||
\
|
||||
/* If the row storage flag indicates row storage, then we are packing
|
||||
to column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( row_stored ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( col_stored ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
\
|
||||
/* Set the initial values and increments for indices related to C and P
|
||||
based on whether reverse iteration was requested. */ \
|
||||
{ \
|
||||
ic0 = 0; \
|
||||
ic_inc = panel_dim_max; \
|
||||
} \
|
||||
\
|
||||
ctype* restrict p_begin = p_cast; \
|
||||
\
|
||||
/* Query the number of threads and thread ids from the current thread's
|
||||
packm thrinfo_t node. */ \
|
||||
const dim_t nt = bli_thread_n_way( thread ); \
|
||||
const dim_t tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
|
||||
( void )nt; \
|
||||
( void )tid; \
|
||||
\
|
||||
dim_t it_start, it_end, it_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment using the current thread's
|
||||
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
|
||||
will depend on whether slab or round-robin partitioning was requested
|
||||
at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
|
||||
\
|
||||
/* Iterate over every logical micropanel in the source matrix. */ \
|
||||
for ( ic = ic0, it = 0; it < n_iter; \
|
||||
ic += ic_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
ctype* restrict c_begin = c_cast + (ic )*incc; \
|
||||
\
|
||||
ctype* restrict c_use = c_begin; \
|
||||
ctype* restrict p_use = p_begin; \
|
||||
\
|
||||
/* The definition of bli_packm_my_iter() will depend on whether slab
|
||||
or round-robin partitioning was requested at configure-time. (The
|
||||
default is slab.) */ \
|
||||
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
/* NOTE: We assume here that kappa = 1 and therefore ignore it. If
|
||||
we're wrong, this will get someone's attention. */ \
|
||||
if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) \
|
||||
bli_abort(); \
|
||||
\
|
||||
/* Perform the packing, taking conjc into account. */ \
|
||||
if ( bli_is_conj( conjc ) ) \
|
||||
{ \
|
||||
for ( dim_t l = 0; l < panel_len; ++l ) \
|
||||
{ \
|
||||
for ( dim_t d = 0; d < panel_dim; ++d ) \
|
||||
{ \
|
||||
ctype* cld = c_use + (l )*ldc + (d )*incc; \
|
||||
ctype* pld = p_use + (l )*ldp + (d )*1; \
|
||||
\
|
||||
PASTEMAC(ch,copyjs)( *cld, *pld ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( dim_t l = 0; l < panel_len; ++l ) \
|
||||
{ \
|
||||
for ( dim_t d = 0; d < panel_dim; ++d ) \
|
||||
{ \
|
||||
ctype* cld = c_use + (l )*ldc + (d )*incc; \
|
||||
ctype* pld = p_use + (l )*ldp + (d )*1; \
|
||||
\
|
||||
PASTEMAC(ch,copys)( *cld, *pld ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
|
||||
if ( panel_dim < panel_dim_max ) \
|
||||
{ \
|
||||
const dim_t i = panel_dim; \
|
||||
const dim_t m_edge = panel_dim_max - panel_dim; \
|
||||
const dim_t n_edge = panel_len_max; \
|
||||
ctype* restrict p_edge = p_use + (i )*1; \
|
||||
\
|
||||
PASTEMAC(ch,set0s_mxn) \
|
||||
( \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
p_edge, 1, ldp \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
/* If panel_len < panel_len_max, then we zero those unused columns. */ \
|
||||
if ( panel_len < panel_len_max ) \
|
||||
{ \
|
||||
const dim_t j = panel_len; \
|
||||
const dim_t m_edge = panel_dim_max; \
|
||||
const dim_t n_edge = panel_len_max - panel_len; \
|
||||
ctype* restrict p_edge = p_use + (j )*ldp; \
|
||||
\
|
||||
PASTEMAC(ch,set0s_mxn) \
|
||||
( \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
p_edge, 1, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
if ( !row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
else \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
p_begin += ps_p; \
|
||||
} \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_var1 )
|
||||
GENTFUNC( float, s, packm_var2 )
|
||||
GENTFUNC( double, d, packm_var2 )
|
||||
GENTFUNC( scomplex, c, packm_var2 )
|
||||
GENTFUNC( dcomplex, z, packm_var2 )
|
||||
|
||||
199
addon/gemmd/bao_packm_cxk.c
Normal file
199
addon/gemmd/bao_packm_cxk.c
Normal file
@@ -0,0 +1,199 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,opname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_len_max, \
|
||||
ctype* kappa, \
|
||||
ctype* d, inc_t incd, \
|
||||
ctype* a, inc_t inca, inc_t lda, \
|
||||
ctype* p, inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
/* Note that we use panel_dim_max, not panel_dim, to query the packm
|
||||
kernel function pointer. This means that we always use the same
|
||||
kernel, even for edge cases. */ \
|
||||
num_t dt = PASTEMAC(ch,type); \
|
||||
l1mkr_t ker_id = panel_dim_max; \
|
||||
\
|
||||
PASTECH2(ch,opname,_ker_ft) f; \
|
||||
\
|
||||
/* Query the context for the packm kernel corresponding to the current
|
||||
panel dimension, or kernel id. If the id is invalid, the function will
|
||||
return NULL. */ \
|
||||
f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
|
||||
\
|
||||
/* If there exists a kernel implementation for the micro-panel dimension
|
||||
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
|
||||
/* NOTE: We've disabled calling packm micro-kernels from the context for
|
||||
this implementation. To re-enable, change FALSE to TRUE in the
|
||||
conditional below. */ \
|
||||
if ( f != NULL && FALSE ) \
|
||||
{ \
|
||||
f \
|
||||
( \
|
||||
conja, \
|
||||
schema, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
panel_len_max, \
|
||||
kappa, \
|
||||
a, inca, lda, \
|
||||
p, ldp, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* NOTE: We assume here that kappa = 1 and therefore ignore it. If
|
||||
we're wrong, this will get someone's attention. */ \
|
||||
if ( !PASTEMAC(ch,eq1)( *kappa ) ) \
|
||||
bli_abort(); \
|
||||
\
|
||||
if ( d == NULL ) \
|
||||
{ \
|
||||
/* Perform the packing, taking conja into account. */ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( dim_t l = 0; l < panel_len; ++l ) \
|
||||
{ \
|
||||
for ( dim_t i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype* ali = a + (l )*lda + (i )*inca; \
|
||||
ctype* pli = p + (l )*ldp + (i )*1; \
|
||||
\
|
||||
PASTEMAC(ch,copyjs)( *ali, *pli ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( dim_t l = 0; l < panel_len; ++l ) \
|
||||
{ \
|
||||
for ( dim_t i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype* ali = a + (l )*lda + (i )*inca; \
|
||||
ctype* pli = p + (l )*ldp + (i )*1; \
|
||||
\
|
||||
PASTEMAC(ch,copys)( *ali, *pli ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( d != NULL ) */ \
|
||||
{ \
|
||||
/* Perform the packing, taking conja into account. */ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( dim_t l = 0; l < panel_len; ++l ) \
|
||||
{ \
|
||||
for ( dim_t i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype* ali = a + (l )*lda + (i )*inca; \
|
||||
ctype* dl = d + (l )*incd; \
|
||||
ctype* pli = p + (l )*ldp + (i )*1; \
|
||||
\
|
||||
/* Note that ali must be the second operand here since
|
||||
that is what is conjugated by scal2js. */ \
|
||||
PASTEMAC(ch,scal2js)( *dl, *ali, *pli ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( dim_t l = 0; l < panel_len; ++l ) \
|
||||
{ \
|
||||
for ( dim_t i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype* ali = a + (l )*lda + (i )*inca; \
|
||||
ctype* dl = d + (l )*incd; \
|
||||
ctype* pli = p + (l )*ldp + (i )*1; \
|
||||
\
|
||||
PASTEMAC(ch,scal2s)( *ali, *dl, *pli ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
|
||||
if ( panel_dim < panel_dim_max ) \
|
||||
{ \
|
||||
const dim_t i = panel_dim; \
|
||||
const dim_t m_edge = panel_dim_max - panel_dim; \
|
||||
const dim_t n_edge = panel_len_max; \
|
||||
ctype* restrict p_edge = p + (i )*1; \
|
||||
\
|
||||
PASTEMAC(ch,set0s_mxn) \
|
||||
( \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
p_edge, 1, ldp \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
/* If panel_len < panel_len_max, then we zero those unused columns. */ \
|
||||
if ( panel_len < panel_len_max ) \
|
||||
{ \
|
||||
const dim_t j = panel_len; \
|
||||
const dim_t m_edge = panel_dim_max; \
|
||||
const dim_t n_edge = panel_len_max - panel_len; \
|
||||
ctype* restrict p_edge = p + (j )*ldp; \
|
||||
\
|
||||
PASTEMAC(ch,set0s_mxn) \
|
||||
( \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
p_edge, 1, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
//INSERT_GENTFUNC_BASIC0( packm_cxk )
|
||||
GENTFUNC( float, s, packm_cxk )
|
||||
GENTFUNC( double, d, packm_cxk )
|
||||
GENTFUNC( scomplex, c, packm_cxk )
|
||||
GENTFUNC( dcomplex, z, packm_cxk )
|
||||
|
||||
59
addon/gemmd/bao_packm_cxk.h
Normal file
59
addon/gemmd/bao_packm_cxk.h
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTECH2(bao_,ch,varname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
pack_t schema, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_len_max, \
|
||||
ctype* kappa, \
|
||||
ctype* d, inc_t incd, \
|
||||
ctype* a, inc_t inca, inc_t lda, \
|
||||
ctype* p, inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
//INSERT_GENTPROT_BASIC0( packm_cxk )
|
||||
GENTPROT( float, s, packm_cxk )
|
||||
GENTPROT( double, d, packm_cxk )
|
||||
GENTPROT( scomplex, c, packm_cxk )
|
||||
GENTPROT( dcomplex, z, packm_cxk )
|
||||
|
||||
54
addon/gemmd/gemmd.h
Normal file
54
addon/gemmd/gemmd.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef GEMMD_H
|
||||
#define GEMMD_H
|
||||
|
||||
// This header should contain (or #include) any definitions that must be
|
||||
// folded into blis.h.
|
||||
|
||||
#include "bao_gemmd.h"
|
||||
#include "bao_gemmd_check.h"
|
||||
#include "bao_gemmd_var.h"
|
||||
|
||||
#include "bao_l3_packm_a.h"
|
||||
#include "bao_l3_packm_b.h"
|
||||
#include "bao_l3_packm_var.h"
|
||||
|
||||
#include "bao_packm_cxk.h"
|
||||
|
||||
#include "bao_l3_decor.h"
|
||||
|
||||
|
||||
#endif
|
||||
75
addon/gemmd/thread/bao_l3_decor.h
Normal file
75
addon/gemmd/thread/bao_l3_decor.h
Normal file
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SBX_L3_DECOR_H
|
||||
#define BLIS_SBX_L3_DECOR_H
|
||||
|
||||
// -- sup definitions ----------------------------------------------------------
|
||||
|
||||
// Level-3 sup internal function type.
|
||||
typedef void (*l3sbxint_t)
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// Level-3 sup thread decorator prototype.
|
||||
void bao_l3_thread_decorator
|
||||
(
|
||||
l3sbxint_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
// Include definitions specific to the method of multithreading.
|
||||
#include "bao_l3_decor_single.h"
|
||||
#include "bao_l3_decor_openmp.h"
|
||||
#include "bao_l3_decor_pthreads.h"
|
||||
|
||||
#endif
|
||||
|
||||
140
addon/gemmd/thread/bao_l3_decor_openmp.c
Normal file
140
addon/gemmd/thread/bao_l3_decor_openmp.c
Normal file
@@ -0,0 +1,140 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
|
||||
// Stub thread entry function. The real entry function is only needed by
// the pthreads implementation; this placeholder exists so that Windows DLL
// builds (with OpenMP enabled or with no multithreading) do not risk an
// unresolved symbol. It ignores its argument and always returns NULL.
void* bao_l3_thread_entry( void* data_void )
{
	return NULL;
}
|
||||
|
||||
//#define PRINT_THRINFO
|
||||
|
||||
void bao_l3_thread_decorator
|
||||
(
|
||||
l3sbxint_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
// Query the total number of threads from the rntm_t object.
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we have the rntm_t.sba_pool field
|
||||
// initialized and ready for the global communicator creation below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm. This will be
|
||||
// inherited by all of the child threads when they make local copies of
|
||||
// the rntm below.
|
||||
bli_pba_rntm_set_pba( rntm );
|
||||
|
||||
// Allcoate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
|
||||
|
||||
_Pragma( "omp parallel num_threads(n_threads)" )
|
||||
{
|
||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||
// necessary since we want each thread to be able to track its own
|
||||
// small block pool_t as it executes down the function stack.
|
||||
rntm_t rntm_l = *rntm;
|
||||
rntm_t* restrict rntm_p = &rntm_l;
|
||||
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
// NOTE: This calls the same function used for the conventional/large
|
||||
// code path.
|
||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
thrinfo_t* thread = NULL;
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
d,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
thread
|
||||
);
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_free( rntm_p, thread );
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called from the thread entry function).
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
44
addon/gemmd/thread/bao_l3_decor_openmp.h
Normal file
44
addon/gemmd/thread/bao_l3_decor_openmp.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SBX_L3_DECOR_OPENMP_H
|
||||
#define BLIS_SBX_L3_DECOR_OPENMP_H
|
||||
|
||||
// Definitions specific to situations when OpenMP multithreading is enabled.
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
220
addon/gemmd/thread/bao_l3_decor_pthreads.c
Normal file
220
addon/gemmd/thread/bao_l3_decor_pthreads.c
Normal file
@@ -0,0 +1,220 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
|
||||
// A data structure to assist in passing operands to additional threads.
|
||||
typedef struct thread_data
|
||||
{
|
||||
l3sbxint_t func;
|
||||
opid_t family;
|
||||
obj_t* alpha;
|
||||
obj_t* a;
|
||||
obj_t* d;
|
||||
obj_t* b;
|
||||
obj_t* beta;
|
||||
obj_t* c;
|
||||
cntx_t* cntx;
|
||||
rntm_t* rntm;
|
||||
dim_t tid;
|
||||
thrcomm_t* gl_comm;
|
||||
array_t* array;
|
||||
} thread_data_t;
|
||||
|
||||
// Entry point function for additional threads.
|
||||
void* bao_l3_thread_entry( void* data_void )
|
||||
{
|
||||
thread_data_t* data = data_void;
|
||||
|
||||
l3sbxint_t func = data->func;
|
||||
opid_t family = data->family;
|
||||
obj_t* alpha = data->alpha;
|
||||
obj_t* a = data->a;
|
||||
obj_t* d = data->d;
|
||||
obj_t* b = data->b;
|
||||
obj_t* beta = data->beta;
|
||||
obj_t* c = data->c;
|
||||
cntx_t* cntx = data->cntx;
|
||||
rntm_t* rntm = data->rntm;
|
||||
dim_t tid = data->tid;
|
||||
array_t* array = data->array;
|
||||
thrcomm_t* gl_comm = data->gl_comm;
|
||||
|
||||
( void )family;
|
||||
|
||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||
// necessary since we want each thread to be able to track its own
|
||||
// small block pool_t as it executes down the function stack.
|
||||
rntm_t rntm_l = *rntm;
|
||||
rntm_t* restrict rntm_p = &rntm_l;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
thrinfo_t* thread = NULL;
|
||||
|
||||
// Create the root node of the current thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
d,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
thread
|
||||
);
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_free( rntm_p, thread );
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void bao_l3_thread_decorator
|
||||
(
|
||||
l3sbxint_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
err_t r_val;
|
||||
|
||||
// Query the total number of threads from the context.
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we have the rntm_t.sba_pool field
|
||||
// initialized and ready for the global communicator creation below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm. This will be
|
||||
// inherited by all of the child threads when they make local copies of
|
||||
// the rntm below.
|
||||
bli_pba_rntm_set_pba( rntm );
|
||||
|
||||
// Allocate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
|
||||
// Allocate an array of pthread objects and auxiliary data structs to pass
|
||||
// to the thread entry functions.
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val );
|
||||
|
||||
// NOTE: We must iterate backwards so that the chief thread (thread id 0)
|
||||
// can spawn all other threads before proceeding with its own computation.
|
||||
for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
|
||||
{
|
||||
// Set up thread data for additional threads (beyond thread 0).
|
||||
datas[tid].func = func;
|
||||
datas[tid].family = family;
|
||||
datas[tid].alpha = alpha;
|
||||
datas[tid].a = a;
|
||||
datas[tid].d = d;
|
||||
datas[tid].b = b;
|
||||
datas[tid].beta = beta;
|
||||
datas[tid].c = c;
|
||||
datas[tid].cntx = cntx;
|
||||
datas[tid].rntm = rntm;
|
||||
datas[tid].tid = tid;
|
||||
datas[tid].gl_comm = gl_comm;
|
||||
datas[tid].array = array;
|
||||
|
||||
// Spawn additional threads for ids greater than 1.
|
||||
if ( tid != 0 )
|
||||
bli_pthread_create( &pthreads[tid], NULL, &bao_l3_thread_entry, &datas[tid] );
|
||||
else
|
||||
bao_l3_thread_entry( ( void* )(&datas[0]) );
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called from the thread entry function).
|
||||
|
||||
// Thread 0 waits for additional threads to finish.
|
||||
for ( dim_t tid = 1; tid < n_threads; tid++ )
|
||||
{
|
||||
bli_pthread_join( pthreads[tid], NULL );
|
||||
}
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_free_intl( pthreads );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_free_intl( datas );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
47
addon/gemmd/thread/bao_l3_decor_pthreads.h
Normal file
47
addon/gemmd/thread/bao_l3_decor_pthreads.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H
|
||||
#define BLIS_SBX_L3_DECOR_PTHREADS_H
|
||||
|
||||
// Definitions specific to situations when POSIX multithreading is enabled.
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
|
||||
// Thread entry point prototype.
|
||||
void* bao_l3_thread_entry( void* data_void );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
143
addon/gemmd/thread/bao_l3_decor_single.c
Normal file
143
addon/gemmd/thread/bao_l3_decor_single.c
Normal file
@@ -0,0 +1,143 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
#define SKIP_THRINFO_TREE
|
||||
|
||||
void bao_l3_thread_decorator
|
||||
(
|
||||
l3sbxint_t func,
|
||||
opid_t family,
|
||||
//pack_t schema_a,
|
||||
//pack_t schema_b,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* d,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
// For sequential execution, we use only one thread.
|
||||
const dim_t n_threads = 1;
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm.
|
||||
bli_pba_rntm_set_pba( rntm );
|
||||
|
||||
#ifndef SKIP_THRINFO_TREE
|
||||
// Allocate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
#endif
|
||||
|
||||
|
||||
{
|
||||
// NOTE: We don't need to create another copy of the rntm_t since
|
||||
// it was already copied in one of the high-level oapi functions.
|
||||
rntm_t* restrict rntm_p = rntm;
|
||||
|
||||
// There is only one thread id (for the chief thread).
|
||||
const dim_t tid = 0;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
// NOTE: This is commented out because, in the single-threaded case,
|
||||
// this is redundant since it's already been done above.
|
||||
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
#ifndef SKIP_THRINFO_TREE
|
||||
thrinfo_t* thread = NULL;
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
|
||||
#else
|
||||
// This optimization allows us to use one of the global thrinfo_t
|
||||
// objects for single-threaded execution rather than grow one from
|
||||
// scratch. The key is that bli_thrinfo_sup_grow(), which is called
|
||||
// from within the variants, will immediately return if it detects
|
||||
// that the thrinfo_t* passed into it is either
|
||||
// &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED.
|
||||
thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED;
|
||||
|
||||
( void )tid;
|
||||
#endif
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
d,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
thread
|
||||
);
|
||||
|
||||
#ifndef SKIP_THRINFO_TREE
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_free( rntm_p, thread );
|
||||
#endif
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called above).
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
44
addon/gemmd/thread/bao_l3_decor_single.h
Normal file
44
addon/gemmd/thread/bao_l3_decor_single.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_SBX_L3_DECOR_SINGLE_H
|
||||
#define BLIS_SBX_L3_DECOR_SINGLE_H
|
||||
|
||||
// Definitions specific to situations when multithreading is disabled.
|
||||
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
47
build/bli_addon.h.in
Normal file
47
build/bli_addon.h.in
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_ADDON_H
|
||||
#define BLIS_ADDON_H
|
||||
|
||||
#if @enable_addons@
|
||||
#define BLIS_ENABLE_ADDONS
|
||||
#else
|
||||
#define BLIS_DISABLE_ADDONS
|
||||
#endif
|
||||
|
||||
// Enabled addons
|
||||
@addon_list_includes@
|
||||
|
||||
#endif
|
||||
@@ -183,6 +183,10 @@ MK_ENABLE_CBLAS := @enable_cblas@
|
||||
# Whether libblis will depend on libmemkind for certain memory allocations.
|
||||
MK_ENABLE_MEMKIND := @enable_memkind@
|
||||
|
||||
# The names of the addons to include when building BLIS. If empty, no addons
|
||||
# will be included.
|
||||
ADDON_LIST := @addon_list@
|
||||
|
||||
# The name of a sandbox defining an alternative gemm implementation. If empty,
|
||||
# no sandbox will be used and the conventional gemm implementation will remain
|
||||
# enabled.
|
||||
|
||||
126
common.mk
126
common.mk
@@ -161,18 +161,35 @@ get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
|
||||
|
||||
# When compiling sandboxes, we use flags similar to those of general framework
|
||||
# source. This ensures that the same code can be linked and run across various
|
||||
# sub-configurations. (If we switch to using refkern/kernel flags, we should
|
||||
# prevent enabling sandboxes for umbrella families by verifying that
|
||||
# config_list == config_name if --enable-sandbox is given.)
|
||||
# sub-configurations.
|
||||
get-addon-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
||||
$(call get-noopt-cflags-for,$(1)) \
|
||||
$(CADDONINCFLAGS) \
|
||||
$(BUILD_CPPFLAGS) \
|
||||
$(BUILD_SYMFLAGS) \
|
||||
)
|
||||
get-addon-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
||||
$(call get-noopt-cxxflags-for,$(1)) \
|
||||
$(CADDONINCFLAGS) \
|
||||
$(BUILD_CPPFLAGS) \
|
||||
$(BUILD_SYMFLAGS) \
|
||||
)
|
||||
|
||||
# When compiling sandboxes, we use flags similar to those of general framework
|
||||
# source. This ensures that the same code can be linked and run across various
|
||||
# sub-configurations. (NOTE: If we ever switch to using refkernel or kernel
|
||||
# flags, we should prevent enabling sandboxes for umbrella families by verifying
|
||||
# that config_list == config_name if --enable-sandbox is given. THIS ALSO
|
||||
# APPLIES TO ADDONS ABOVE.)
|
||||
get-sandbox-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
||||
$(call get-noopt-cflags-for,$(1)) \
|
||||
$(CSBOXINCFLAGS) \
|
||||
$(CSANDINCFLAGS) \
|
||||
$(BUILD_CPPFLAGS) \
|
||||
$(BUILD_SYMFLAGS) \
|
||||
)
|
||||
get-sandbox-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
||||
$(call get-noopt-cxxflags-for,$(1)) \
|
||||
$(CSBOXINCFLAGS) \
|
||||
$(CSANDINCFLAGS) \
|
||||
$(BUILD_CPPFLAGS) \
|
||||
$(BUILD_SYMFLAGS) \
|
||||
)
|
||||
@@ -198,6 +215,8 @@ get-config-text-for = "('$(1)' CFLAGS for config code)"
|
||||
get-frame-text-for = "('$(1)' CFLAGS for framework code)"
|
||||
get-aocldtl-text-for = "('$(1)' CFLAGS for AOCL debug and trace code)"
|
||||
get-kernel-text-for = "('$(1)' CFLAGS for kernels)"
|
||||
get-addon-c99text-for = "('$(1)' CFLAGS for addons)"
|
||||
get-addon-cxxtext-for = "('$(1)' CXXFLAGS for addons)"
|
||||
get-sandbox-c99text-for = "('$(1)' CFLAGS for sandboxes)"
|
||||
get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)"
|
||||
|
||||
@@ -212,6 +231,10 @@ get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)"
|
||||
files-that-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),$(f),)))
|
||||
files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),,$(f))))
|
||||
|
||||
# Define a function that removes duplicate strings *without* using the sort
|
||||
# function.
|
||||
rm-dups = $(if $1,$(firstword $1) $(call rm-dups,$(filter-out $(firstword $1),$1)))
|
||||
|
||||
|
||||
#
|
||||
# --- Include makefile configuration file --------------------------------------
|
||||
@@ -297,6 +320,7 @@ FRAME_DIR := frame
|
||||
AOCLDTL_DIR := aocl_dtl
|
||||
REFKERN_DIR := ref_kernels
|
||||
KERNELS_DIR := kernels
|
||||
ADDON_DIR := addon
|
||||
SANDBOX_DIR := sandbox
|
||||
OBJ_DIR := obj
|
||||
LIB_DIR := lib
|
||||
@@ -313,12 +337,13 @@ REFNM := ref
|
||||
|
||||
# Source suffixes.
|
||||
CONFIG_SRC_SUFS := c
|
||||
|
||||
KERNELS_SRC_SUFS := c s S
|
||||
|
||||
FRAME_SRC_SUFS := c
|
||||
|
||||
AOCLDTL_SRC_SUFS := c
|
||||
ADDON_C99_SUFS := c
|
||||
ADDON_CXX_SUFS := cc cpp cxx
|
||||
ADDON_SRC_SUFS := $(ADDON_C99_SUFS) $(ADDON_CXX_SUFS)
|
||||
|
||||
SANDBOX_C99_SUFS := c
|
||||
SANDBOX_CXX_SUFS := cc cpp cxx
|
||||
@@ -328,6 +353,9 @@ SANDBOX_SRC_SUFS := $(SANDBOX_C99_SUFS) $(SANDBOX_CXX_SUFS)
|
||||
FRAME_HDR_SUFS := h
|
||||
|
||||
AOCLDTL_HDR_SUFS := h
|
||||
ADDON_H99_SUFS := h
|
||||
ADDON_HXX_SUFS := hh hpp hxx
|
||||
ADDON_HDR_SUFS := $(ADDON_H99_SUFS) $(ADDON_HXX_SUFS)
|
||||
|
||||
SANDBOX_H99_SUFS := h
|
||||
SANDBOX_HXX_SUFS := hh hpp hxx
|
||||
@@ -335,10 +363,12 @@ SANDBOX_HDR_SUFS := $(SANDBOX_H99_SUFS) $(SANDBOX_HXX_SUFS)
|
||||
|
||||
# Combine all header suffixes and remove duplicates via sort().
|
||||
ALL_HDR_SUFS := $(sort $(FRAME_HDR_SUFS) \
|
||||
$(ADDON_HDR_SUFS) \
|
||||
$(SANDBOX_HDR_SUFS) \
|
||||
$(AOCLDTL_HDR_SUFS))
|
||||
|
||||
ALL_H99_SUFS := $(sort $(FRAME_HDR_SUFS) \
|
||||
$(ADDON_HDR_SUFS) \
|
||||
$(SANDBOX_H99_SUFS) \
|
||||
$(AOCLDTL_HDR_SUFS))
|
||||
|
||||
@@ -366,12 +396,14 @@ SHELL := bash
|
||||
|
||||
# Construct paths to the four primary directories of source code:
|
||||
# the config directory, general framework code, reference kernel code,
|
||||
# and optimized kernel code.
|
||||
# and optimized kernel code. Also process paths for addon and sandbox
|
||||
# directories.
|
||||
CONFIG_PATH := $(DIST_PATH)/$(CONFIG_DIR)
|
||||
FRAME_PATH := $(DIST_PATH)/$(FRAME_DIR)
|
||||
AOCLDTL_PATH := $(DIST_PATH)/$(AOCLDTL_DIR)
|
||||
REFKERN_PATH := $(DIST_PATH)/$(REFKERN_DIR)
|
||||
KERNELS_PATH := $(DIST_PATH)/$(KERNELS_DIR)
|
||||
ADDON_PATH := $(DIST_PATH)/$(ADDON_DIR)
|
||||
SANDBOX_PATH := $(DIST_PATH)/$(SANDBOX_DIR)
|
||||
|
||||
# Construct paths to some optional C++ template headers contributed by AMD.
|
||||
@@ -386,6 +418,7 @@ FRAME_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(FRAME_DIR)
|
||||
AOCLDTL_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(AOCLDTL_DIR)
|
||||
REFKERN_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(REFKERN_DIR)
|
||||
KERNELS_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(KERNELS_DIR)
|
||||
ADDON_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(ADDON_DIR)
|
||||
SANDBOX_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(SANDBOX_DIR)
|
||||
|
||||
|
||||
@@ -863,6 +896,7 @@ MK_KERNELS_SRC :=
|
||||
MK_REFKERN_SRC :=
|
||||
MK_FRAME_SRC :=
|
||||
MK_AOCLDTL_SRC :=
|
||||
MK_ADDON_SRC :=
|
||||
MK_SANDBOX_SRC :=
|
||||
|
||||
# -- config --
|
||||
@@ -914,6 +948,24 @@ PARENT_PATH := $(OBJ_DIR)/$(CONFIG_NAME)
|
||||
-include $(addsuffix /$(FRAGMENT_MK), $(FRAME_FRAG_PATH))
|
||||
-include $(addsuffix /$(FRAGMENT_MK), $(AOCLDTL_FRAG_PATH))
|
||||
|
||||
# -- addon --
|
||||
|
||||
# Construct paths to each addon.
|
||||
# NOTE: If $(ADDON_LIST) is empty (because no addon was enabled at configure-
|
||||
# time) then $(ADDON_PATHS) will also be empty, which will cause no fragments
|
||||
# to be included.
|
||||
ADDON_PATHS := $(addprefix $(ADDON_FRAG_PATH)/, $(ADDON_LIST))
|
||||
|
||||
# This variable is used by the include statements as they recursively include
|
||||
# one another. For the 'addon' directory, we initialize it to that directory
|
||||
# in preparation to include the fragments in each addon sub-directory.
|
||||
PARENT_SRC_PATH := $(ADDON_PATH)
|
||||
PARENT_PATH := $(ADDON_FRAG_PATH)
|
||||
|
||||
# Recursively include the makefile fragments in each of the addons sub-
|
||||
# directories.
|
||||
-include $(addsuffix /$(FRAGMENT_MK), $(ADDON_PATHS))
|
||||
|
||||
# -- sandbox --
|
||||
|
||||
# Construct paths to each sandbox. (At present, there can be only one.)
|
||||
@@ -931,6 +983,8 @@ PARENT_PATH := $(SANDBOX_FRAG_PATH)
|
||||
# Recursively include the makefile fragments in the sandbox sub-directory.
|
||||
-include $(addsuffix /$(FRAGMENT_MK), $(SANDBOX_PATHS))
|
||||
|
||||
# -- post-processing --
|
||||
|
||||
# Create a list of the makefile fragments using the variable into which each
|
||||
# of the above include statements accumulated their directory paths.
|
||||
MAKEFILE_FRAGMENTS := $(addsuffix /$(FRAGMENT_MK), $(FRAGMENT_DIR_PATHS))
|
||||
@@ -949,14 +1003,14 @@ endif
|
||||
#
|
||||
|
||||
# Define a function that will expand all of the directory paths given in $(1)
|
||||
# to actual filepaths using the list of suffixes provided $(2).
|
||||
# to actual filepaths using the list of suffixes provided in $(2).
|
||||
get-filepaths = $(strip $(foreach path, $(1), \
|
||||
$(foreach suf, $(2), \
|
||||
$(wildcard $(path)/*.$(suf)) \
|
||||
) ) )
|
||||
|
||||
# Define a function that will expand all of the directory paths given in $(1)
|
||||
# to actual filepaths using the list of suffixes provided $(2), taking only
|
||||
# to actual filepaths using the list of suffixes provided in $(2), taking only
|
||||
# the first expansion from each directory with at least one file matching
|
||||
# the current suffix. Finally, strip the filenames from all resulting files,
|
||||
# returning only the directory paths.
|
||||
@@ -966,20 +1020,29 @@ get-dirpaths = $(dir $(foreach path, $(1), \
|
||||
$(wildcard $(path)/*.$(suf)) \
|
||||
) ) ) )
|
||||
|
||||
# We'll use two directory lists. The first is a list of all of the directories
|
||||
# in which makefile fragments were generated (plus the current directory). The
|
||||
# second is the subset of the first that begins with the sandbox root path.
|
||||
# We'll use three directory lists. The first is a list of all of the directories
|
||||
# in which makefile fragments were generated, plus the current directory. (The
|
||||
# current directory is needed so we include bli_config.h and bli_addon.h in the
|
||||
# processing of header files.) The second and third are subsets of the first
|
||||
# that begin with the addon and sandbox root paths, respectively.
|
||||
ALLFRAG_DIR_PATHS := . $(FRAGMENT_DIR_PATHS)
|
||||
ADDON_DIR_PATHS := $(filter $(ADDON_PATH)/%,$(ALLFRAG_DIR_PATHS))
|
||||
SANDBOX_DIR_PATHS := $(filter $(SANDBOX_PATH)/%,$(ALLFRAG_DIR_PATHS))
|
||||
|
||||
ALL_H99_FILES := $(call get-filepaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS))
|
||||
FRAME_H99_FILES := $(filter-out $(SANDBOX_PATH)/%,$(ALL_H99_FILES))
|
||||
FRAME_H99_FILES := $(filter-out $(ADDON_PATH)/%, \
|
||||
$(filter-out $(SANDBOX_PATH)/%, \
|
||||
$(ALL_H99_FILES) \
|
||||
) )
|
||||
|
||||
ALL_H99_DIRPATHS := $(call get-dirpaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS))
|
||||
ALL_H99_DIRPATHS := $(call get-dirpaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS))
|
||||
|
||||
SANDBOX_H99_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_H99_SUFS))
|
||||
SANDBOX_HXX_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_HXX_SUFS))
|
||||
ADDON_H99_FILES := $(call get-filepaths,$(ADDON_DIR_PATHS),$(ADDON_H99_SUFS))
|
||||
ADDON_HXX_FILES := $(call get-filepaths,$(ADDON_DIR_PATHS),$(ADDON_HXX_SUFS))
|
||||
ADDON_HDR_DIRPATHS := $(call get-dirpaths,$(ADDON_DIR_PATHS),$(ALL_HDR_SUFS))
|
||||
|
||||
SANDBOX_H99_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_H99_SUFS))
|
||||
SANDBOX_HXX_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_HXX_SUFS))
|
||||
SANDBOX_HDR_DIRPATHS := $(call get-dirpaths,$(SANDBOX_DIR_PATHS),$(ALL_HDR_SUFS))
|
||||
|
||||
|
||||
@@ -1032,8 +1095,8 @@ CBLAS_H_FLAT := $(BASE_INC_PATH)/$(CBLAS_H)
|
||||
#
|
||||
|
||||
# Obtain a list of header files #included inside of the bli_cntx_ref.c file.
|
||||
# Paths to these files will be needed when compiling with the monolithic
|
||||
# header.
|
||||
# Due to the way that bli_cntx_ref.c uses headers and macros, paths to these
|
||||
# files will be needed when compiling bli_cntx_ref.c with the monolithic header.
|
||||
ifeq ($(strip $(SHARE_PATH)),.)
|
||||
REF_KER_SRC := $(DIST_PATH)/$(REFKERN_DIR)/bli_cntx_ref.c
|
||||
REF_KER_HEADERS := $(shell $(GREP) "\#include" $(REF_KER_SRC) | sed -e "s/\#include [\"<]\([a-zA-Z0-9\_\.\/\-]*\)[\">].*/\1/g" | $(GREP) -v $(BLIS_H))
|
||||
@@ -1041,9 +1104,10 @@ endif
|
||||
|
||||
# Match each header found above with the path to that header, and then strip
|
||||
# leading, trailing, and internal whitespace.
|
||||
REF_KER_H_PATHS := $(strip $(foreach header, $(REF_KER_HEADERS), \
|
||||
$(dir $(filter %/$(header), \
|
||||
$(FRAME_H99_FILES)))))
|
||||
REF_KER_H_PATHS := $(call rm-dups,$(strip \
|
||||
$(foreach header, $(REF_KER_HEADERS), \
|
||||
$(dir $(filter %/$(header), \
|
||||
$(FRAME_H99_FILES))))))
|
||||
|
||||
# Add -I to each header path so we can specify our include search paths to the
|
||||
# C compiler. Then add frame/include since it's needed for bli_oapi_w[o]_cntx.h.
|
||||
@@ -1055,17 +1119,29 @@ REF_KER_I_PATHS += -I$(DIST_PATH)/frame/include
|
||||
# now #include the monolithic/flattened blis.h instead.
|
||||
CINCFLAGS := -I$(BASE_INC_PATH) $(REF_KER_I_PATHS)
|
||||
|
||||
# If CBLAS is enabled, we also include the path to the cblas.h directory so
|
||||
# that the compiler will be able to find cblas.h as the CBLAS source code is
|
||||
# being compiled.
|
||||
ifeq ($(MK_ENABLE_CBLAS),yes)
|
||||
CINCFLAGS += -I$(CBLAS_H_DIRPATH)
|
||||
endif
|
||||
|
||||
# Obtain a list of header paths in the configured addons. Then add -I to each
|
||||
# header path.
|
||||
CADDONINCFLAGS := $(strip $(patsubst %, -I%, $(ADDON_HDR_DIRPATHS)))
|
||||
|
||||
# Obtain a list of header paths in the configured sandbox. Then add -I to each
|
||||
# header path.
|
||||
CSBOXINCFLAGS := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS)))
|
||||
CSANDINCFLAGS := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS)))
|
||||
|
||||
|
||||
#
|
||||
# --- BLIS configuration header definitions ------------------------------------
|
||||
#
|
||||
|
||||
# This file was created by configure, but we need to define it here so we can
|
||||
# remove it as part of the clean targets.
|
||||
# These files were created by configure, but we need to define them here so we
|
||||
# can remove them as part of the clean targets.
|
||||
BLIS_ADDON_H := ./bli_addon.h
|
||||
BLIS_CONFIG_H := ./bli_config.h
|
||||
|
||||
|
||||
|
||||
151
configure
vendored
151
configure
vendored
@@ -264,6 +264,15 @@ print_usage()
|
||||
echo " \"small\" depends on thresholds that may vary by sub-"
|
||||
echo " configuration."
|
||||
echo " "
|
||||
echo " -a NAME --enable-addon=NAME"
|
||||
echo " "
|
||||
echo " Enable the code provided by an addon. An addon consists"
|
||||
echo " of a separate directory of code that provides additional"
|
||||
echo " APIs, implementations, and/or operations that would"
|
||||
echo " otherwise not be present within a build of BLIS. This"
|
||||
echo " option may be used multiple times to specify the inclusion"
|
||||
echo " of multiple addons. By default, no addons are enabled."
|
||||
echo " "
|
||||
echo " -s NAME --enable-sandbox=NAME"
|
||||
echo " "
|
||||
echo " Enable a separate sandbox implementation of gemm. This"
|
||||
@@ -940,6 +949,18 @@ canonicalize_ws()
|
||||
echo "${str}"
|
||||
}
|
||||
|
||||
rm_duplicate_words_simple()
|
||||
{
|
||||
local str revstr revres res
|
||||
|
||||
str="$1"
|
||||
|
||||
# Remove duplicates, keeping the first occurrence.
|
||||
res=$(echo "${str}" | awk '{for (i=1;i<=NF;i++) if (!a[$i]++) printf("%s%s",$i,FS)}{printf("\n")}')
|
||||
|
||||
echo "${res}"
|
||||
}
|
||||
|
||||
rm_duplicate_words()
|
||||
{
|
||||
local str revstr revres res
|
||||
@@ -1915,6 +1936,13 @@ main()
|
||||
bli_config_h_in_path="${build_dirpath}/${bli_config_h_in}"
|
||||
bli_config_h_out_path="${cur_dirpath}/${bli_config_h_out}"
|
||||
|
||||
# The names/paths for the template bli_addon.h.in and its instantiated
|
||||
# counterpart.
|
||||
bli_addon_h_in='bli_addon.h.in'
|
||||
bli_addon_h_out='bli_addon.h'
|
||||
bli_addon_h_in_path="${build_dirpath}/${bli_addon_h_in}"
|
||||
bli_addon_h_out_path="${cur_dirpath}/${bli_addon_h_out}"
|
||||
|
||||
# Path to 'mirror-tree.sh' script.
|
||||
mirror_tree_sh="${build_dirpath}/mirror-tree.sh"
|
||||
|
||||
@@ -1941,6 +1969,9 @@ main()
|
||||
# The root directory of the BLIS framework.
|
||||
aocldtl_dir='aocl_dtl'
|
||||
aocldtl_dirpath="${dist_path}/${aocldtl_dir}"
|
||||
# The name of the addon directory.
|
||||
addon_dir='addon'
|
||||
addon_dirpath="${dist_path}/${addon_dir}"
|
||||
|
||||
# The name of the sandbox directory.
|
||||
sandbox_dir='sandbox'
|
||||
@@ -2049,6 +2080,10 @@ main()
|
||||
force_version='no'
|
||||
complex_return='default'
|
||||
|
||||
# The addon flag and names.
|
||||
addon_flag=''
|
||||
addon_list=''
|
||||
|
||||
# The sandbox flag and name.
|
||||
sandbox_flag=''
|
||||
sandbox=''
|
||||
@@ -2093,7 +2128,7 @@ main()
|
||||
|
||||
# Process our command line options.
|
||||
unset OPTIND
|
||||
while getopts ":hp:d:e:s:t:r:qci:b:-:" opt; do
|
||||
while getopts ":hp:d:e:a:s:t:r:qci:b:-:" opt; do
|
||||
case $opt in
|
||||
-)
|
||||
case "$OPTARG" in
|
||||
@@ -2194,12 +2229,21 @@ main()
|
||||
disable-mem-tracing)
|
||||
enable_mem_tracing='no'
|
||||
;;
|
||||
enable-addon=*)
|
||||
addon_flag=1
|
||||
addon_name=${OPTARG#*=}
|
||||
# Append the addon name to the list.
|
||||
addon_list="${addon_list} ${addon_name}"
|
||||
;;
|
||||
disable-addon)
|
||||
addon_flag=''
|
||||
;;
|
||||
enable-sandbox=*)
|
||||
sandbox_flag=1
|
||||
sandbox=${OPTARG#*=}
|
||||
;;
|
||||
disable-sandbox)
|
||||
sandbox_flag=0
|
||||
sandbox_flag=''
|
||||
;;
|
||||
int-size=*)
|
||||
int_type_size=${OPTARG#*=}
|
||||
@@ -2282,6 +2326,12 @@ main()
|
||||
e)
|
||||
export_shared=$OPTARG
|
||||
;;
|
||||
a)
|
||||
addon_flag=1
|
||||
addon_name=$OPTARG
|
||||
# Append the addon name to the list.
|
||||
addon_list="${addon_list} ${addon_name}"
|
||||
;;
|
||||
s)
|
||||
sandbox_flag=1
|
||||
sandbox=$OPTARG
|
||||
@@ -3141,6 +3191,34 @@ main()
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if addons were given.
|
||||
if [ -n "${addon_flag}" ]; then
|
||||
|
||||
# Remove duplicates in the addon list, if they exist.
|
||||
addon_list=$(rm_duplicate_words_simple "${addon_list}")
|
||||
|
||||
echo "${script_name}: configuring with addons:"
|
||||
|
||||
for addon in ${addon_list}; do
|
||||
|
||||
echo "${script_name}: ${addon_dir}/${addon}"
|
||||
|
||||
addon_fullpath="${addon_dirpath}/${addon}"
|
||||
|
||||
if [ ! -d "${addon_fullpath}" ]; then
|
||||
echo "${script_name}: requested addon sub-directory does not exist! Cannot continue."
|
||||
echo "${script_name}: *** Please verify addon existence and name."
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
enable_addons_01=1
|
||||
else
|
||||
echo "${script_name}: configuring with no addons."
|
||||
|
||||
enable_addons_01=0
|
||||
fi
|
||||
|
||||
# Check if a sandbox was given.
|
||||
if [ -n "${sandbox_flag}" ]; then
|
||||
|
||||
@@ -3292,6 +3370,15 @@ main()
|
||||
kernel_list_defines="${kernel_list_defines}#define ${kernel_define}\n"
|
||||
done
|
||||
|
||||
# Create a list of #includes, one for each addon in addon_list.
|
||||
addon_list_includes=""
|
||||
for addon in ${addon_list}; do
|
||||
|
||||
# Create an #include directive and add it to the running list.
|
||||
addon_header="\"${addon}.h\""
|
||||
addon_list_includes="${addon_list_includes}#include ${addon_header}\n"
|
||||
done
|
||||
|
||||
|
||||
# -- Determine whether we are performing an out-of-tree build --------------
|
||||
|
||||
@@ -3319,7 +3406,7 @@ main()
|
||||
fi
|
||||
|
||||
|
||||
# -- Instantiate config.mk, bli_config.h files from templates --------------
|
||||
# -- Instantiate config.mk file from template ------------------------------
|
||||
|
||||
# Begin substituting information into the config_mk_in file, outputting
|
||||
# to config_mk_out.
|
||||
@@ -3365,6 +3452,7 @@ main()
|
||||
| sed -e "s/@enable_cblas@/${enable_cblas}/g" \
|
||||
| sed -e "s/@enable_memkind@/${enable_memkind}/g" \
|
||||
| sed -e "s/@pragma_omp_simd@/${pragma_omp_simd}/g" \
|
||||
| sed -e "s/@addon_list@/${addon_list}/g" \
|
||||
| sed -e "s/@sandbox@/${sandbox}/g" \
|
||||
| sed -e "s/@enable_trsm_preinversion@/${enable_trsm_preinversion}/g" \
|
||||
| sed -e "s/@enable_aocl_dynamic@/${enable_aocl_dynamic}/g" \
|
||||
@@ -3373,6 +3461,7 @@ main()
|
||||
| sed -e "s/\@enable_aocl_zen\@/${enable_aocl_zen}/g" \
|
||||
> "${config_mk_out_path}"
|
||||
|
||||
# -- Instantiate bli_config.h file from template ---------------------------
|
||||
|
||||
# Begin substituting information into the bli_config_h_in file, outputting
|
||||
# to bli_config_h_out. NOTE: We use perl instead of sed because the version
|
||||
@@ -3409,6 +3498,17 @@ main()
|
||||
| sed -e "s/@complex_return_intel@/${complex_return_intel01}/g" \
|
||||
> "${bli_config_h_out_path}"
|
||||
|
||||
# -- Instantiate bli_addon.h file from template ----------------------------
|
||||
|
||||
# Begin substituting information into the bli_addon_h_in file, outputting
|
||||
# to bli_addon_h_out. NOTE: We use perl instead of sed because the version
|
||||
# of sed used on OS X is old and does not handle the '\n' character
|
||||
# intuitively, which was used when constructing ${addon_list_includes}.
|
||||
echo "${script_name}: creating ${bli_addon_h_out_path} from ${bli_addon_h_in_path}"
|
||||
cat "${bli_addon_h_in_path}" \
|
||||
| perl -pe "s/\@addon_list_includes\@/${addon_list_includes}/g" \
|
||||
| sed -e "s/@enable_addons@/${enable_addons_01}/g" \
|
||||
> "${bli_addon_h_out_path}"
|
||||
|
||||
# -- Create top-level object directories -----------------------------------
|
||||
|
||||
@@ -3421,7 +3521,6 @@ main()
|
||||
|
||||
obj_config_dirpath="${base_obj_dirpath}/${config_dir}"
|
||||
|
||||
#echo "${script_name}: creating ${obj_config_dirpath}"
|
||||
mkdir -p ${obj_config_dirpath}
|
||||
for conf in ${config_list}; do
|
||||
echo "${script_name}: creating ${obj_config_dirpath}/${conf}"
|
||||
@@ -3431,7 +3530,6 @@ main()
|
||||
|
||||
obj_kernels_dirpath="${base_obj_dirpath}/${kernels_dir}"
|
||||
|
||||
#echo "${script_name}: creating ${obj_kernels_dirpath}"
|
||||
mkdir -p ${obj_kernels_dirpath}
|
||||
for kern in ${kernel_list}; do
|
||||
echo "${script_name}: creating ${obj_kernels_dirpath}/${kern}"
|
||||
@@ -3441,7 +3539,6 @@ main()
|
||||
|
||||
obj_refkern_dirpath="${base_obj_dirpath}/${refkern_dir}"
|
||||
|
||||
#echo "${script_name}: creating ${obj_refkern_dirpath}"
|
||||
mkdir -p ${obj_refkern_dirpath}
|
||||
for conf in ${config_list}; do
|
||||
echo "${script_name}: creating ${obj_refkern_dirpath}/${conf}"
|
||||
@@ -3460,6 +3557,18 @@ main()
|
||||
echo "${script_name}: creating ${obj_frame_dirpath}"
|
||||
mkdir -p ${obj_frame_dirpath}
|
||||
|
||||
|
||||
if [ -n "${addon_flag}" ]; then
|
||||
|
||||
obj_addon_dirpath="${base_obj_dirpath}/${addon_dir}"
|
||||
|
||||
for addon in ${addon_list}; do
|
||||
echo "${script_name}: creating ${obj_addon_dirpath}/${addon}"
|
||||
mkdir -p ${obj_addon_dirpath}/${addon}
|
||||
done
|
||||
fi
|
||||
|
||||
|
||||
if [ -n "${sandbox_flag}" ]; then
|
||||
|
||||
obj_sandbox_dirpath="${base_obj_dirpath}/${sandbox_dir}"
|
||||
@@ -3487,6 +3596,7 @@ main()
|
||||
echo "${script_name}: creating ${base_lib_dirpath}"
|
||||
mkdir -p ${base_lib_dirpath}
|
||||
|
||||
|
||||
# Create include directory (if it does not already exist).
|
||||
base_include_dirpath="${include_dirpath}/${config_name}"
|
||||
|
||||
@@ -3545,6 +3655,16 @@ main()
|
||||
echo "${script_name}: mirroring ${aocldtl_dirpath} to ${obj_aocldtl_dirpath}"
|
||||
${mirror_tree_sh} ${aocldtl_dirpath} ${obj_aocldtl_dirpath}
|
||||
|
||||
# Mirror the chosen addon source tree to its object sub-directory.
|
||||
if [ -n "${addon_flag}" ]; then
|
||||
|
||||
for addon in ${addon_list}; do
|
||||
|
||||
echo "${script_name}: mirroring ${addon_dirpath}/${addon} to ${obj_addon_dirpath}/${addon}"
|
||||
${mirror_tree_sh} "${addon_dirpath}/${addon}" "${obj_addon_dirpath}/${addon}"
|
||||
done
|
||||
fi
|
||||
|
||||
# Mirror the chosen sandbox source tree to its object sub-directory.
|
||||
if [ -n "${sandbox_flag}" ]; then
|
||||
|
||||
@@ -3643,6 +3763,25 @@ main()
|
||||
${gen_make_frags_dirpath}/suffix_list \
|
||||
${gen_make_frags_dirpath}/ignore_list
|
||||
|
||||
# Generate makefile fragments in the addon sub-directory.
|
||||
if [ -n "${addon_flag}" ]; then
|
||||
|
||||
for addon in ${addon_list}; do
|
||||
|
||||
echo "${script_name}: creating makefile fragments in ${obj_addon_dirpath}/${addon}"
|
||||
${gen_make_frags_sh} \
|
||||
-h -r -v0 \
|
||||
-o ${script_name} \
|
||||
-p 'ADDON' \
|
||||
${addon_dirpath}/${addon} \
|
||||
${obj_addon_dirpath}/${addon} \
|
||||
${gen_make_frags_dirpath}/fragment.mk \
|
||||
${gen_make_frags_dirpath}/suffix_list \
|
||||
${gen_make_frags_dirpath}/ignore_list
|
||||
done
|
||||
fi
|
||||
|
||||
|
||||
# Generate makefile fragments in the sandbox sub-directory.
|
||||
if [ -n "${sandbox_flag}" ]; then
|
||||
|
||||
|
||||
231
docs/Addons.md
Normal file
231
docs/Addons.md
Normal file
@@ -0,0 +1,231 @@
|
||||
## Contents
|
||||
|
||||
* **[Introduction](Addons.md#introduction)**
|
||||
* **[Enabling an addon](Addons.md#enabling-an-addon)**
|
||||
* **[Addon rules](Addons.md#addon-rules)**
|
||||
* **[Caveats](Addons.md#caveats)**
|
||||
* **[Known issues](Addons.md#known-issues)**
|
||||
* **[Conclusion](Addons.md#conclusion)**
|
||||
|
||||
|
||||
## Introduction
|
||||
|
||||
This file briefly describes the requirements for building a custom BLIS
|
||||
*addon*.
|
||||
|
||||
Simply put, an addon in BLIS provides additional APIs, operations, and/or
|
||||
implementations that may be useful to certain users. An addon can be
|
||||
thought of as a standalone extension of BLIS that does not depend on any
|
||||
other addon, although addons may utilize existing functionality or kernels
|
||||
within the core framework.
|
||||
|
||||
By definition, an addon should *never* provide APIs that conflict with
|
||||
the interfaces that belong to either the [typed API](BLISTypedAPI.md) or the
|
||||
[object API](BLISObjectAPI.md). Thus, you'll never have to worry about a
|
||||
properly constructed (and properly functioning) addon interfering with or
|
||||
otherwise changing core BLIS functionality.
|
||||
|
||||
How does an addon differ from a [sandbox](Sandboxes.md)? Great question!
|
||||
Sometimes you want to include additional BLIS-like functionality that does
|
||||
not relate directly to `gemm` or any other BLIS operation.
|
||||
(By contrast, a sandbox requires you to implement `gemm` whether you want
|
||||
to or not.)
|
||||
Furthermore, you may wish to enable multiple addons simultaneously.
|
||||
(By contrast, only one sandbox may be enabled at a time.)
|
||||
Thus, the addon feature provides additional flexibility to some
|
||||
users in a way that sandboxes cannot, while still providing many of the
|
||||
conveniences of sandboxes.
|
||||
|
||||
## Enabling an addon
|
||||
|
||||
To enable an existing addon at configure-time, you simply specify it as an
|
||||
option to `configure`. Either of the following usages is accepted:
|
||||
```
|
||||
$ ./configure --enable-addon=foobar auto
|
||||
$ ./configure -a foobar auto
|
||||
```
|
||||
Here, we tell `configure` that we want to use the `foobar` addon, which
|
||||
corresponds to a subdirectory of the `addon` directory named `foobar`.
|
||||
(Reminder: the `auto` argument is the configuration target and
|
||||
unrelated to addons.)
|
||||
|
||||
You may also enable multiple addons within the same build of BLIS:
|
||||
```
|
||||
$ ./configure -a foobar -a thing1 -a thing2 auto
|
||||
```
|
||||
Note that the default behavior of `configure` is that no addons are enabled.
|
||||
|
||||
As `configure` runs, you should get output that includes lines
|
||||
similar to:
|
||||
```
|
||||
configure: configuring with addons:
|
||||
configure: addon/foobar
|
||||
configure: addon/thing1
|
||||
configure: addon/thing2
|
||||
```
|
||||
And when you build BLIS, the addon source code will be among the last files to
|
||||
be compiled:
|
||||
```
|
||||
Compiling obj/haswell/addon/foobar/foobar.o ('haswell' CFLAGS for addons)
|
||||
Compiling obj/haswell/addon/thing1/thing1.o ('haswell' CFLAGS for addons)
|
||||
Compiling obj/haswell/addon/thing1/thing1_api.o ('haswell' CFLAGS for addons)
|
||||
Compiling obj/haswell/addon/thing2/thing2_api.o ('haswell' CFLAGS for addons)
|
||||
...
|
||||
```
|
||||
That's it! After the BLIS library is built, it will contain your chosen
|
||||
addons. You can always verify this by using `nm` to confirm the presence
|
||||
of your API symbols:
|
||||
```
|
||||
$ nm lib/haswell/libblis.a | grep foobar
|
||||
foobar.o:
|
||||
0000000000000000 T foobar
|
||||
```
|
||||
|
||||
## Addon rules
|
||||
|
||||
Please follow these guidelines for the best developer experience when
|
||||
creating addons.
|
||||
|
||||
1. As with sandboxes, you don't need to worry about creating makefiles. The
|
||||
BLIS build system will take care of this for you. :) By configuring BLIS with
|
||||
an addon enabled, `make` will scan your addon subdirectory and compile
|
||||
all of its source code using similar compilation rules as were used for the rest
|
||||
of the framework. In addition, the compilation command line will automatically
|
||||
contain one `-I<includepath>` option for every subdirectory in your addon,
|
||||
so it doesn't matter where in your addon directory hierarchy you place your
|
||||
header files -- they will be found!
|
||||
|
||||
2. We recommend that you write your addon in C99. While you *may* use C++11
|
||||
to implement your addon, you should provide a C99 wrapper API to your
|
||||
implementation so that others can interface with it. There is no guarantee
|
||||
that the end-user will be using a C++11 compiler, and therefore you should
|
||||
limit the definitions in your addon header to those that are C99 compliant.
|
||||
If you write your addon in C++11, you must use one of the BLIS-approved file
|
||||
extensions for your source files (`.cc`, `.cpp`, `.cxx`) and your local
|
||||
header files (`.hh`, `.hpp`, `.hxx`).
|
||||
Note that `blis.h` already contains all of its definitions inside of an
|
||||
`extern "C"` block, so you should be able to `#include "blis.h"` from your
|
||||
C++11 source code without any issues.
|
||||
|
||||
3. All of your code related to the addon should reside within the named
|
||||
addon directory, or some subdirectory therein. If your addon requires
|
||||
new kernels, you should add kernel source code to an appropriate
|
||||
microarchitecture-specific subdirectory within the top-level `kernels`
|
||||
directory so that they are compiled with the correct
|
||||
microarchitecture-specific optimization flags.
|
||||
|
||||
4. If your addon is named `foobar`, the BLIS build system will expect to
|
||||
find a header called `foobar.h` somewhere in the `addon/foobar` directory
|
||||
(or one of its subdirectories). This `foobar.h` header will automatically
|
||||
be inlined into the monolithic `blis.h` header that is produced by the
|
||||
BLIS build system. `foobar.h` may `#include` other local headers, each of
|
||||
which will also (recursively) get inlined into `blis.h`. However, you may
|
||||
choose to omit some local addon headers from `foobar.h`. You might do this,
|
||||
for example, because those headers define things that are not needed in
|
||||
order for the end user to call your addon code.
|
||||
|
||||
5. Your addon APIs will always be available within static library builds of
|
||||
BLIS, but if you want your addon APIs to be exported as public APIs within
|
||||
*shared* library builds of BLIS, you'll need to annotate the prototypes
|
||||
accordingly. (BLIS makes its shared library symbols private by default; this
|
||||
allows us to export only those functions that we consider to be part of the
|
||||
public APIs.) This annotation can be done by prefixing function prototypes
|
||||
with the `BLIS_EXPORT_ADDON` macro as follows:
|
||||
```c
|
||||
BLIS_EXPORT_ADDON void foobar_calc( void* a, void* b );
|
||||
```
|
||||
|
||||
6. Do not define any symbols in your addon that conflict with any symbols within
|
||||
the core framework. For example, don't define a function called `bli_copym()`
|
||||
in your addon since that function is already defined within BLIS.
|
||||
|
||||
7. Do not define any symbols in your addon that conflict with any symbols within
|
||||
the C99 standard libraries/headers. For example, don't define a function called
|
||||
`printf()` since that function is already defined within the C99 standard library.
|
||||
|
||||
8. *Try* to not define any symbols in your addon that conflict with symbols in any
|
||||
other addon, unless your addon is meant to serve as an alternative to the
|
||||
conflicting addon, in which case conflicting symbol names are okay (since you
|
||||
will presumably never build with both addons enabled).
|
||||
|
||||
9. When choosing names for your addon files, avoid source filenames that already
|
||||
exist within BLIS. For example, don't name one of your files `bli_obj.c`
|
||||
since that file would compile into `bli_obj.o`, which will have already been
|
||||
placed into the library by the build system.
|
||||
|
||||
10. Similarly, avoid header filenames that already exist within BLIS or C99.
|
||||
For example, don't name one of your header files `bli_obj.h` since that file
|
||||
already exists in BLIS. Also, don't name one of your header files `math.h`
|
||||
since that name would conflict with the `math.h` defined by C99. (This also
|
||||
means you shouldn't name your addon `math` since normally that name would
|
||||
require that you provide a `math.h` header inside the addon directory.)
|
||||
|
||||
If you follow these rules, you will be much more likely to have a pleasant
|
||||
experience integrating your BLIS addon into the larger framework.
|
||||
|
||||
## Caveats
|
||||
|
||||
Notice that the BLIS addons are limited in what they can accomplish. Generally
|
||||
speaking, addons cannot change existing implementations within BLIS. Instead,
|
||||
addons aim to provide a way to quickly augment BLIS with additional bundles of
|
||||
code that extend BLIS's set of functionality in some interesting way. If you
|
||||
want to define new BLAS-like functions, but don't know where to start, creating
|
||||
a new addon is an appropriate place to start experimenting. If you want to
|
||||
change or refactor existing BLIS code, an addon is probably not suited for your
|
||||
needs.
|
||||
|
||||
Another important limitation is the fact that the build system currently uses
|
||||
"framework `CFLAGS`" when compiling the addon source files. These are the same
|
||||
`CFLAGS` used when compiling general framework source code,
|
||||
```
|
||||
# Example framework CFLAGS used by 'haswell' sub-configuration
|
||||
-O2 -Wall -Wno-unused-function -Wfatal-errors -fPIC -std=c99
|
||||
-D_POSIX_C_SOURCE=200112L -Iinclude/haswell -I./frame/3/
|
||||
-I./frame/1m/ -I./frame/1f/ -I./frame/1/ -I./frame/include
|
||||
-DBLIS_VERSION_STRING=\"0.8.1-195\" -fvisibility=hidden
|
||||
```
|
||||
which are likely more general-purpose than the `CFLAGS` used for, say,
|
||||
optimized kernels or even reference kernels:
|
||||
```
|
||||
# Example optimized kernel CFLAGS used by 'haswell' sub-configuration
|
||||
-O3 -fomit-frame-pointer -mavx2 -mfma -mfpmath=sse -march=haswell -Wall
|
||||
-Wno-unused-function -Wfatal-errors -fPIC -std=c99 -D_POSIX_C_SOURCE=200112L
|
||||
-Iinclude/haswell -I./frame/3/ -I./frame/1m/ -I./frame/1f/ -I./frame/1/
|
||||
-I./frame/include -DBLIS_VERSION_STRING=\"0.8.1-195\" -fvisibility=hidden
|
||||
```
|
||||
(To see precisely which flags are being employed for any given file, enable
|
||||
verbosity at compile-time via `make V=1`.) Compiling addons with these more
|
||||
versatile `CFLAGS` compiler options means that we only need to compile one
|
||||
instance of each addon source file, even when targeting multiple
|
||||
configurations (for example, via `./configure x86_64`). However, it also means
|
||||
that addons are not ideal for microkernels, as they sometimes need additional
|
||||
compiler flags in order to
|
||||
yield the highest performance. If you have a new microkernel you would like to
|
||||
use within an addon, you can always develop it within that addon. However,
|
||||
once it is stable and ready for use by others, it's best to move the kernel(s)
|
||||
to the appropriate microarchitecture-specific subdirectory of the `kernels`
|
||||
directory. This will allow the kernel to be compiled with the
|
||||
appropriate microarchitecture-specific compiler flags.
|
||||
Please see the
|
||||
[Configuration Guide](ConfigurationHowTo.md)
|
||||
for more details, and when in doubt, please don't be shy about seeking
|
||||
guidance from BLIS developers by opening a
|
||||
[new issue](https://github.com/flame/blis/issues) or sending a message to the
|
||||
[blis-devel](http://groups.google.com/d/forum/blis-devel) mailing list.
|
||||
|
||||
Notwithstanding these limitations, hopefully you still find BLIS addons
|
||||
useful!
|
||||
|
||||
## Known issues
|
||||
|
||||
* None yet.
|
||||
|
||||
## Conclusion
|
||||
|
||||
If you encounter any problems, please open
|
||||
a new [issue on GitHub](https://github.com/flame/blis/issues).
|
||||
|
||||
If you are unsure about how something works, you can still open an issue. Or, you
|
||||
can send a message to the
|
||||
[blis-devel](https://groups.google.com/d/forum/blis-devel) mailing list.
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
|
||||
// This string gets defined via -D on the command line when BLIS is compiled.
|
||||
// This string is (or rather, should be) only used here.
|
||||
static char* bli_version_str = BLIS_VERSION_STRING;
|
||||
static char* bli_version_str = "4.0"; //BLIS_VERSION_STRING;
|
||||
static char* bli_int_type_size_str = STRINGIFY_INT( BLIS_INT_TYPE_SIZE );
|
||||
|
||||
char* bli_info_get_version_str( void ) { return bli_version_str; }
|
||||
|
||||
@@ -241,8 +241,9 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define BLIS_EXPORT_BLIS BLIS_EXPORT
|
||||
#define BLIS_EXPORT_BLAS BLIS_EXPORT
|
||||
#define BLIS_EXPORT_BLIS BLIS_EXPORT
|
||||
#define BLIS_EXPORT_BLAS BLIS_EXPORT
|
||||
#define BLIS_EXPORT_ADDON BLIS_EXPORT
|
||||
|
||||
|
||||
// -- STATIC INLINE FUNCTIONS --------------------------------------------------
|
||||
|
||||
@@ -186,6 +186,14 @@ extern "C" {
|
||||
#include "bli_util.h"
|
||||
|
||||
|
||||
// -- addon definitions --
|
||||
|
||||
// NOTE: These definitions should not be included much earlier since an addon
|
||||
// may wish to utilize other types and definitions provided by BLIS.
|
||||
|
||||
#include "bli_addon.h"
|
||||
|
||||
|
||||
// -- sandbox implementation --
|
||||
|
||||
#include "bli_sbox.h"
|
||||
|
||||
Reference in New Issue
Block a user