diff --git a/.gitignore b/.gitignore index 49b22c2b8..a24fe2b0e 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,7 @@ config.mk bli_config.h +bli_addon.h # -- monolithic headers -- diff --git a/Makefile b/Makefile index b5e036744..992983328 100644 --- a/Makefile +++ b/Makefile @@ -114,6 +114,7 @@ BASE_OBJ_CONFIG_PATH := $(BASE_OBJ_PATH)/$(CONFIG_DIR) BASE_OBJ_FRAME_PATH := $(BASE_OBJ_PATH)/$(FRAME_DIR) BASE_OBJ_REFKERN_PATH := $(BASE_OBJ_PATH)/$(REFKERN_DIR) BASE_OBJ_KERNELS_PATH := $(BASE_OBJ_PATH)/$(KERNELS_DIR) +BASE_OBJ_ADDON_PATH := $(BASE_OBJ_PATH)/$(ADDON_DIR) BASE_OBJ_SANDBOX_PATH := $(BASE_OBJ_PATH)/$(SANDBOX_DIR) # --- Define install target names for static libraries --- @@ -210,6 +211,10 @@ MK_REFKERN_OBJS := $(foreach arch, $(CONFIG_LIST), \ # Generate object file paths for all of the portable framework source code. MK_FRAME_OBJS := $(call gen-obj-paths-from-src,$(FRAME_SRC_SUFS),$(MK_FRAME_SRC),$(FRAME_PATH),$(BASE_OBJ_FRAME_PATH)) +# Generate object file paths for the addon source code. If one or more addons +# were not enabled a configure-time, this variable will we empty. +MK_ADDON_OBJS := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH)) + # Generate object file paths for the sandbox source code. If a sandbox was not # enabled a configure-time, this variable will we empty. MK_SANDBOX_OBJS := $(call gen-obj-paths-from-src,$(SANDBOX_SRC_SUFS),$(MK_SANDBOX_SRC),$(SANDBOX_PATH),$(BASE_OBJ_SANDBOX_PATH)) @@ -219,6 +224,7 @@ MK_BLIS_OBJS := $(MK_CONFIG_OBJS) \ $(MK_KERNELS_OBJS) \ $(MK_REFKERN_OBJS) \ $(MK_FRAME_OBJS) \ + $(MK_ADDON_OBJS) \ $(MK_SANDBOX_OBJS) # Optionally filter out the BLAS and CBLAS compatibility layer object files. @@ -549,6 +555,28 @@ else endif endef +# first argument: a configuration name from the union of config_list and +# config_name, used to look up the CFLAGS to use during compilation. +define make-c99-addon-rule +$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS) +ifeq ($(ENABLE_VERBOSE),yes) + $(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-addon-c99text-for,$(1)) + @$(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@ +endif +endef + +define make-cxx-addon-rule +$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_HXX_FILES) $(MAKE_DEFS_MK_PATHS) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-addon-cxxtext-for,$(1)) + @$(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@ +endif +endef + # first argument: a configuration name from the union of config_list and # config_name, used to look up the CFLAGS to use during compilation. define make-c99-sandbox-rule @@ -601,6 +629,16 @@ $(foreach conf, $(CONFIG_LIST), $(eval $(call make-refkern-rule,$(conf)))) $(foreach suf, $(KERNELS_SRC_SUFS), \ $(foreach kset, $(KERNEL_LIST), $(eval $(call make-kernels-rule,$(kset),$(call get-config-for-kset,$(kset)),$(suf))))) +# Instantiate the build rule for C addon files. Use the CFLAGS for the +# configuration family. +$(foreach suf, $(ADDON_C99_SUFS), \ +$(foreach conf, $(CONFIG_NAME), $(eval $(call make-c99-addon-rule,$(conf),$(suf))))) + +# Instantiate the build rule for C++ addon files. Use the CFLAGS for the +# configuration family. +$(foreach suf, $(ADDON_CXX_SUFS), \ +$(foreach conf, $(CONFIG_NAME), $(eval $(call make-cxx-addon-rule,$(conf),$(suf))))) + # Instantiate the build rule for C sandbox files. Use the CFLAGS for the # configuration family. $(foreach suf, $(SANDBOX_C99_SUFS), \ @@ -1078,6 +1116,9 @@ ifeq ($(ENABLE_VERBOSE),yes) - $(FIND) $(FRAME_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) - $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) - $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +ifneq ($(ADDON_LIST),) + - $(FIND) $(ADDON_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +endif ifneq ($(SANDBOX),) - $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) endif @@ -1090,6 +1131,10 @@ else @- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) @echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)" @- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +ifneq ($(ADDON_LIST),) + @echo "Removing makefile fragments from $(ADDON_FRAG_PATH)" + @- $(FIND) $(ADDON_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +endif ifneq ($(SANDBOX),) @echo "Removing makefile fragments from $(SANDBOX_FRAG_PATH)" @- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) @@ -1210,6 +1255,7 @@ endif # IS_CONFIGURED distclean: cleanmk cleanh cleanlib cleantest ifeq ($(IS_CONFIGURED),yes) ifeq ($(ENABLE_VERBOSE),yes) + - $(RM_F) $(BLIS_ADDON_H) - $(RM_F) $(BLIS_CONFIG_H) - $(RM_F) $(CONFIG_MK_FILE) - $(RM_F) $(PC_OUT_FILE) @@ -1217,6 +1263,8 @@ ifeq ($(ENABLE_VERBOSE),yes) - $(RM_RF) $(LIB_DIR) - $(RM_RF) $(INCLUDE_DIR) else + @echo "Removing $(BLIS_ADDON_H)" + @$(RM_F) $(BLIS_ADDON_H) @echo "Removing $(BLIS_CONFIG_H)" @$(RM_F) $(BLIS_CONFIG_H) @echo "Removing $(CONFIG_MK_FILE)" diff --git a/addon/gemmd/attic/bli_gemm_ex.c b/addon/gemmd/attic/bli_gemm_ex.c new file mode 100644 index 000000000..0f40d1cb3 --- /dev/null +++ b/addon/gemmd/attic/bli_gemm_ex.c @@ -0,0 +1,88 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // A switch to easily toggle whether we use the addon implementation + // of bao_gemmd() as the implementation for bli_gemm(). (This allows for + // easy testing of bao_gemmd() via the testsuite.) + if ( 1 ) + { + const dim_t k = bli_obj_width_after_trans( a ); + const num_t dt = bli_obj_dt( c ); + obj_t d; + + bli_obj_create( dt, k, 1, 1, k, &d ); + bli_setv( &BLIS_ONE, &d ); + //bli_randv( &d ); + + bao_gemmd_ex( alpha, a, &d, b, beta, c, cntx, rntm ); + + bli_obj_free( &d ); + return; + } + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Obtain a valid (native) context from the gks if necessary. + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_gemm_check( alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front end. + bli_gemm_front + ( + alpha, a, b, beta, c, cntx, rntm, NULL + ); +} + diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c new file mode 100644 index 000000000..71d49806b --- /dev/null +++ b/addon/gemmd/bao_gemmd.c @@ -0,0 +1,305 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// -- Define the gemmd operation's object API ---------------------------------- +// + +void bao_gemmd + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + bao_gemmd_ex + ( + alpha, + a, + d, + b, + beta, + c, + NULL, + NULL + ); +} + +void bao_gemmd_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Obtain a valid (native) context from the gks if necessary. + // NOTE: This must be done before calling the _check() function, since + // that function assumes the context pointer is valid. + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bao_gemmd_check( alpha, a, d, b, beta, c, cntx ); + + // -- bli_gemmd_front() ---------------------------------------------------- + + obj_t a_local; + obj_t b_local; + obj_t c_local; + + // If C has a zero dimension, return early. + if ( bli_obj_has_zero_dim( c ) ) + { + return; + } + + // If alpha is zero, or if A or B has a zero dimension, scale C by beta + // and return early. + if ( bli_obj_equals( alpha, &BLIS_ZERO ) || + bli_obj_has_zero_dim( a ) || + bli_obj_has_zero_dim( b ) ) + { + bli_scalm( beta, c ); + return; + } + + // Alias A, B, and C in case we need to apply transformations. + bli_obj_alias_to( a, &a_local ); + bli_obj_alias_to( b, &b_local ); + bli_obj_alias_to( c, &c_local ); + + // Induce a transposition of A if it has its transposition property set. + // Then clear the transposition bit in the object. + if ( bli_obj_has_trans( &a_local ) ) + { + bli_obj_induce_trans( &a_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); + } + + // Induce a transposition of B if it has its transposition property set. + // Then clear the transposition bit in the object. + if ( bli_obj_has_trans( &b_local ) ) + { + bli_obj_induce_trans( &b_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &b_local ); + } + + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + { + bli_obj_swap( &a_local, &b_local ); + + bli_obj_induce_trans( &a_local ); + bli_obj_induce_trans( &b_local ); + bli_obj_induce_trans( &c_local ); + } + + // Parse and interpret the contents of the rntm_t object to properly + // set the ways of parallelism for each loop, and then make any + // additional modifications necessary for the current operation. + bli_rntm_set_ways_for_op + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + rntm + ); + + // Spawn threads (if applicable), where bao_gemmd_int() is the thread entry + // point function for each thread. This also begins the process of creating + // the thrinfo_t tree, which contains thread communicators. + bao_l3_thread_decorator + ( + bao_gemmd_int, + BLIS_GEMM, // operation family id + alpha, + &a_local, + d, + &b_local, + beta, + &c_local, + cntx, + rntm + ); +} + +// +// -- Define the gemmd operation's thread entry point -------------------------- +// + +void bao_gemmd_int + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + // In this function, we choose the gemmd implementation that is executed + // on each thread. + +#if 1 + // Call the block-panel algorithm that calls the kernel directly, which + // exposes edge-case handling. + bao_gemmd_bp_var1 + ( + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm, + thread + ); +#else + // Call the block-panel algorithm that calls the kernel indirectly via a + // wrapper function, which hides edge-case handling. + bao_gemmd_bp_var2 + ( + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm, + thread + ); +#endif +} + +// +// -- Define the gemmd operation's typed API ----------------------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* d, inc_t incd, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + bli_init_once(); \ +\ + /* Determine the datatype (e.g. BLIS_FLOAT, BLIS_DOUBLE, etc.) based on + the macro parameter 'ch' (e.g. s, d, etc). */ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, dd, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + /* Adjust the dimensions of matrices A and B according to the transa and + transb parameters. */ \ + bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ + bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \ +\ + /* Create bufferless scalar objects and attach the provided scalar pointers + to those scalar objects. */ \ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + /* Create bufferless matrix objects and attach the provided matrix pointers + to those matrix objects. */ \ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, k, 1, d, incd, k, &dd ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + /* Set the transposition/conjugation properties of the objects for matrices + A and B. */ \ + bli_obj_set_conjtrans( transa, &ao ); \ + bli_obj_set_conjtrans( transb, &bo ); \ +\ + /* Call the object interface. */ \ + PASTECH(bao_,opname) \ + ( \ + &alphao, \ + &ao, \ + &dd, \ + &bo, \ + &betao, \ + &co \ + ); \ +} + +//INSERT_GENTFUNC_BASIC0( gemmd ) +GENTFUNC( float, s, gemmd ) +GENTFUNC( double, d, gemmd ) +GENTFUNC( scomplex, c, gemmd ) +GENTFUNC( dcomplex, z, gemmd ) + diff --git a/addon/gemmd/bao_gemmd.h b/addon/gemmd/bao_gemmd.h new file mode 100644 index 000000000..7c7466494 --- /dev/null +++ b/addon/gemmd/bao_gemmd.h @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// +// -- Prototype the gemmd operation's object API ------------------------------- +// + +BLIS_EXPORT_ADDON void bao_gemmd + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +BLIS_EXPORT_ADDON void bao_gemmd_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +// +// -- Prototype the gemmd operation's thread entry point ----------------------- +// + +void bao_gemmd_int + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +// +// -- Prototype the gemmd operation's typed API -------------------------------- +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +BLIS_EXPORT_ADDON void PASTECH2(bao_,ch,opname) \ + ( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* d, inc_t incd, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +//INSERT_GENTPROT_BASIC0( gemmd ) +GENTPROT( float, s, gemmd ) +GENTPROT( double, d, gemmd ) +GENTPROT( scomplex, c, gemmd ) +GENTPROT( dcomplex, z, gemmd ) + diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c new file mode 100644 index 000000000..e042f1fd8 --- /dev/null +++ b/addon/gemmd/bao_gemmd_bp_var1.c @@ -0,0 +1,530 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmd_fp + +typedef void (*FUNCPTR_T) + ( + conj_t conja, + conj_t conjb, + dim_t m, + dim_t n, + dim_t k, + void* restrict alpha, + void* restrict a, inc_t rs_a, inc_t cs_a, + void* restrict d, inc_t incd, + void* restrict b, inc_t rs_b, inc_t cs_b, + void* restrict beta, + void* restrict c, inc_t rs_c, inc_t cs_c, + cntx_t* restrict cntx, + rntm_t* restrict rntm, + thrinfo_t* restrict thread + ); + +// +// -- gemmd-like block-panel algorithm (object interface) ---------------------- +// + +// Define a function pointer array named ftypes and initialize its contents with +// the addresses of the typed functions defined below, bao_?gemmd_bp_var1(). +static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var1); + +void bao_gemmd_bp_var1 + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + const num_t dt = bli_obj_dt( c ); + + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + void* restrict buf_a = bli_obj_buffer_at_off( a ); + const inc_t rs_a = bli_obj_row_stride( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + + void* restrict buf_d = bli_obj_buffer_at_off( d ); + const inc_t incd = bli_obj_vector_inc( d ); + + void* restrict buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t cs_b = bli_obj_col_stride( b ); + + void* restrict buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + + // Index into the function pointer array to extract the correct + // typed function pointer based on the chosen datatype. + FUNCPTR_T f = ftypes[dt]; + + // Invoke the function. + f + ( + conja, + conjb, + m, + n, + k, + buf_alpha, + buf_a, rs_a, cs_a, + buf_d, incd, + buf_b, rs_b, cs_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread + ); +} + +// +// -- gemmd-like block-panel algorithm (typed interface) ----------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict d, inc_t incd, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Query the context for various blocksizes. */ \ + const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \ + const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \ + const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + /* Query the context for the microkernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + /* Compute partitioning step values for each matrix of each loop. */ \ + const inc_t jcstep_c = cs_c; \ + const inc_t jcstep_b = cs_b; \ +\ + const inc_t pcstep_a = cs_a; \ + const inc_t pcstep_d = incd; \ + const inc_t pcstep_b = rs_b; \ +\ + const inc_t icstep_c = rs_c; \ + const inc_t icstep_a = rs_a; \ +\ + const inc_t jrstep_c = cs_c * NR; \ +\ + const inc_t irstep_c = rs_c * MR; \ +\ + ctype* restrict a_00 = a; \ + ctype* restrict d_00 = d; \ + ctype* restrict b_00 = b; \ + ctype* restrict c_00 = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + /* Make local copies of the scalars to prevent any unnecessary sharing of + cache lines between the cores' caches. */ \ + ctype alpha_local = *alpha_cast; \ + ctype beta_local = *beta_cast; \ + ctype one_local = *PASTEMAC(ch,1); \ + ctype zero_local = *PASTEMAC(ch,0); \ +\ + auxinfo_t aux; \ +\ + /* Initialize a mem_t entry for A and B. Strictly speaking, this is only + needed for the matrix we will be packing (if any), but we do it + unconditionally to be safe. */ \ + mem_t mem_a = BLIS_MEM_INITIALIZER; \ + mem_t mem_b = BLIS_MEM_INITIALIZER; \ +\ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. */ \ + bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \ + BLIS_KC, /* 4th loop */ \ + BLIS_NO_PART, /* pack B */ \ + BLIS_MC, /* 3rd loop */ \ + BLIS_NO_PART, /* pack A */ \ + BLIS_NR, /* 2nd loop */ \ + BLIS_MR, /* 1st loop */ \ + BLIS_KR }; /* microkernel loop */ \ +\ + bszid_t* restrict bszids_jc = &bszids[0]; \ + bszid_t* restrict bszids_pc = &bszids[1]; \ + /*bszid_t* restrict bszids_pb = &bszids[2];*/ \ + bszid_t* restrict bszids_ic = &bszids[3]; \ + /*bszid_t* restrict bszids_pa = &bszids[4];*/ \ + bszid_t* restrict bszids_jr = &bszids[5]; \ + /*bszid_t* restrict bszids_ir = &bszids[6];*/ \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ + thrinfo_t* restrict thread_ir = NULL; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. */ \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. */ \ + dim_t jc_start, jc_end; \ + bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ + const dim_t n_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = n_local % NC; \ +\ + /* Loop over the n dimension (NC rows/columns at a time). */ \ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ + { \ + /* Calculate the thread's current JC block dimension. */ \ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ +\ + ctype* restrict b_jc = b_00 + jj * jcstep_b; \ + ctype* restrict c_jc = c_00 + jj * jcstep_c; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. */ \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. */ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). */ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ +\ + ctype* restrict a_pc = a_00 + pp * pcstep_a; \ + ctype* restrict d_pc = d_00 + pp * pcstep_d; \ + ctype* restrict b_pc = b_jc + pp * pcstep_b; \ +\ + /* Only apply beta to the first iteration of the pc loop. */ \ + ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ +\ + ctype* b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Identify the current thrinfo_t node. Note that the thrinfo_t + node will have already been created by a previous call to + bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART + cause the tree to grow by two (e.g. to the next bszid that is + a normal bszid_t value). */ \ + thread_pb = bli_thrinfo_sub_node( thread_pc ); \ + /*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \ +\ + /* Determine the packing buffer and related parameters for matrix + B. Then call the packm implementation. */ \ + PASTECH2(bao_,ch,packm_b) \ + ( \ + conjb, \ + KC, NC, \ + kc_cur, nc_cur, NR, \ + &one_local, \ + d_pc, incd, \ + b_pc, rs_b, cs_b, \ + &b_use, &rs_b_use, &cs_b_use, \ + &ps_b_use, \ + cntx, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ + /* Alias b_use so that it's clear this is our current block of + matrix B. */ \ + ctype* restrict b_pc_use = b_use; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. */ \ + thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. */ \ + dim_t ic_start, ic_end; \ + bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ + const dim_t m_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = m_local % MC; \ +\ + /* Loop over the m dimension (MC rows at a time). */ \ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ + { \ + /* Calculate the thread's current IC block dimension. */ \ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ +\ + ctype* restrict a_ic = a_pc + ii * icstep_a; \ + ctype* restrict c_ic = c_jc + ii * icstep_c; \ +\ + ctype* a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Identify the current thrinfo_t node. Note that the thrinfo_t + node will have already been created by a previous call to + bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART + cause the tree to grow by two (e.g. to the next bszid that is + a normal bszid_t value). */ \ + thread_pa = bli_thrinfo_sub_node( thread_ic ); \ + /*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \ +\ + /* Determine the packing buffer and related parameters for matrix + A. Then call the packm implementation. */ \ + PASTECH2(bao_,ch,packm_a) \ + ( \ + conja, \ + MC, KC, \ + mc_cur, kc_cur, MR, \ + &one_local, \ + d_pc, incd, \ + a_ic, rs_a, cs_a, \ + &a_use, &rs_a_use, &cs_a_use, \ + &ps_a_use, \ + cntx, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ +\ + /* Alias a_use so that it's clear this is our current block of + matrix A. */ \ + ctype* restrict a_ic_use = a_use; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. */ \ + thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Query the number of threads and thread ids for the JR loop. + NOTE: These values are only needed when computing the next + micropanel of B. */ \ + const dim_t jr_nt = bli_thread_n_way( thread_jr ); \ + const dim_t jr_tid = bli_thread_work_id( thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. */ \ + dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ + dim_t jr_left = nc_cur % NR; \ +\ + /* Compute the JR loop thread range for the current thread. */ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ + { \ + const dim_t nr_cur \ + = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ +\ + ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ + ctype* restrict c_jr = c_ic + j * jrstep_c; \ +\ + /* Assume for now that our next panel of B to be the current panel + of B. */ \ + ctype* restrict b2 = b_jr; \ +\ + /* Identify the current thrinfo_t node. */ \ + thread_ir = bli_thrinfo_sub_node( thread_jr ); \ +\ + /* Query the number of threads and thread ids for the IR loop. + NOTE: These values are only needed when computing the next + micropanel of A. */ \ + const dim_t ir_nt = bli_thread_n_way( thread_ir ); \ + const dim_t ir_tid = bli_thread_work_id( thread_ir ); \ +\ + /* Compute number of primary and leftover components of the IR loop. */ \ + dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ + dim_t ir_left = mc_cur % MR; \ +\ + /* Compute the IR loop thread range for the current thread. */ \ + dim_t ir_start, ir_end; \ + bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( dim_t i = ir_start; i < ir_end; i += 1 ) \ + { \ + const dim_t mr_cur \ + = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \ +\ + ctype* restrict a_ir = a_ic_use + i * ps_a_use; \ + ctype* restrict c_ir = c_jr + i * irstep_c; \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next micropanels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_ic_use; \ + b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_pc_use; \ + } \ +\ + /* Save the addresses of next micropanels of A and B to the + auxinfo_t object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( mr_cur == MR && nr_cur == NR ) \ + { \ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + kc_cur, \ + &alpha_local, \ + a_ir, \ + b_jr, \ + beta_use, \ + c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + kc_cur, \ + &alpha_local, \ + a_ir, \ + b_jr, \ + &zero_local, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn) \ + ( \ + mr_cur, \ + nr_cur, \ + ct, rs_ct, cs_ct, \ + beta_use, \ + c_ir, rs_c, cs_c \ + ); \ + } \ + } \ + } \ + } \ +\ + /* This barrier is needed to prevent threads from starting to pack + the next row panel of B before the current row panel is fully + computed upon. */ \ + bli_thread_barrier( thread_pb ); \ + } \ + } \ +\ + /* Release any memory that was acquired for packing matrices A and B. */ \ + PASTECH2(bao_,ch,packm_finalize_mem_a) \ + ( \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ + PASTECH2(bao_,ch,packm_finalize_mem_b) \ + ( \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \ +*/ \ +} + +//INSERT_GENTFUNC_BASIC0( gemmd_bp_var1 ) +GENTFUNC( float, s, gemmd_bp_var1 ) +GENTFUNC( double, d, gemmd_bp_var1 ) +GENTFUNC( scomplex, c, gemmd_bp_var1 ) +GENTFUNC( dcomplex, z, gemmd_bp_var1 ) + diff --git a/addon/gemmd/bao_gemmd_bp_var2.c b/addon/gemmd/bao_gemmd_bp_var2.c new file mode 100644 index 000000000..a0040fec0 --- /dev/null +++ b/addon/gemmd/bao_gemmd_bp_var2.c @@ -0,0 +1,602 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmd_fp + +typedef void (*FUNCPTR_T) + ( + conj_t conja, + conj_t conjb, + dim_t m, + dim_t n, + dim_t k, + void* restrict alpha, + void* restrict a, inc_t rs_a, inc_t cs_a, + void* restrict d, inc_t incd, + void* restrict b, inc_t rs_b, inc_t cs_b, + void* restrict beta, + void* restrict c, inc_t rs_c, inc_t cs_c, + cntx_t* restrict cntx, + rntm_t* restrict rntm, + thrinfo_t* restrict thread + ); + +// +// -- gemmd-like block-panel algorithm (object interface) ---------------------- +// + +// Define a function pointer array named ftypes and initialize its contents with +// the addresses of the typed functions defined below, bao_?gemmd_bp_var2(). +static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var2); + +void bao_gemmd_bp_var2 + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + const num_t dt = bli_obj_dt( c ); + + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + void* restrict buf_a = bli_obj_buffer_at_off( a ); + const inc_t rs_a = bli_obj_row_stride( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + + void* restrict buf_d = bli_obj_buffer_at_off( d ); + const inc_t incd = bli_obj_vector_inc( d ); + + void* restrict buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t cs_b = bli_obj_col_stride( b ); + + void* restrict buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + + // Index into the function pointer array to extract the correct + // typed function pointer based on the chosen datatype. + FUNCPTR_T f = ftypes[dt]; + + // Invoke the function. + f + ( + conja, + conjb, + m, + n, + k, + buf_alpha, + buf_a, rs_a, cs_a, + buf_d, incd, + buf_b, rs_b, cs_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread + ); +} + +// +// -- gemmd-like block-panel algorithm (typed interface) ----------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict d, inc_t incd, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Query the context for various blocksizes. */ \ + const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \ + const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \ + const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + /* Query the context for the microkernel address and cast it to its + function pointer type. */ \ + /* + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + */ \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + /* + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ + */ \ +\ + /* Compute partitioning step values for each matrix of each loop. */ \ + const inc_t jcstep_c = cs_c; \ + const inc_t jcstep_b = cs_b; \ +\ + const inc_t pcstep_a = cs_a; \ + const inc_t pcstep_d = incd; \ + const inc_t pcstep_b = rs_b; \ +\ + const inc_t icstep_c = rs_c; \ + const inc_t icstep_a = rs_a; \ +\ + const inc_t jrstep_c = cs_c * NR; \ +\ + const inc_t irstep_c = rs_c * MR; \ +\ + ctype* restrict a_00 = a; \ + ctype* restrict d_00 = d; \ + ctype* restrict b_00 = b; \ + ctype* restrict c_00 = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + /* Make local copies of the scalars to prevent any unnecessary sharing of + cache lines between the cores' caches. */ \ + ctype alpha_local = *alpha_cast; \ + ctype beta_local = *beta_cast; \ + ctype one_local = *PASTEMAC(ch,1); \ + /*ctype zero_local = *PASTEMAC(ch,0);*/ \ +\ + auxinfo_t aux; \ +\ + /* Initialize a mem_t entry for A and B. Strictly speaking, this is only + needed for the matrix we will be packing (if any), but we do it + unconditionally to be safe. */ \ + mem_t mem_a = BLIS_MEM_INITIALIZER; \ + mem_t mem_b = BLIS_MEM_INITIALIZER; \ +\ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. */ \ + bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \ + BLIS_KC, /* 4th loop */ \ + BLIS_NO_PART, /* pack B */ \ + BLIS_MC, /* 3rd loop */ \ + BLIS_NO_PART, /* pack A */ \ + BLIS_NR, /* 2nd loop */ \ + BLIS_MR, /* 1st loop */ \ + BLIS_KR }; /* microkernel loop */ \ +\ + bszid_t* restrict bszids_jc = &bszids[0]; \ + bszid_t* restrict bszids_pc = &bszids[1]; \ + /*bszid_t* restrict bszids_pb = &bszids[2];*/ \ + bszid_t* restrict bszids_ic = &bszids[3]; \ + /*bszid_t* restrict bszids_pa = &bszids[4];*/ \ + bszid_t* restrict bszids_jr = &bszids[5]; \ + /*bszid_t* restrict bszids_ir = &bszids[6];*/ \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ + thrinfo_t* restrict thread_ir = NULL; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. */ \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. */ \ + dim_t jc_start, jc_end; \ + bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ + const dim_t n_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = n_local % NC; \ +\ + /* Loop over the n dimension (NC rows/columns at a time). */ \ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ + { \ + /* Calculate the thread's current JC block dimension. */ \ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ +\ + ctype* restrict b_jc = b_00 + jj * jcstep_b; \ + ctype* restrict c_jc = c_00 + jj * jcstep_c; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. */ \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. */ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). */ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ +\ + ctype* restrict a_pc = a_00 + pp * pcstep_a; \ + ctype* restrict d_pc = d_00 + pp * pcstep_d; \ + ctype* restrict b_pc = b_jc + pp * pcstep_b; \ +\ + /* Only apply beta to the first iteration of the pc loop. */ \ + ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ +\ + ctype* b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Identify the current thrinfo_t node. Note that the thrinfo_t + node will have already been created by a previous call to + bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART + cause the tree to grow by two (e.g. to the next bszid that is + a normal bszid_t value). */ \ + thread_pb = bli_thrinfo_sub_node( thread_pc ); \ + /*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \ +\ + /* Determine the packing buffer and related parameters for matrix + B. Then call the packm implementation. */ \ + PASTECH2(bao_,ch,packm_b) \ + ( \ + conjb, \ + KC, NC, \ + kc_cur, nc_cur, NR, \ + &one_local, \ + d_pc, incd, \ + b_pc, rs_b, cs_b, \ + &b_use, &rs_b_use, &cs_b_use, \ + &ps_b_use, \ + cntx, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ + /* Alias b_use so that it's clear this is our current block of + matrix B. */ \ + ctype* restrict b_pc_use = b_use; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. */ \ + thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. */ \ + dim_t ic_start, ic_end; \ + bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ + const dim_t m_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = m_local % MC; \ +\ + /* Loop over the m dimension (MC rows at a time). */ \ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ + { \ + /* Calculate the thread's current IC block dimension. */ \ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ +\ + ctype* restrict a_ic = a_pc + ii * icstep_a; \ + ctype* restrict c_ic = c_jc + ii * icstep_c; \ +\ + ctype* a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Identify the current thrinfo_t node. Note that the thrinfo_t + node will have already been created by a previous call to + bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART + cause the tree to grow by two (e.g. to the next bszid that is + a normal bszid_t value). */ \ + thread_pa = bli_thrinfo_sub_node( thread_ic ); \ + /*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \ +\ + /* Determine the packing buffer and related parameters for matrix + A. Then call the packm implementation. */ \ + PASTECH2(bao_,ch,packm_a) \ + ( \ + conja, \ + MC, KC, \ + mc_cur, kc_cur, MR, \ + &one_local, \ + d_pc, incd, \ + a_ic, rs_a, cs_a, \ + &a_use, &rs_a_use, &cs_a_use, \ + &ps_a_use, \ + cntx, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ +\ + /* Alias a_use so that it's clear this is our current block of + matrix A. */ \ + ctype* restrict a_ic_use = a_use; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. */ \ + thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Query the number of threads and thread ids for the JR loop. + NOTE: These values are only needed when computing the next + micropanel of B. */ \ + const dim_t jr_nt = bli_thread_n_way( thread_jr ); \ + const dim_t jr_tid = bli_thread_work_id( thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. */ \ + dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ + dim_t jr_left = nc_cur % NR; \ +\ + /* Compute the JR loop thread range for the current thread. */ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ + { \ + const dim_t nr_cur \ + = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ +\ + ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ + ctype* restrict c_jr = c_ic + j * jrstep_c; \ +\ + /* Assume for now that our next panel of B to be the current panel + of B. */ \ + ctype* restrict b2 = b_jr; \ +\ + /* Identify the current thrinfo_t node. */ \ + thread_ir = bli_thrinfo_sub_node( thread_jr ); \ +\ + /* Query the number of threads and thread ids for the IR loop. + NOTE: These values are only needed when computing the next + micropanel of A. */ \ + const dim_t ir_nt = bli_thread_n_way( thread_ir ); \ + const dim_t ir_tid = bli_thread_work_id( thread_ir ); \ +\ + /* Compute number of primary and leftover components of the IR loop. */ \ + dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ + dim_t ir_left = mc_cur % MR; \ +\ + /* Compute the IR loop thread range for the current thread. */ \ + dim_t ir_start, ir_end; \ + bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \ +\ + /* Loop over the m dimension (MR rows at a time). */ \ + for ( dim_t i = ir_start; i < ir_end; i += 1 ) \ + { \ + const dim_t mr_cur \ + = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \ +\ + ctype* restrict a_ir = a_ic_use + i * ps_a_use; \ + ctype* restrict c_ir = c_jr + i * irstep_c; \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next micropanels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_ic_use; \ + b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_pc_use; \ + } \ +\ + /* Save the addresses of next micropanels of A and B to the + auxinfo_t object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Call a wrapper to the kernel (which handles edge cases). */ \ + PASTECH2(bao_,ch,gemm_kernel) \ + ( \ + MR, \ + NR, \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + &alpha_local, \ + a_ir, rs_a_use, cs_a_use, \ + b_jr, rs_b_use, cs_b_use, \ + beta_use, \ + c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +\ + /* This barrier is needed to prevent threads from starting to pack + the next row panel of B before the current row panel is fully + computed upon. */ \ + bli_thread_barrier( thread_pb ); \ + } \ + } \ +\ + /* Release any memory that was acquired for packing matrices A and B. */ \ + PASTECH2(bao_,ch,packm_finalize_mem_a) \ + ( \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ + PASTECH2(bao_,ch,packm_finalize_mem_b) \ + ( \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \ +*/ \ +} + +//INSERT_GENTFUNC_BASIC0( gemmd_bp_var2 ) +GENTFUNC( float, s, gemmd_bp_var2 ) +GENTFUNC( double, d, gemmd_bp_var2 ) +GENTFUNC( scomplex, c, gemmd_bp_var2 ) +GENTFUNC( dcomplex, z, gemmd_bp_var2 ) + +// +// -- gemm-like microkernel wrapper -------------------------------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + const dim_t MR, \ + const dim_t NR, \ + dim_t mr_cur, \ + dim_t nr_cur, \ + dim_t kc_cur, \ + ctype* restrict alpha, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict aux, \ + cntx_t* restrict cntx \ + ) \ +{ \ + /* Infer the datatype from the ctype. */ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Query the context for the microkernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype zero = *PASTEMAC(ch,0); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( mr_cur == MR && nr_cur == NR ) \ + { \ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + kc_cur, \ + alpha, \ + a, \ + b, \ + beta, \ + c, rs_c, cs_c, \ + aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + kc_cur, \ + alpha, \ + a, \ + b, \ + &zero, \ + ct, rs_ct, cs_ct, \ + aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn) \ + ( \ + mr_cur, \ + nr_cur, \ + ct, rs_ct, cs_ct, \ + beta, \ + c, rs_c, cs_c \ + ); \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( gemm_kernel ) +GENTFUNC( float, s, gemm_kernel ) +GENTFUNC( double, d, gemm_kernel ) +GENTFUNC( scomplex, c, gemm_kernel ) +GENTFUNC( dcomplex, z, gemm_kernel ) + diff --git a/addon/gemmd/bao_gemmd_check.c b/addon/gemmd/bao_gemmd_check.c new file mode 100644 index 000000000..864e9a1ac --- /dev/null +++ b/addon/gemmd/bao_gemmd_check.c @@ -0,0 +1,131 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bao_gemmd_check + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_noninteger_object( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_noninteger_object( beta ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( c ); + bli_check_error_code( e_val ); + + // Check scalar/vector/matrix type. + + e_val = bli_check_scalar_object( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_scalar_object( beta ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_object( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( c ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). + + e_val = bli_check_object_buffer( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( beta ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( c ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_level3_dims( a, b, c ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_dim_equals( d, bli_obj_width_after_trans( a ) ); + bli_check_error_code( e_val ); + + // Check for consistent datatypes. + // NOTE: We only perform these tests when mixed datatype support is + // disabled. + + e_val = bli_check_consistent_object_datatypes( c, a ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, d ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, b ); + bli_check_error_code( e_val ); +} + diff --git a/addon/gemmd/bao_gemmd_check.h b/addon/gemmd/bao_gemmd_check.h new file mode 100644 index 000000000..243ec70c8 --- /dev/null +++ b/addon/gemmd/bao_gemmd_check.h @@ -0,0 +1,50 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based check functions. +// + +void bao_gemmd_check + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx + ); + diff --git a/addon/gemmd/bao_gemmd_var.h b/addon/gemmd/bao_gemmd_var.h new file mode 100644 index 000000000..5c6674727 --- /dev/null +++ b/addon/gemmd/bao_gemmd_var.h @@ -0,0 +1,126 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype the object-based variant interfaces. +// + +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTECH(bao_,opname) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* d, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ); + +GENPROT( gemmd_bp_var1 ) +GENPROT( gemmd_bp_var2 ) + + +// +// Prototype the typed variant interfaces. +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict d, inc_t incd, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ); + +//INSERT_GENTPROT_BASIC0( gemmd_bp_var1 ) +GENTPROT( float, s, gemmd_bp_var1 ) +GENTPROT( double, d, gemmd_bp_var1 ) +GENTPROT( scomplex, c, gemmd_bp_var1 ) +GENTPROT( dcomplex, z, gemmd_bp_var1 ) + +//INSERT_GENTPROT_BASIC0( gemmd_bp_var2 ) +GENTPROT( float, s, gemmd_bp_var2 ) +GENTPROT( double, d, gemmd_bp_var2 ) +GENTPROT( scomplex, c, gemmd_bp_var2 ) +GENTPROT( dcomplex, z, gemmd_bp_var2 ) + + +// +// Prototype the typed kernel interfaces. +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + const dim_t MR, \ + const dim_t NR, \ + dim_t mr_cur, \ + dim_t nr_cur, \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict aux, \ + cntx_t* restrict cntx \ + ); + +//INSERT_GENTPROT_BASIC0( gemm_kernel ) +GENTPROT( float, s, gemm_kernel ) +GENTPROT( double, d, gemm_kernel ) +GENTPROT( scomplex, c, gemm_kernel ) +GENTPROT( dcomplex, z, gemm_kernel ) + diff --git a/addon/gemmd/bao_l3_packm_a.c b/addon/gemmd/bao_l3_packm_a.c new file mode 100644 index 000000000..49bb34664 --- /dev/null +++ b/addon/gemmd/bao_l3_packm_a.c @@ -0,0 +1,330 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + /* Set the pack buffer type so that we are obtaining memory blocks from + the pool dedicated to blocks of A. */ \ + const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; \ +\ + /* NOTE: This "rounding up" of the last upanel is absolutely necessary since + we NEED that last micropanel to have the same ldim (cs_p) as the other + micropanels. Why? Because the microkernel assumes that the register (MR, + NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \ + const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ + const dim_t k_pack = k; \ +\ + /* Barrier to make sure all threads are caught up and ready to begin the + packm stage. */ \ + bli_thread_barrier( thread ); \ +\ + /* Compute the size of the memory block eneded. */ \ + siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ +\ + /* Check the mem_t entry provided by the caller. If it is unallocated, + then we need to acquire a block from the packed block allocator. */ \ + if ( bli_mem_is_unalloc( mem ) ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* Acquire directly to the chief thread's mem_t that was passed in. + It needs to be that mem_t struct, and not a local (temporary) + mem_t, since there is no barrier until after packing is finished, + which could allow a race condition whereby the chief thread exits + the current function before the other threads have a chance to + copy from it. (A barrier would fix that race condition, but then + again, I prefer to keep barriers to a minimum.) */ \ + bli_pba_acquire_m \ + ( \ + rntm, \ + size_needed, \ + pack_buf_type, \ + mem \ + ); \ + } \ +\ + /* Broadcast the address of the chief thread's passed-in mem_t to all + threads. */ \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ +\ + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ \ + if ( !bli_thread_am_ochief( thread ) ) \ + { \ + *mem = *mem_p; \ + } \ + } \ + else /* if ( bli_mem_is_alloc( mem ) ) */ \ + { \ + /* If the mem_t entry provided by the caller does NOT contain a NULL + buffer, then a block has already been acquired from the packed + block allocator and cached by the caller. */ \ +\ + /* As a sanity check, we should make sure that the mem_t object isn't + associated with a block that is too small compared to the size of + the packed matrix buffer that is needed, according to the value + computed above. */ \ + siz_t mem_size = bli_mem_size( mem ); \ +\ + if ( mem_size < size_needed ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* The chief thread releases the existing block associated + with the mem_t, and then re-acquires a new block, saving + the associated mem_t to its passed-in mem_t. (See coment + above for why the acquisition needs to be directly to + the chief thread's passed-in mem_t and not a local + (temporary) mem_t. */ \ + bli_pba_release \ + ( \ + rntm, \ + mem \ + ); \ + bli_pba_acquire_m \ + ( \ + rntm, \ + size_needed, \ + pack_buf_type, \ + mem \ + ); \ + } \ +\ + /* Broadcast the address of the chief thread's passed-in mem_t + to all threads. */ \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ +\ + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ \ + if ( !bli_thread_am_ochief( thread ) ) \ + { \ + *mem = *mem_p; \ + } \ + } \ + else \ + { \ + /* If the mem_t entry is already allocated and sufficiently large, + then we use it as-is. No action is needed. */ \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_init_mem_a ) +GENTFUNC( float, s, packm_init_mem_a ) +GENTFUNC( double, d, packm_init_mem_a ) +GENTFUNC( scomplex, c, packm_init_mem_a ) +GENTFUNC( dcomplex, z, packm_init_mem_a ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + if ( thread != NULL ) \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* Check the mem_t entry provided by the caller. Only proceed if it + is allocated, which it should be. */ \ + if ( bli_mem_is_alloc( mem ) ) \ + { \ + bli_pba_release \ + ( \ + rntm, \ + mem \ + ); \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a ) +GENTFUNC( float, s, packm_finalize_mem_a ) +GENTFUNC( double, d, packm_finalize_mem_a ) +GENTFUNC( scomplex, c, packm_finalize_mem_a ) +GENTFUNC( dcomplex, z, packm_finalize_mem_a ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + pack_t* restrict schema, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + dim_t* restrict m_max, \ + dim_t* restrict k_max, \ + ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + dim_t* restrict pd_p, inc_t* restrict ps_p, \ + mem_t* restrict mem \ + ) \ +{ \ + /* NOTE: This "rounding up" of the last upanel is absolutely necessary since + we NEED that last micropanel to have the same ldim (cs_p) as the other + micropanels. Why? Because the microkernel assumes that the register (MR, + NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \ + *m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ + *k_max = k; \ +\ + /* Determine the dimensions and strides for the packed matrix A. */ \ + { \ + /* Pack A to column-stored row-panels. */ \ + *rs_p = 1; \ + *cs_p = mr; \ +\ + *pd_p = mr; \ + *ps_p = mr * k; \ +\ + /* Set the schema to "packed row panels" to indicate packing to + conventional column-stored row panels. */ \ + *schema = BLIS_PACKED_ROW_PANELS; \ + } \ +\ + /* Set the buffer address provided by the caller to point to the memory + associated with the mem_t entry acquired from the memory pool. */ \ + *p = bli_mem_buffer( mem ); \ +} + +//INSERT_GENTFUNC_BASIC0( packm_init_a ) +GENTFUNC( float, s, packm_init_a ) +GENTFUNC( double, d, packm_init_a ) +GENTFUNC( scomplex, c, packm_init_a ) +GENTFUNC( dcomplex, z, packm_init_a ) + + +// +// Define BLAS-like interfaces to the variant chooser. +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + conj_t conj, \ + dim_t m_alloc, \ + dim_t k_alloc, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + inc_t* restrict ps_p, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + pack_t schema; \ + dim_t m_max; \ + dim_t k_max; \ + dim_t pd_p; \ +\ + /* Prepare the packing destination buffer. */ \ + PASTECH2(bao_,ch,packm_init_mem_a) \ + ( \ + m_alloc, k_alloc, mr, \ + cntx, \ + rntm, \ + mem, \ + thread \ + ); \ +\ + /* Determine the packing buffer and related parameters for matrix A. */ \ + PASTECH2(bao_,ch,packm_init_a) \ + ( \ + &schema, \ + m, k, mr, \ + &m_max, &k_max, \ + p, rs_p, cs_p, \ + &pd_p, ps_p, \ + mem \ + ); \ +\ + /* Pack matrix A to the destination buffer chosen above. Here, the packed + matrix is stored to column-stored MR x k micropanels. */ \ + PASTECH2(bao_,ch,packm_var1) \ + ( \ + conj, \ + schema, \ + m, \ + k, \ + m_max, \ + k_max, \ + kappa, \ + d, incd, \ + a, rs_a, cs_a, \ + *p, *rs_p, *cs_p, \ + pd_p, *ps_p, \ + cntx, \ + thread \ + ); \ +\ + /* Barrier so that packing is done before computation. */ \ + bli_thread_barrier( thread ); \ +} + +//INSERT_GENTFUNC_BASIC0( packm_a ) +GENTFUNC( float, s, packm_a ) +GENTFUNC( double, d, packm_a ) +GENTFUNC( scomplex, c, packm_a ) +GENTFUNC( dcomplex, z, packm_a ) + diff --git a/addon/gemmd/bao_l3_packm_a.h b/addon/gemmd/bao_l3_packm_a.h new file mode 100644 index 000000000..b683b79d4 --- /dev/null +++ b/addon/gemmd/bao_l3_packm_a.h @@ -0,0 +1,123 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_init_mem_a ) +GENTPROT( float, s, packm_init_mem_a ) +GENTPROT( double, d, packm_init_mem_a ) +GENTPROT( scomplex, c, packm_init_mem_a ) +GENTPROT( dcomplex, z, packm_init_mem_a ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a ) +GENTPROT( float, s, packm_finalize_mem_a ) +GENTPROT( double, d, packm_finalize_mem_a ) +GENTPROT( scomplex, c, packm_finalize_mem_a ) +GENTPROT( dcomplex, z, packm_finalize_mem_a ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + pack_t* restrict schema, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + dim_t* restrict m_max, \ + dim_t* restrict k_max, \ + ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + dim_t* restrict pd_p, inc_t* restrict ps_p, \ + mem_t* restrict mem \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_init_a ) +GENTPROT( float, s, packm_init_a ) +GENTPROT( double, d, packm_init_a ) +GENTPROT( scomplex, c, packm_init_a ) +GENTPROT( dcomplex, z, packm_init_a ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + conj_t conj, \ + dim_t m_alloc, \ + dim_t k_alloc, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + inc_t* restrict ps_p, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_a ) +GENTPROT( float, s, packm_a ) +GENTPROT( double, d, packm_a ) +GENTPROT( scomplex, c, packm_a ) +GENTPROT( dcomplex, z, packm_a ) + diff --git a/addon/gemmd/bao_l3_packm_b.c b/addon/gemmd/bao_l3_packm_b.c new file mode 100644 index 000000000..c41b062b6 --- /dev/null +++ b/addon/gemmd/bao_l3_packm_b.c @@ -0,0 +1,330 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + /* Set the pack buffer type so that we are obtaining memory blocks from + the pool dedicated to panels of B. */ \ + const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_B_PANEL; \ +\ + /* NOTE: This "rounding up" of the last upanel is absolutely necessary since + we NEED that last micropanel to have the same ldim (cs_p) as the other + micropanels. Why? Because the microkernel assumes that the register (MR, + NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \ + const dim_t k_pack = k; \ + const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ +\ + /* Barrier to make sure all threads are caught up and ready to begin the + packm stage. */ \ + bli_thread_barrier( thread ); \ +\ + /* Compute the size of the memory block eneded. */ \ + siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ +\ + /* Check the mem_t entry provided by the caller. If it is unallocated, + then we need to acquire a block from the packed block allocator. */ \ + if ( bli_mem_is_unalloc( mem ) ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* Acquire directly to the chief thread's mem_t that was passed in. + It needs to be that mem_t struct, and not a local (temporary) + mem_t, since there is no barrier until after packing is finished, + which could allow a race condition whereby the chief thread exits + the current function before the other threads have a chance to + copy from it. (A barrier would fix that race condition, but then + again, I prefer to keep barriers to a minimum.) */ \ + bli_pba_acquire_m \ + ( \ + rntm, \ + size_needed, \ + pack_buf_type, \ + mem \ + ); \ + } \ +\ + /* Broadcast the address of the chief thread's passed-in mem_t to all + threads. */ \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ +\ + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ \ + if ( !bli_thread_am_ochief( thread ) ) \ + { \ + *mem = *mem_p; \ + } \ + } \ + else /* if ( bli_mem_is_alloc( mem ) ) */ \ + { \ + /* If the mem_t entry provided by the caller does NOT contain a NULL + buffer, then a block has already been acquired from the packed + block allocator and cached by the caller. */ \ +\ + /* As a sanity check, we should make sure that the mem_t object isn't + associated with a block that is too small compared to the size of + the packed matrix buffer that is needed, according to the value + computed above. */ \ + siz_t mem_size = bli_mem_size( mem ); \ +\ + if ( mem_size < size_needed ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* The chief thread releases the existing block associated + with the mem_t, and then re-acquires a new block, saving + the associated mem_t to its passed-in mem_t. (See coment + above for why the acquisition needs to be directly to + the chief thread's passed-in mem_t and not a local + (temporary) mem_t. */ \ + bli_pba_release \ + ( \ + rntm, \ + mem \ + ); \ + bli_pba_acquire_m \ + ( \ + rntm, \ + size_needed, \ + pack_buf_type, \ + mem \ + ); \ + } \ +\ + /* Broadcast the address of the chief thread's passed-in mem_t + to all threads. */ \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ +\ + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ \ + if ( !bli_thread_am_ochief( thread ) ) \ + { \ + *mem = *mem_p; \ + } \ + } \ + else \ + { \ + /* If the mem_t entry is already allocated and sufficiently large, + then we use it as-is. No action is needed. */ \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_init_mem_b ) +GENTFUNC( float, s, packm_init_mem_b ) +GENTFUNC( double, d, packm_init_mem_b ) +GENTFUNC( scomplex, c, packm_init_mem_b ) +GENTFUNC( dcomplex, z, packm_init_mem_b ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + if ( thread != NULL ) \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* Check the mem_t entry provided by the caller. Only proceed if it + is allocated, which it should be. */ \ + if ( bli_mem_is_alloc( mem ) ) \ + { \ + bli_pba_release \ + ( \ + rntm, \ + mem \ + ); \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b ) +GENTFUNC( float, s, packm_finalize_mem_b ) +GENTFUNC( double, d, packm_finalize_mem_b ) +GENTFUNC( scomplex, c, packm_finalize_mem_b ) +GENTFUNC( dcomplex, z, packm_finalize_mem_b ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + pack_t* restrict schema, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + dim_t* restrict k_max, \ + dim_t* restrict n_max, \ + ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + dim_t* restrict pd_p, inc_t* restrict ps_p, \ + mem_t* restrict mem \ + ) \ +{ \ + /* NOTE: This "rounding up" of the last upanel is absolutely necessary since + we NEED that last micropanel to have the same ldim (cs_p) as the other + micropanels. Why? Because the microkernel assumes that the register (MR, + NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \ + *k_max = k; \ + *n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ +\ + /* Determine the dimensions and strides for the packed matrix B. */ \ + { \ + /* Pack B to row-stored column-panels. */ \ + *rs_p = nr; \ + *cs_p = 1; \ +\ + *pd_p = nr; \ + *ps_p = k * nr; \ +\ + /* Set the schema to "packed column panels" to indicate packing to + conventional row-stored column panels. */ \ + *schema = BLIS_PACKED_COL_PANELS; \ + } \ +\ + /* Set the buffer address provided by the caller to point to the memory + associated with the mem_t entry acquired from the memory pool. */ \ + *p = bli_mem_buffer( mem ); \ +} + +//INSERT_GENTFUNC_BASIC0( packm_init_b ) +GENTFUNC( float, s, packm_init_b ) +GENTFUNC( double, d, packm_init_b ) +GENTFUNC( scomplex, c, packm_init_b ) +GENTFUNC( dcomplex, z, packm_init_b ) + + +// +// Define BLAS-like interfaces to the variant chooser. +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + conj_t conj, \ + dim_t k_alloc, \ + dim_t n_alloc, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + inc_t* restrict ps_p, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + pack_t schema; \ + dim_t k_max; \ + dim_t n_max; \ + dim_t pd_p; \ +\ + /* Prepare the packing destination buffer. */ \ + PASTECH2(bao_,ch,packm_init_mem_b) \ + ( \ + k_alloc, n_alloc, nr, \ + cntx, \ + rntm, \ + mem, \ + thread \ + ); \ +\ + /* Determine the packing buffer and related parameters for matrix B. */ \ + PASTECH2(bao_,ch,packm_init_b) \ + ( \ + &schema, \ + k, n, nr, \ + &k_max, &n_max, \ + p, rs_p, cs_p, \ + &pd_p, ps_p, \ + mem \ + ); \ +\ + /* Pack matrix B to the destination buffer chosen above. Here, the packed + matrix is stored to row-stored k x NR micropanels. */ \ + PASTECH2(bao_,ch,packm_var1) \ + ( \ + conj, \ + schema, \ + k, \ + n, \ + k_max, \ + n_max, \ + kappa, \ + d, incd, \ + b, rs_b, cs_b, \ + *p, *rs_p, *cs_p, \ + pd_p, *ps_p, \ + cntx, \ + thread \ + ); \ +\ + /* Barrier so that packing is done before computation. */ \ + bli_thread_barrier( thread ); \ +} + +//INSERT_GENTFUNC_BASIC0( packm_b ) +GENTFUNC( float, s, packm_b ) +GENTFUNC( double, d, packm_b ) +GENTFUNC( scomplex, c, packm_b ) +GENTFUNC( dcomplex, z, packm_b ) + diff --git a/addon/gemmd/bao_l3_packm_b.h b/addon/gemmd/bao_l3_packm_b.h new file mode 100644 index 000000000..9161604ce --- /dev/null +++ b/addon/gemmd/bao_l3_packm_b.h @@ -0,0 +1,123 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_init_mem_b ) +GENTPROT( float, s, packm_init_mem_b ) +GENTPROT( double, d, packm_init_mem_b ) +GENTPROT( scomplex, c, packm_init_mem_b ) +GENTPROT( dcomplex, z, packm_init_mem_b ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b ) +GENTPROT( float, s, packm_finalize_mem_b ) +GENTPROT( double, d, packm_finalize_mem_b ) +GENTPROT( scomplex, c, packm_finalize_mem_b ) +GENTPROT( dcomplex, z, packm_finalize_mem_b ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + pack_t* restrict schema, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + dim_t* restrict k_max, \ + dim_t* restrict n_max, \ + ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + dim_t* restrict pd_p, inc_t* restrict ps_p, \ + mem_t* restrict mem \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_init_b ) +GENTPROT( float, s, packm_init_b ) +GENTPROT( double, d, packm_init_b ) +GENTPROT( scomplex, c, packm_init_b ) +GENTPROT( dcomplex, z, packm_init_b ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + conj_t conj, \ + dim_t k_alloc, \ + dim_t n_alloc, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + inc_t* restrict ps_p, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_b ) +GENTPROT( float, s, packm_b ) +GENTPROT( double, d, packm_b ) +GENTPROT( scomplex, c, packm_b ) +GENTPROT( dcomplex, z, packm_b ) + diff --git a/addon/gemmd/bao_l3_packm_var.h b/addon/gemmd/bao_l3_packm_var.h new file mode 100644 index 000000000..063e59e5f --- /dev/null +++ b/addon/gemmd/bao_l3_packm_var.h @@ -0,0 +1,69 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// +// Prototype BLAS-like interfaces to the variants. +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ); + +//INSERT_GENTPROT_BASIC0( packm_var1 ) +GENTPROT( float, s, packm_var1 ) +GENTPROT( double, d, packm_var1 ) +GENTPROT( scomplex, c, packm_var1 ) +GENTPROT( dcomplex, z, packm_var1 ) + +//INSERT_GENTPROT_BASIC0( packm_var2 ) +GENTPROT( float, s, packm_var2 ) +GENTPROT( double, d, packm_var2 ) +GENTPROT( scomplex, c, packm_var2 ) +GENTPROT( dcomplex, z, packm_var2 ) diff --git a/addon/gemmd/bao_l3_packm_var1.c b/addon/gemmd/bao_l3_packm_var1.c new file mode 100644 index 000000000..24c0a2cc1 --- /dev/null +++ b/addon/gemmd/bao_l3_packm_var1.c @@ -0,0 +1,195 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Variant 1 provides basic support for packing by calling packm_cxk(). +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict c_cast = c; \ + ctype* restrict p_cast = p; \ +\ + dim_t iter_dim; \ + dim_t n_iter; \ + dim_t it, ic; \ + dim_t ic0; \ + doff_t ic_inc; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + dim_t panel_dim; \ + dim_t panel_dim_max; \ + inc_t incc; \ + inc_t ldc; \ + inc_t ldp; \ + conj_t conjc; \ +\ +\ + /* Extract the conjugation bit from the transposition argument. */ \ + conjc = bli_extract_conj( transc ); \ +\ + /* Create flags to incidate row or column storage. Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + bool row_stored = bli_is_col_packed( schema ); \ + /*bool col_stored = bli_is_row_packed( schema );*/ \ +\ + /* If the row storage flag indicates row storage, then we are packing + to column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( row_stored ) \ + { \ + /* Prepare to pack to row-stored column panels. */ \ + iter_dim = n; \ + panel_len = m; \ + panel_len_max = m_max; \ + panel_dim_max = pd_p; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( col_stored ) */ \ + { \ + /* Prepare to pack to column-stored row panels. */ \ + iter_dim = m; \ + panel_len = n; \ + panel_len_max = n_max; \ + panel_dim_max = pd_p; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* Compute the total number of iterations we'll need. */ \ + n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ +\ + /* Set the initial values and increments for indices related to C and P + based on whether reverse iteration was requested. */ \ + { \ + ic0 = 0; \ + ic_inc = panel_dim_max; \ + } \ +\ + ctype* restrict p_begin = p_cast; \ +\ + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. */ \ + const dim_t nt = bli_thread_n_way( thread ); \ + const dim_t tid = bli_thread_work_id( thread ); \ +\ + /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ + ( void )nt; \ + ( void )tid; \ +\ + dim_t it_start, it_end, it_inc; \ +\ + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ +\ + /* Iterate over every logical micropanel in the source matrix. */ \ + for ( ic = ic0, it = 0; it < n_iter; \ + ic += ic_inc, it += 1 ) \ + { \ + panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \ +\ + ctype* restrict c_begin = c_cast + (ic )*incc; \ +\ + ctype* restrict c_use = c_begin; \ + ctype* restrict p_use = p_begin; \ +\ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. (The + default is slab.) */ \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + { \ + PASTECH2(bao_,ch,packm_cxk) \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_dim_max, \ + panel_len, \ + panel_len_max, \ + kappa_cast, \ + d, incd, \ + c_use, incc, ldc, \ + p_use, ldp, \ + cntx \ + ); \ + } \ +\ +/* +if ( !row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +else \ +PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +*/ \ +\ + p_begin += ps_p; \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_var1 ) +GENTFUNC( float, s, packm_var1 ) +GENTFUNC( double, d, packm_var1 ) +GENTFUNC( scomplex, c, packm_var1 ) +GENTFUNC( dcomplex, z, packm_var1 ) + diff --git a/addon/gemmd/bao_l3_packm_var2.c b/addon/gemmd/bao_l3_packm_var2.c new file mode 100644 index 000000000..830e499b3 --- /dev/null +++ b/addon/gemmd/bao_l3_packm_var2.c @@ -0,0 +1,245 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Variant 2 is similar to variant 1, but inlines the contents of packm_cxk(). +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict c_cast = c; \ + ctype* restrict p_cast = p; \ +\ + dim_t iter_dim; \ + dim_t n_iter; \ + dim_t it, ic; \ + dim_t ic0; \ + doff_t ic_inc; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + dim_t panel_dim; \ + dim_t panel_dim_max; \ + inc_t incc; \ + inc_t ldc; \ + inc_t ldp; \ + conj_t conjc; \ +\ +\ + /* Extract the conjugation bit from the transposition argument. */ \ + conjc = bli_extract_conj( transc ); \ +\ + /* Create flags to incidate row or column storage. Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + bool row_stored = bli_is_col_packed( schema ); \ + /*bool col_stored = bli_is_row_packed( schema );*/ \ +\ + /* If the row storage flag indicates row storage, then we are packing + to column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( row_stored ) \ + { \ + /* Prepare to pack to row-stored column panels. */ \ + iter_dim = n; \ + panel_len = m; \ + panel_len_max = m_max; \ + panel_dim_max = pd_p; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( col_stored ) */ \ + { \ + /* Prepare to pack to column-stored row panels. */ \ + iter_dim = m; \ + panel_len = n; \ + panel_len_max = n_max; \ + panel_dim_max = pd_p; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* Compute the total number of iterations we'll need. */ \ + n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ +\ + /* Set the initial values and increments for indices related to C and P + based on whether reverse iteration was requested. */ \ + { \ + ic0 = 0; \ + ic_inc = panel_dim_max; \ + } \ +\ + ctype* restrict p_begin = p_cast; \ +\ + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. */ \ + const dim_t nt = bli_thread_n_way( thread ); \ + const dim_t tid = bli_thread_work_id( thread ); \ +\ + /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ + ( void )nt; \ + ( void )tid; \ +\ + dim_t it_start, it_end, it_inc; \ +\ + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ +\ + /* Iterate over every logical micropanel in the source matrix. */ \ + for ( ic = ic0, it = 0; it < n_iter; \ + ic += ic_inc, it += 1 ) \ + { \ + panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \ +\ + ctype* restrict c_begin = c_cast + (ic )*incc; \ +\ + ctype* restrict c_use = c_begin; \ + ctype* restrict p_use = p_begin; \ +\ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. (The + default is slab.) */ \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + { \ + /* NOTE: We assume here that kappa = 1 and therefore ignore it. If + we're wrong, this will get someone's attention. */ \ + if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + bli_abort(); \ +\ + /* Perform the packing, taking conjc into account. */ \ + if ( bli_is_conj( conjc ) ) \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t d = 0; d < panel_dim; ++d ) \ + { \ + ctype* cld = c_use + (l )*ldc + (d )*incc; \ + ctype* pld = p_use + (l )*ldp + (d )*1; \ +\ + PASTEMAC(ch,copyjs)( *cld, *pld ); \ + } \ + } \ + } \ + else \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t d = 0; d < panel_dim; ++d ) \ + { \ + ctype* cld = c_use + (l )*ldc + (d )*incc; \ + ctype* pld = p_use + (l )*ldp + (d )*1; \ +\ + PASTEMAC(ch,copys)( *cld, *pld ); \ + } \ + } \ + } \ +\ + /* If panel_dim < panel_dim_max, then we zero those unused rows. */ \ + if ( panel_dim < panel_dim_max ) \ + { \ + const dim_t i = panel_dim; \ + const dim_t m_edge = panel_dim_max - panel_dim; \ + const dim_t n_edge = panel_len_max; \ + ctype* restrict p_edge = p_use + (i )*1; \ +\ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ + } \ +\ + /* If panel_len < panel_len_max, then we zero those unused columns. */ \ + if ( panel_len < panel_len_max ) \ + { \ + const dim_t j = panel_len; \ + const dim_t m_edge = panel_dim_max; \ + const dim_t n_edge = panel_len_max - panel_len; \ + ctype* restrict p_edge = p_use + (j )*ldp; \ +\ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ + } \ + } \ +\ +/* +if ( !row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +else \ +PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +*/ \ +\ + p_begin += ps_p; \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_var1 ) +GENTFUNC( float, s, packm_var2 ) +GENTFUNC( double, d, packm_var2 ) +GENTFUNC( scomplex, c, packm_var2 ) +GENTFUNC( dcomplex, z, packm_var2 ) + diff --git a/addon/gemmd/bao_packm_cxk.c b/addon/gemmd/bao_packm_cxk.c new file mode 100644 index 000000000..645f09d79 --- /dev/null +++ b/addon/gemmd/bao_packm_cxk.c @@ -0,0 +1,199 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_dim_max, \ + dim_t panel_len, \ + dim_t panel_len_max, \ + ctype* kappa, \ + ctype* d, inc_t incd, \ + ctype* a, inc_t inca, inc_t lda, \ + ctype* p, inc_t ldp, \ + cntx_t* cntx \ + ) \ +{ \ + /* Note that we use panel_dim_max, not panel_dim, to query the packm + kernel function pointer. This means that we always use the same + kernel, even for edge cases. */ \ + num_t dt = PASTEMAC(ch,type); \ + l1mkr_t ker_id = panel_dim_max; \ +\ + PASTECH2(ch,opname,_ker_ft) f; \ +\ + /* Query the context for the packm kernel corresponding to the current + panel dimension, or kernel id. If the id is invalid, the function will + return NULL. */ \ + f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ +\ + /* If there exists a kernel implementation for the micro-panel dimension + provided, we invoke the implementation. Otherwise, we use scal2m. */ \ + /* NOTE: We've disabled calling packm micro-kernels from the context for + this implementation. To re-enable, change FALSE to TRUE in the + conditional below. */ \ + if ( f != NULL && FALSE ) \ + { \ + f \ + ( \ + conja, \ + schema, \ + panel_dim, \ + panel_len, \ + panel_len_max, \ + kappa, \ + a, inca, lda, \ + p, ldp, \ + cntx \ + ); \ + } \ + else \ + { \ + /* NOTE: We assume here that kappa = 1 and therefore ignore it. If + we're wrong, this will get someone's attention. */ \ + if ( !PASTEMAC(ch,eq1)( *kappa ) ) \ + bli_abort(); \ +\ + if ( d == NULL ) \ + { \ + /* Perform the packing, taking conja into account. */ \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ + { \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* pli = p + (l )*ldp + (i )*1; \ +\ + PASTEMAC(ch,copyjs)( *ali, *pli ); \ + } \ + } \ + } \ + else \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ + { \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* pli = p + (l )*ldp + (i )*1; \ +\ + PASTEMAC(ch,copys)( *ali, *pli ); \ + } \ + } \ + } \ + } \ + else /* if ( d != NULL ) */ \ + { \ + /* Perform the packing, taking conja into account. */ \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ + { \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* dl = d + (l )*incd; \ + ctype* pli = p + (l )*ldp + (i )*1; \ +\ + /* Note that ali must be the second operand here since + that is what is conjugated by scal2js. */ \ + PASTEMAC(ch,scal2js)( *dl, *ali, *pli ); \ + } \ + } \ + } \ + else \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ + { \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* dl = d + (l )*incd; \ + ctype* pli = p + (l )*ldp + (i )*1; \ +\ + PASTEMAC(ch,scal2s)( *ali, *dl, *pli ); \ + } \ + } \ + } \ + } \ +\ + /* If panel_dim < panel_dim_max, then we zero those unused rows. */ \ + if ( panel_dim < panel_dim_max ) \ + { \ + const dim_t i = panel_dim; \ + const dim_t m_edge = panel_dim_max - panel_dim; \ + const dim_t n_edge = panel_len_max; \ + ctype* restrict p_edge = p + (i )*1; \ +\ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ + } \ +\ + /* If panel_len < panel_len_max, then we zero those unused columns. */ \ + if ( panel_len < panel_len_max ) \ + { \ + const dim_t j = panel_len; \ + const dim_t m_edge = panel_dim_max; \ + const dim_t n_edge = panel_len_max - panel_len; \ + ctype* restrict p_edge = p + (j )*ldp; \ +\ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_cxk ) +GENTFUNC( float, s, packm_cxk ) +GENTFUNC( double, d, packm_cxk ) +GENTFUNC( scomplex, c, packm_cxk ) +GENTFUNC( dcomplex, z, packm_cxk ) + diff --git a/addon/gemmd/bao_packm_cxk.h b/addon/gemmd/bao_packm_cxk.h new file mode 100644 index 000000000..3e977a7cc --- /dev/null +++ b/addon/gemmd/bao_packm_cxk.h @@ -0,0 +1,59 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_dim_max, \ + dim_t panel_len, \ + dim_t panel_len_max, \ + ctype* kappa, \ + ctype* d, inc_t incd, \ + ctype* a, inc_t inca, inc_t lda, \ + ctype* p, inc_t ldp, \ + cntx_t* cntx \ + ); + +//INSERT_GENTPROT_BASIC0( packm_cxk ) +GENTPROT( float, s, packm_cxk ) +GENTPROT( double, d, packm_cxk ) +GENTPROT( scomplex, c, packm_cxk ) +GENTPROT( dcomplex, z, packm_cxk ) + diff --git a/addon/gemmd/gemmd.h b/addon/gemmd/gemmd.h new file mode 100644 index 000000000..cab61bd18 --- /dev/null +++ b/addon/gemmd/gemmd.h @@ -0,0 +1,54 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of copyright holder(s) nor the names + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef GEMMD_H +#define GEMMD_H + +// This header should contain (or #include) any definitions that must be +// folded into blis.h. + +#include "bao_gemmd.h" +#include "bao_gemmd_check.h" +#include "bao_gemmd_var.h" + +#include "bao_l3_packm_a.h" +#include "bao_l3_packm_b.h" +#include "bao_l3_packm_var.h" + +#include "bao_packm_cxk.h" + +#include "bao_l3_decor.h" + + +#endif diff --git a/addon/gemmd/thread/bao_l3_decor.h b/addon/gemmd/thread/bao_l3_decor.h new file mode 100644 index 000000000..b4fd2b9b7 --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor.h @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SBX_L3_DECOR_H +#define BLIS_SBX_L3_DECOR_H + +// -- sup definitions ---------------------------------------------------------- + +// Level-3 sup internal function type. +typedef void (*l3sbxint_t) + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +// Level-3 sup thread decorator prototype. +void bao_l3_thread_decorator + ( + l3sbxint_t func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +// Include definitions specific to the method of multithreading. +#include "bao_l3_decor_single.h" +#include "bao_l3_decor_openmp.h" +#include "bao_l3_decor_pthreads.h" + +#endif + diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.c b/addon/gemmd/thread/bao_l3_decor_openmp.c new file mode 100644 index 000000000..1aca8de27 --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor_openmp.c @@ -0,0 +1,140 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_OPENMP + +// Define a dummy thread entry function, which is needed in the pthreads +// version, so that when building Windows DLLs (with OpenMP enabled or with +// no multithreading) we don't risk having an unresolved symbol. +void* bao_l3_thread_entry( void* data_void ) { return NULL; } + +//#define PRINT_THRINFO + +void bao_l3_thread_decorator + ( + l3sbxint_t func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + // Query the total number of threads from the rntm_t object. + const dim_t n_threads = bli_rntm_num_threads( rntm ); + + // NOTE: The sba was initialized in bli_init(). + + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. We do + // this up-front only so that we have the rntm_t.sba_pool field + // initialized and ready for the global communicator creation below. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. This will be + // inherited by all of the child threads when they make local copies of + // the rntm below. + bli_pba_rntm_set_pba( rntm ); + + // Allcoate a global communicator for the root thrinfo_t structures. + thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + + + _Pragma( "omp parallel num_threads(n_threads)" ) + { + // Create a thread-local copy of the master thread's rntm_t. This is + // necessary since we want each thread to be able to track its own + // small block pool_t as it executes down the function stack. + rntm_t rntm_l = *rntm; + rntm_t* restrict rntm_p = &rntm_l; + + // Query the thread's id from OpenMP. + const dim_t tid = omp_get_thread_num(); + + // Check for a somewhat obscure OpenMP thread-mistmatch issue. + // NOTE: This calls the same function used for the conventional/large + // code path. + bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. + bli_sba_rntm_set_pool( tid, array, rntm_p ); + + thrinfo_t* thread = NULL; + + // Create the root node of the thread's thrinfo_t structure. + bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + + func + ( + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm_p, + thread + ); + + // Free the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_free( rntm_p, thread ); + } + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called from the thread entry function). + + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion. + bli_sba_checkin_array( array ); +} + +#endif + diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.h b/addon/gemmd/thread/bao_l3_decor_openmp.h new file mode 100644 index 000000000..9c956d7c3 --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor_openmp.h @@ -0,0 +1,44 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SBX_L3_DECOR_OPENMP_H +#define BLIS_SBX_L3_DECOR_OPENMP_H + +// Definitions specific to situations when OpenMP multithreading is enabled. +#ifdef BLIS_ENABLE_OPENMP + +#endif + +#endif + diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.c b/addon/gemmd/thread/bao_l3_decor_pthreads.c new file mode 100644 index 000000000..587b8400f --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor_pthreads.c @@ -0,0 +1,220 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_PTHREADS + +// A data structure to assist in passing operands to additional threads. +typedef struct thread_data +{ + l3sbxint_t func; + opid_t family; + obj_t* alpha; + obj_t* a; + obj_t* d; + obj_t* b; + obj_t* beta; + obj_t* c; + cntx_t* cntx; + rntm_t* rntm; + dim_t tid; + thrcomm_t* gl_comm; + array_t* array; +} thread_data_t; + +// Entry point function for additional threads. +void* bao_l3_thread_entry( void* data_void ) +{ + thread_data_t* data = data_void; + + l3sbxint_t func = data->func; + opid_t family = data->family; + obj_t* alpha = data->alpha; + obj_t* a = data->a; + obj_t* d = data->d; + obj_t* b = data->b; + obj_t* beta = data->beta; + obj_t* c = data->c; + cntx_t* cntx = data->cntx; + rntm_t* rntm = data->rntm; + dim_t tid = data->tid; + array_t* array = data->array; + thrcomm_t* gl_comm = data->gl_comm; + + ( void )family; + + // Create a thread-local copy of the master thread's rntm_t. This is + // necessary since we want each thread to be able to track its own + // small block pool_t as it executes down the function stack. + rntm_t rntm_l = *rntm; + rntm_t* restrict rntm_p = &rntm_l; + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. + bli_sba_rntm_set_pool( tid, array, rntm_p ); + + thrinfo_t* thread = NULL; + + // Create the root node of the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + + func + ( + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm_p, + thread + ); + + // Free the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_free( rntm_p, thread ); + + return NULL; +} + +void bao_l3_thread_decorator + ( + l3sbxint_t func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + err_t r_val; + + // Query the total number of threads from the context. + const dim_t n_threads = bli_rntm_num_threads( rntm ); + + // NOTE: The sba was initialized in bli_init(). + + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. We do + // this up-front only so that we have the rntm_t.sba_pool field + // initialized and ready for the global communicator creation below. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. This will be + // inherited by all of the child threads when they make local copies of + // the rntm below. + bli_pba_rntm_set_pba( rntm ); + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + + // Allocate an array of pthread objects and auxiliary data structs to pass + // to the thread entry functions. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val ); + + // NOTE: We must iterate backwards so that the chief thread (thread id 0) + // can spawn all other threads before proceeding with its own computation. + for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- ) + { + // Set up thread data for additional threads (beyond thread 0). + datas[tid].func = func; + datas[tid].family = family; + datas[tid].alpha = alpha; + datas[tid].a = a; + datas[tid].d = d; + datas[tid].b = b; + datas[tid].beta = beta; + datas[tid].c = c; + datas[tid].cntx = cntx; + datas[tid].rntm = rntm; + datas[tid].tid = tid; + datas[tid].gl_comm = gl_comm; + datas[tid].array = array; + + // Spawn additional threads for ids greater than 1. + if ( tid != 0 ) + bli_pthread_create( &pthreads[tid], NULL, &bao_l3_thread_entry, &datas[tid] ); + else + bao_l3_thread_entry( ( void* )(&datas[0]) ); + } + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called from the thread entry function). + + // Thread 0 waits for additional threads to finish. + for ( dim_t tid = 1; tid < n_threads; tid++ ) + { + bli_pthread_join( pthreads[tid], NULL ); + } + + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion. + bli_sba_checkin_array( array ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_free_intl( pthreads ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_free_intl( datas ); +} + +#endif + diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.h b/addon/gemmd/thread/bao_l3_decor_pthreads.h new file mode 100644 index 000000000..69adec45e --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor_pthreads.h @@ -0,0 +1,47 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H +#define BLIS_SBX_L3_DECOR_PTHREADS_H + +// Definitions specific to situations when POSIX multithreading is enabled. +#ifdef BLIS_ENABLE_PTHREADS + +// Thread entry point prototype. +void* bao_l3_thread_entry( void* data_void ); + +#endif + +#endif + diff --git a/addon/gemmd/thread/bao_l3_decor_single.c b/addon/gemmd/thread/bao_l3_decor_single.c new file mode 100644 index 000000000..d60891d65 --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor_single.c @@ -0,0 +1,143 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifndef BLIS_ENABLE_MULTITHREADING + +#define SKIP_THRINFO_TREE + +void bao_l3_thread_decorator + ( + l3sbxint_t func, + opid_t family, + //pack_t schema_a, + //pack_t schema_b, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + // For sequential execution, we use only one thread. + const dim_t n_threads = 1; + + // NOTE: The sba was initialized in bli_init(). + + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. + bli_pba_rntm_set_pba( rntm ); + +#ifndef SKIP_THRINFO_TREE + // Allcoate a global communicator for the root thrinfo_t structures. + thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); +#endif + + + { + // NOTE: We don't need to create another copy of the rntm_t since + // it was already copied in one of the high-level oapi functions. + rntm_t* restrict rntm_p = rntm; + + // There is only one thread id (for the thief thread). + const dim_t tid = 0; + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. + // NOTE: This is commented out because, in the single-threaded case, + // this is redundant since it's already been done above. + //bli_sba_rntm_set_pool( tid, array, rntm_p ); + +#ifndef SKIP_THRINFO_TREE + thrinfo_t* thread = NULL; + + // Create the root node of the thread's thrinfo_t structure. + bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); +#else + // This optimization allows us to use one of the global thrinfo_t + // objects for single-threaded execution rather than grow one from + // scratch. The key is that bli_thrinfo_sup_grow(), which is called + // from within the variants, will immediately return if it detects + // that the thrinfo_t* passed into it is either + // &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED. + thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED; + + ( void )tid; +#endif + + func + ( + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm_p, + thread + ); + +#ifndef SKIP_THRINFO_TREE + // Free the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_free( rntm_p, thread ); +#endif + } + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called above). + + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion. + bli_sba_checkin_array( array ); +} + +#endif + diff --git a/addon/gemmd/thread/bao_l3_decor_single.h b/addon/gemmd/thread/bao_l3_decor_single.h new file mode 100644 index 000000000..211a43a89 --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor_single.h @@ -0,0 +1,44 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SBX_L3_DECOR_SINGLE_H +#define BLIS_SBX_L3_DECOR_SINGLE_H + +// Definitions specific to situations when multithreading is disabled. +#ifndef BLIS_ENABLE_MULTITHREADING + +#endif + +#endif + diff --git a/build/bli_addon.h.in b/build/bli_addon.h.in new file mode 100644 index 000000000..36a8e29bd --- /dev/null +++ b/build/bli_addon.h.in @@ -0,0 +1,47 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_ADDON_H +#define BLIS_ADDON_H + +#if @enable_addons@ +#define BLIS_ENABLE_ADDONS +#else +#define BLIS_DISABLE_ADDONS +#endif + +// Enabled addons +@addon_list_includes@ + +#endif diff --git a/build/config.mk.in b/build/config.mk.in index 1032ce8e7..79ecea653 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -190,6 +190,10 @@ MK_ENABLE_CBLAS := @enable_cblas@ # Whether libblis will depend on libmemkind for certain memory allocations. MK_ENABLE_MEMKIND := @enable_memkind@ +# The names of the addons to include when building BLIS. If empty, no addons +# will be included. +ADDON_LIST := @addon_list@ + # The name of a sandbox defining an alternative gemm implementation. If empty, # no sandbox will be used and the conventional gemm implementation will remain # enabled. diff --git a/common.mk b/common.mk index 90c3da83f..5f2d30c9b 100644 --- a/common.mk +++ b/common.mk @@ -152,18 +152,35 @@ get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \ # When compiling sandboxes, we use flags similar to those of general framework # source. This ensures that the same code can be linked and run across various -# sub-configurations. (If we switch to using refkern/kernel flags, we should -# prevent enabling sandboxes for umbrella families by verifying that -# config_list == config_name if --enable-sandbox is given.) +# sub-configurations. +get-addon-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ + $(call get-noopt-cflags-for,$(1)) \ + $(CADDONINCFLAGS) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + ) +get-addon-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ + $(call get-noopt-cxxflags-for,$(1)) \ + $(CADDONINCFLAGS) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + ) + +# When compiling sandboxes, we use flags similar to those of general framework +# source. This ensures that the same code can be linked and run across various +# sub-configurations. (NOTE: If we ever switch to using refkernel or kernel +# flags, we should prevent enabling sandboxes for umbrella families by verifying +# that config_list == config_name if --enable-sandbox is given. THIS ALSO +# APPLIES TO ADDONS ABOVE.) get-sandbox-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ - $(CSBOXINCFLAGS) \ + $(CSANDINCFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) get-sandbox-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cxxflags-for,$(1)) \ - $(CSBOXINCFLAGS) \ + $(CSANDINCFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) @@ -188,6 +205,8 @@ get-refkern-text-for = "('$(1)' CFLAGS for ref. kernels)" get-config-text-for = "('$(1)' CFLAGS for config code)" get-frame-text-for = "('$(1)' CFLAGS for framework code)" get-kernel-text-for = "('$(1)' CFLAGS for kernels)" +get-addon-c99text-for = "('$(1)' CFLAGS for addons)" +get-addon-cxxtext-for = "('$(1)' CXXFLAGS for addons)" get-sandbox-c99text-for = "('$(1)' CFLAGS for sandboxes)" get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)" @@ -202,6 +221,10 @@ get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)" files-that-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),$(f),))) files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),,$(f)))) +# Define a function that removes duplicate strings *without* using the sort +# function. +rm-dups = $(if $1,$(firstword $1) $(call rm-dups,$(filter-out $(firstword $1),$1))) + # # --- Include makefile configuration file -------------------------------------- @@ -286,6 +309,7 @@ CONFIG_DIR := config FRAME_DIR := frame REFKERN_DIR := ref_kernels KERNELS_DIR := kernels +ADDON_DIR := addon SANDBOX_DIR := sandbox OBJ_DIR := obj LIB_DIR := lib @@ -302,11 +326,13 @@ REFNM := ref # Source suffixes. CONFIG_SRC_SUFS := c - KERNELS_SRC_SUFS := c s S - FRAME_SRC_SUFS := c +ADDON_C99_SUFS := c +ADDON_CXX_SUFS := cc cpp cxx +ADDON_SRC_SUFS := $(ADDON_C99_SUFS) $(ADDON_CXX_SUFS) + SANDBOX_C99_SUFS := c SANDBOX_CXX_SUFS := cc cpp cxx SANDBOX_SRC_SUFS := $(SANDBOX_C99_SUFS) $(SANDBOX_CXX_SUFS) @@ -314,15 +340,21 @@ SANDBOX_SRC_SUFS := $(SANDBOX_C99_SUFS) $(SANDBOX_CXX_SUFS) # Header suffixes. FRAME_HDR_SUFS := h +ADDON_H99_SUFS := h +ADDON_HXX_SUFS := hh hpp hxx +ADDON_HDR_SUFS := $(ADDON_H99_SUFS) $(ADDON_HXX_SUFS) + SANDBOX_H99_SUFS := h SANDBOX_HXX_SUFS := hh hpp hxx SANDBOX_HDR_SUFS := $(SANDBOX_H99_SUFS) $(SANDBOX_HXX_SUFS) # Combine all header suffixes and remove duplicates via sort(). ALL_HDR_SUFS := $(sort $(FRAME_HDR_SUFS) \ + $(ADDON_HDR_SUFS) \ $(SANDBOX_HDR_SUFS) ) ALL_H99_SUFS := $(sort $(FRAME_HDR_SUFS) \ + $(ADDON_HDR_SUFS) \ $(SANDBOX_H99_SUFS) ) # The names of scripts that check output from the BLAS test drivers and @@ -349,11 +381,13 @@ SHELL := bash # Construct paths to the four primary directories of source code: # the config directory, general framework code, reference kernel code, -# and optimized kernel code. +# and optimized kernel code. Also process paths for addon and sandbox +# directories. CONFIG_PATH := $(DIST_PATH)/$(CONFIG_DIR) FRAME_PATH := $(DIST_PATH)/$(FRAME_DIR) REFKERN_PATH := $(DIST_PATH)/$(REFKERN_DIR) KERNELS_PATH := $(DIST_PATH)/$(KERNELS_DIR) +ADDON_PATH := $(DIST_PATH)/$(ADDON_DIR) SANDBOX_PATH := $(DIST_PATH)/$(SANDBOX_DIR) # Construct paths to some optional C++ template headers contributed by AMD. @@ -367,6 +401,7 @@ CONFIG_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(CONFIG_DIR) FRAME_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(FRAME_DIR) REFKERN_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(REFKERN_DIR) KERNELS_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(KERNELS_DIR) +ADDON_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(ADDON_DIR) SANDBOX_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(SANDBOX_DIR) @@ -855,6 +890,7 @@ MK_CONFIG_SRC := MK_KERNELS_SRC := MK_REFKERN_SRC := MK_FRAME_SRC := +MK_ADDON_SRC := MK_SANDBOX_SRC := # -- config -- @@ -905,6 +941,24 @@ PARENT_PATH := $(OBJ_DIR)/$(CONFIG_NAME) -include $(addsuffix /$(FRAGMENT_MK), $(REFKERN_FRAG_PATH)) -include $(addsuffix /$(FRAGMENT_MK), $(FRAME_FRAG_PATH)) +# -- addon -- + +# Construct paths to each addon. +# NOTE: If $(ADDON_LIST) is empty (because no addon was enabled at configure- +# time) then $(ADDON_PATHS) will also be empty, which will cause no fragments +# to be included. +ADDON_PATHS := $(addprefix $(ADDON_FRAG_PATH)/, $(ADDON_LIST)) + +# This variable is used by the include statements as they recursively include +# one another. For the 'addons' directory, we initialize it to that directory +# in preparation to include the fragments in the configuration sub-directory. +PARENT_SRC_PATH := $(ADDON_PATH) +PARENT_PATH := $(ADDON_FRAG_PATH) + +# Recursively include the makefile fragments in each of the addons sub- +# directories. +-include $(addsuffix /$(FRAGMENT_MK), $(ADDON_PATHS)) + # -- sandbox -- # Construct paths to each sandbox. (At present, there can be only one.) @@ -922,6 +976,8 @@ PARENT_PATH := $(SANDBOX_FRAG_PATH) # Recursively include the makefile fragments in the sandbox sub-directory. -include $(addsuffix /$(FRAGMENT_MK), $(SANDBOX_PATHS)) +# -- post-processing -- + # Create a list of the makefile fragments using the variable into which each # of the above include statements accumulated their directory paths. MAKEFILE_FRAGMENTS := $(addsuffix /$(FRAGMENT_MK), $(FRAGMENT_DIR_PATHS)) @@ -940,14 +996,14 @@ endif # # Define a function that will expand all of the directory paths given in $(1) -# to actual filepaths using the list of suffixes provided $(2). +# to actual filepaths using the list of suffixes provided in $(2). get-filepaths = $(strip $(foreach path, $(1), \ $(foreach suf, $(2), \ $(wildcard $(path)/*.$(suf)) \ ) ) ) # Define a function that will expand all of the directory paths given in $(1) -# to actual filepaths using the list of suffixes provided $(2), taking only +# to actual filepaths using the list of suffixes provided in $(2), taking only # the first expansion from each directory with at least one file matching # the current suffix. Finally, strip the filenames from all resulting files, # returning only the directory paths. @@ -957,20 +1013,29 @@ get-dirpaths = $(dir $(foreach path, $(1), \ $(wildcard $(path)/*.$(suf)) \ ) ) ) ) -# We'll use two directory lists. The first is a list of all of the directories -# in which makefile fragments were generated (plus the current directory). The -# second is the subset of the first that begins with the sandbox root path. +# We'll use three directory lists. The first is a list of all of the directories +# in which makefile fragments were generated, plus the current directory. (The +# current directory is needed so we include bli_config.h and bli_addon.h in the +# processing of header files.) The second and third are subsets of the first +# that begins with the addon and sandbox root paths, respectively. ALLFRAG_DIR_PATHS := . $(FRAGMENT_DIR_PATHS) +ADDON_DIR_PATHS := $(filter $(ADDON_PATH)/%,$(ALLFRAG_DIR_PATHS)) SANDBOX_DIR_PATHS := $(filter $(SANDBOX_PATH)/%,$(ALLFRAG_DIR_PATHS)) ALL_H99_FILES := $(call get-filepaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS)) -FRAME_H99_FILES := $(filter-out $(SANDBOX_PATH)/%,$(ALL_H99_FILES)) +FRAME_H99_FILES := $(filter-out $(ADDON_PATH)/%, \ + $(filter-out $(SANDBOX_PATH)/%, \ + $(ALL_H99_FILES) \ + ) ) -ALL_H99_DIRPATHS := $(call get-dirpaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS)) +ALL_H99_DIRPATHS := $(call get-dirpaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS)) -SANDBOX_H99_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_H99_SUFS)) -SANDBOX_HXX_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_HXX_SUFS)) +ADDON_H99_FILES := $(call get-filepaths,$(ADDON_DIR_PATHS),$(ADDON_H99_SUFS)) +ADDON_HXX_FILES := $(call get-filepaths,$(ADDON_DIR_PATHS),$(ADDON_HXX_SUFS)) +ADDON_HDR_DIRPATHS := $(call get-dirpaths,$(ADDON_DIR_PATHS),$(ALL_HDR_SUFS)) +SANDBOX_H99_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_H99_SUFS)) +SANDBOX_HXX_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_HXX_SUFS)) SANDBOX_HDR_DIRPATHS := $(call get-dirpaths,$(SANDBOX_DIR_PATHS),$(ALL_HDR_SUFS)) @@ -1025,8 +1090,8 @@ CBLAS_H_FLAT := $(BASE_INC_PATH)/$(CBLAS_H) # # Obtain a list of header files #included inside of the bli_cntx_ref.c file. -# Paths to these files will be needed when compiling with the monolithic -# header. +# Due to the way that bli_cntx_ref.c uses headers and macros, paths to these +# files will be needed when compiling bli_cntx_ref.c with the monolithic header. ifeq ($(strip $(SHARE_PATH)),.) REF_KER_SRC := $(DIST_PATH)/$(REFKERN_DIR)/bli_cntx_ref.c REF_KER_HEADERS := $(shell $(GREP) "\#include" $(REF_KER_SRC) | sed -e "s/\#include [\"<]\([a-zA-Z0-9\_\.\/\-]*\)[\">].*/\1/g" | $(GREP) -v $(BLIS_H)) @@ -1034,9 +1099,10 @@ endif # Match each header found above with the path to that header, and then strip # leading, trailing, and internal whitespace. -REF_KER_H_PATHS := $(strip $(foreach header, $(REF_KER_HEADERS), \ - $(dir $(filter %/$(header), \ - $(FRAME_H99_FILES))))) +REF_KER_H_PATHS := $(call rm-dups,$(strip \ + $(foreach header, $(REF_KER_HEADERS), \ + $(dir $(filter %/$(header), \ + $(FRAME_H99_FILES)))))) # Add -I to each header path so we can specify our include search paths to the # C compiler. Then add frame/include since it's needed when compiling source @@ -1056,17 +1122,22 @@ ifeq ($(MK_ENABLE_CBLAS),yes) CINCFLAGS += -I$(CBLAS_H_DIRPATH) endif +# Obtain a list of header paths in the configured addons. Then add -I to each +# header path. +CADDONINCFLAGS := $(strip $(patsubst %, -I%, $(ADDON_HDR_DIRPATHS))) + # Obtain a list of header paths in the configured sandbox. Then add -I to each # header path. -CSBOXINCFLAGS := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS))) +CSANDINCFLAGS := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS))) # # --- BLIS configuration header definitions ------------------------------------ # -# This file was created by configure, but we need to define it here so we can -# remove it as part of the clean targets. +# These files were created by configure, but we need to define them here so we +# can remove them as part of the clean targets. +BLIS_ADDON_H := ./bli_addon.h BLIS_CONFIG_H := ./bli_config.h diff --git a/configure b/configure index 447b0791e..1abe7610e 100755 --- a/configure +++ b/configure @@ -270,6 +270,15 @@ print_usage() echo " \"small\" depends on thresholds that may vary by sub-" echo " configuration." echo " " + echo " -a NAME --enable-addon=NAME" + echo " " + echo " Enable the code provided by an addon. An addon consists" + echo " of a separate directory of code that provides additional" + echo " APIs, implementations, and/or operations that would" + echo " otherwise not be present within a build of BLIS. This" + echo " option may be used multiple times to specify the inclusion" + echo " of multiple addons. By default, no addons are enabled." + echo " " echo " -s NAME --enable-sandbox=NAME" echo " " echo " Enable a separate sandbox implementation of gemm. This" @@ -973,6 +982,18 @@ canonicalize_ws() echo "${str}" } +rm_duplicate_words_simple() +{ + local str revstr revres res + + str="$1" + + # Remote duplicates, keeping the first occurrence. + res=$(echo "${str}" | awk '{for (i=1;i<=NF;i++) if (!a[$i]++) printf("%s%s",$i,FS)}{printf("\n")}') + + echo "${res}" +} + rm_duplicate_words() { local str revstr revres res @@ -2069,6 +2090,13 @@ main() bli_config_h_in_path="${build_dirpath}/${bli_config_h_in}" bli_config_h_out_path="${cur_dirpath}/${bli_config_h_out}" + # The names/paths for the template bli_addon.h.in and its instantiated + # counterpart. + bli_addon_h_in='bli_addon.h.in' + bli_addon_h_out='bli_addon.h' + bli_addon_h_in_path="${build_dirpath}/${bli_addon_h_in}" + bli_addon_h_out_path="${cur_dirpath}/${bli_addon_h_out}" + # Path to 'mirror-tree.sh' script. mirror_tree_sh="${build_dirpath}/mirror-tree.sh" @@ -2092,6 +2120,10 @@ main() frame_dir='frame' frame_dirpath="${dist_path}/${frame_dir}" + # The names of the addons. + addon_dir='addon' + addon_dirpath="${dist_path}/${addon_dir}" + # The name of the sandbox directory. sandbox_dir='sandbox' sandbox_dirpath="${dist_path}/${sandbox_dir}" @@ -2199,6 +2231,10 @@ main() force_version='no' complex_return='default' + # The addon flag and names. + addon_flag='' + addon_list='' + # The sandbox flag and name. sandbox_flag='' sandbox='' @@ -2243,7 +2279,7 @@ main() # Process our command line options. unset OPTIND - while getopts ":hp:d:e:s:t:r:qci:b:-:" opt; do + while getopts ":hp:d:e:a:s:t:r:qci:b:-:" opt; do case $opt in -) case "$OPTARG" in @@ -2350,12 +2386,21 @@ main() disable-mem-tracing) enable_mem_tracing='no' ;; + enable-addon=*) + addon_flag=1 + addon_name=${OPTARG#*=} + # Append the addon name to the list. + addon_list="${addon_list} ${addon_name}" + ;; + disable-addon) + addon_flag='' + ;; enable-sandbox=*) sandbox_flag=1 sandbox=${OPTARG#*=} ;; disable-sandbox) - sandbox_flag=0 + sandbox_flag='' ;; int-size=*) int_type_size=${OPTARG#*=} @@ -2432,6 +2477,12 @@ main() e) export_shared=$OPTARG ;; + a) + addon_flag=1 + addon_name=$OPTARG + # Append the addon name to the list. + addon_list="${addon_list} ${addon_name}" + ;; s) sandbox_flag=1 sandbox=$OPTARG @@ -3279,6 +3330,34 @@ main() exit 1 fi + # Check if addons were given. + if [ -n "${addon_flag}" ]; then + + # Remove duplicates in the addon list, if they exist. + addon_list=$(rm_duplicate_words_simple "${addon_list}") + + echo "${script_name}: configuring with addons:" + + for addon in ${addon_list}; do + + echo "${script_name}: ${addon_dir}/${addon}" + + addon_fullpath="${addon_dirpath}/${addon}" + + if [ ! -d "${addon_fullpath}" ]; then + echo "${script_name}: requested addon sub-directory does not exist! Cannot continue." + echo "${script_name}: *** Please verify addon existence and name." + exit 1 + fi + done + + enable_addons_01=1 + else + echo "${script_name}: configuring with no addons." + + enable_addons_01=0 + fi + # Check if a sandbox was given. if [ -n "${sandbox_flag}" ]; then @@ -3417,6 +3496,15 @@ main() kernel_list_defines="${kernel_list_defines}#define ${kernel_define}\n" done + # Create a list of #includes, one for each addon in addon_list. + addon_list_includes="" + for addon in ${addon_list}; do + + # Create a #define and add it to the running list. + addon_header="\"${addon}.h\"" + addon_list_includes="${addon_list_includes}#include ${addon_header}\n" + done + # -- Determine whether we are performing an out-of-tree build -------------- @@ -3444,7 +3532,7 @@ main() fi - # -- Instantiate config.mk, bli_config.h files from templates -------------- + # -- Instantiate config.mk file from template ------------------------------ # Begin substituting information into the config_mk_in file, outputting # to config_mk_out. @@ -3496,9 +3584,11 @@ main() | sed -e "s/@enable_cblas@/${enable_cblas}/g" \ | sed -e "s/@enable_memkind@/${enable_memkind}/g" \ | sed -e "s/@pragma_omp_simd@/${pragma_omp_simd}/g" \ + | sed -e "s/@addon_list@/${addon_list}/g" \ | sed -e "s/@sandbox@/${sandbox}/g" \ > "${config_mk_out_path}" + # -- Instantiate bli_config.h file from template --------------------------- # Begin substituting information into the bli_config_h_in file, outputting # to bli_config_h_out. NOTE: We use perl instead of sed because the version @@ -3533,6 +3623,17 @@ main() | sed -e "s/@complex_return_intel@/${complex_return_intel01}/g" \ > "${bli_config_h_out_path}" + # -- Instantiate bli_addon.h file from template ---------------------------- + + # Begin substituting information into the bli_addon_h_in file, outputting + # to bli_addon_h_out. NOTE: We use perl instead of sed because the version + # of sed used on OS X is old and does not handle the '\n' character + # intuitively, which was used when constructing ${addon_list_includes}. + echo "${script_name}: creating ${bli_addon_h_out_path} from ${bli_addon_h_in_path}" + cat "${bli_addon_h_in_path}" \ + | perl -pe "s/\@addon_list_includes\@/${addon_list_includes}/g" \ + | sed -e "s/@enable_addons@/${enable_addons_01}/g" \ + > "${bli_addon_h_out_path}" # -- Create top-level object directories ----------------------------------- @@ -3545,7 +3646,6 @@ main() obj_config_dirpath="${base_obj_dirpath}/${config_dir}" - #echo "${script_name}: creating ${obj_config_dirpath}" mkdir -p ${obj_config_dirpath} for conf in ${config_list}; do echo "${script_name}: creating ${obj_config_dirpath}/${conf}" @@ -3555,7 +3655,6 @@ main() obj_kernels_dirpath="${base_obj_dirpath}/${kernels_dir}" - #echo "${script_name}: creating ${obj_kernels_dirpath}" mkdir -p ${obj_kernels_dirpath} for kern in ${kernel_list}; do echo "${script_name}: creating ${obj_kernels_dirpath}/${kern}" @@ -3565,7 +3664,6 @@ main() obj_refkern_dirpath="${base_obj_dirpath}/${refkern_dir}" - #echo "${script_name}: creating ${obj_refkern_dirpath}" mkdir -p ${obj_refkern_dirpath} for conf in ${config_list}; do echo "${script_name}: creating ${obj_refkern_dirpath}/${conf}" @@ -3578,6 +3676,18 @@ main() echo "${script_name}: creating ${obj_frame_dirpath}" mkdir -p ${obj_frame_dirpath} + + if [ -n "${addon_flag}" ]; then + + obj_addon_dirpath="${base_obj_dirpath}/${addon_dir}" + + for addon in ${addon_list}; do + echo "${script_name}: creating ${obj_addon_dirpath}/${addon}" + mkdir -p ${obj_addon_dirpath}/${addon} + done + fi + + if [ -n "${sandbox_flag}" ]; then obj_sandbox_dirpath="${base_obj_dirpath}/${sandbox_dir}" @@ -3605,6 +3715,7 @@ main() echo "${script_name}: creating ${base_lib_dirpath}" mkdir -p ${base_lib_dirpath} + # Create include directory (if it does not already exist). base_include_dirpath="${include_dirpath}/${config_name}" @@ -3659,6 +3770,16 @@ main() echo "${script_name}: mirroring ${frame_dirpath} to ${obj_frame_dirpath}" ${mirror_tree_sh} ${frame_dirpath} ${obj_frame_dirpath} + # Mirror the chosen addon source tree to its object sub-directory. + if [ -n "${addon_flag}" ]; then + + for addon in ${addon_list}; do + + echo "${script_name}: mirroring ${addon_dirpath}/${addon} to ${obj_addon_dirpath}/${addon}" + ${mirror_tree_sh} "${addon_dirpath}/${addon}" "${obj_addon_dirpath}/${addon}" + done + fi + # Mirror the chosen sandbox source tree to its object sub-directory. if [ -n "${sandbox_flag}" ]; then @@ -3745,6 +3866,25 @@ main() ${gen_make_frags_dirpath}/suffix_list \ ${gen_make_frags_dirpath}/ignore_list + # Generate makefile fragments in the addon sub-directory. + if [ -n "${addon_flag}" ]; then + + for addon in ${addon_list}; do + + echo "${script_name}: creating makefile fragments in ${obj_addon_dirpath}/${addon}" + ${gen_make_frags_sh} \ + -h -r -v0 \ + -o ${script_name} \ + -p 'ADDON' \ + ${addon_dirpath}/${addon} \ + ${obj_addon_dirpath}/${addon} \ + ${gen_make_frags_dirpath}/fragment.mk \ + ${gen_make_frags_dirpath}/suffix_list \ + ${gen_make_frags_dirpath}/ignore_list + done + fi + + # Generate makefile fragments in the sandbox sub-directory. if [ -n "${sandbox_flag}" ]; then diff --git a/docs/Addons.md b/docs/Addons.md new file mode 100644 index 000000000..595cebfa4 --- /dev/null +++ b/docs/Addons.md @@ -0,0 +1,231 @@ +## Contents + +* **[Introduction](Addons.md#introduction)** +* **[Enabling addons](Addons.md#enabling-addons)** +* **[Addon rules](Addons.md#addon-rules)** +* **[Caveats](Addons.md#caveats)** +* **[Known issues](Addons.md#known-issues)** +* **[Conclusion](Addons.md#conclusion)** + + +## Introduction + +This file briefly describes the requirements for building a custom BLIS +*addon*. + +Simply put, an addon in BLIS provides additional APIs, operations, and/or +implementations that may be useful to certain users. An addon can be +thought of as a standalone extension of BLIS that does not depend on any +other addon, although addons may utilize existing functionality or kernels +within the core framework. + +By definition, an addon should *never* provide APIs that conflict with +the interfaces that belong to either the [typed API](BLISTypedAPI.md) or the +[object API](BLISObjectAPI.md). Thus, you'll never have to worry about a +properly constructed (and properly functioning) addon interfering with or +otherwise changing core BLIS functionality. + +How does an addon differ from a [sandbox](Sandboxes.md)? Great question! +Sometimes you want to include additional BLIS-like functionality that does +not relate directly to `gemm` or any other BLIS operation. +(By contrast, a sandbox requires you to implement `gemm` whether you want +to or not.) +Furthermore, you may wish to enable multiple addons simultaneously. +(By contrast, only one sandbox may be enabled at a time.) +Thus, the addon feature provides additional flexibility to some +users in a way that sandboxes cannot, while still providing many of the +conveniences of sandboxes. + +## Enabling an addon + +To enable an existing addon at configure-time, you simply specify it as an +option to `configure`. Either of the following usages are accepted: +``` +$ ./configure --enable-addon=foobar auto +$ ./configure -a foobar auto +``` +Here, we tell `configure` that we want to use the `foobar` addon, which +corresponds to a subdirectory of the `addon` directory named `foobar`. +(Reminder: the `auto` argument is the configuration target and +unrelated to addons.) + +You may also enable multiple addons within the same build of BLIS: +``` +$ ./configure -a foobar -a thing1 -a thing2 auto +``` +Note that the default behavior of `configure` is that no addons are enabled. + +As `configure` runs, you should get output that includes lines +similar to: +``` +configure: configuring with addons: +configure: addon/foobar +configure: addon/thing1 +configure: addon/thing2 +``` +And when you build BLIS, the addon source code will be among the last files to +be compiled: +``` +Compiling obj/haswell/addon/foobar/foobar.o ('haswell' CFLAGS for addons) +Compiling obj/haswell/addon/thing1/thing1.o ('haswell' CFLAGS for addons) +Compiling obj/haswell/addon/thing1/thing1_api.o ('haswell' CFLAGS for addons) +Compiling obj/haswell/addon/thing2/thing2_api.o ('haswell' CFLAGS for addons) +... +``` +That's it! After the BLIS library is built, it will contain your chosen +addons. You can always confirm this by using `nm` to confirm the presence +of your API symbols: +``` +$ nm lib/haswell/libblis.a | grep foobar +foobar.o: +0000000000000000 T foobar +``` + +## Addon rules + +Please follow these guidelines for the best developer experience when +creating addons. + +1. As with sandboxes, you don't need to worry about creating makefiles. The +BLIS build system will take care of this for you. :) By configuring BLIS with +an addon enabled, `make` will scan your addon subdirectory and compile +all of its source code using similar compilation rules as were used for the rest +of the framework. In addition, the compilation command line will automatically +contain one `-I` option for every subdirectory in your addon, +so it doesn't matter where in your addon directory hierarchy you place your +header files -- they will be found! + +2. We recommend that you write your addon in C99. While you *may* use C++11 +to implement your addon, you should provide a C99 wrapper API to your +implementation so that others can interface with it. There is no guarantee +that the end-user will be using a C++11 compiler, and therefore you should +limit the definitions in your addon header to those that are C99 compliant. +If you write your addon in C++11, you must use one of the BLIS-approved file +extensions for your source files (`.cc`, `.cpp`, `.cxx`) and your local +header files (`.hh`, `.hpp`, `.hxx`). +Note that `blis.h` already contains all of its definitions inside of an +`extern "C"` block, so you should be able to `#include "blis.h"` from your +C++11 source code without any issues. + +3. All of your code related to the addon should reside within the named +addon directory, or some subdirectory therein. If your addon requires +new kernels, you should add kernel source code to an appropriate +microarchitecture-specific subdirectory within the top-level `kernels` +directory so that they are compiled with the correct +microarchitecture-specific optimization flags. + +4. If your addon is named `foobar`, the BLIS build system will expect to +find a header called `foobar.h` somewhere in the `addon/foobar` directory +(or one of its subdirectories). This `foobar.h` header will automatically +be inlined into the monolithic `blis.h` header that is produced by the +BLIS build system. `foobar.h` may `#include` other local headers, each of +which will also (recursively) get inlined into `blis.h`. However, you may +choose to omit some local addon headers from `foobar.h.` You might do this, +for example, because those headers define things that are not needed in +order for the end user to call your addon code. + +5. Your addon APIs will always be available within static library builds of +BLIS, but if you want your addon APIs to be exported as public APIs within +*shared* library builds of BLIS, you'll need to annotate the prototypes +accordingly. (BLIS makes its shared library symbols private by default; this +allows us to export only those functions that we consider to be part of the +public APIs.) This annotation can be done by prefixing function prototypes +with the `BLIS_EXPORT_ADDON` macro as follows: +```c +BLIS_EXPORT_ADDON void foobar_calc( void* a, void* b ); +``` + +6. Do not define any symbols in your addon that conflict with any symbols within +the core framework. For example, don't define a function called `bli_copym()` +in your addon since that function is already defined within BLIS. + +7. Do not define any symbols in your addon that conflict with any symbols within +the C99 standard libraries/headers. For example, don't define a function called +`printf()` since that function is already defined within the C99 standard library. + +8. *Try* to not define any symbols in your addon that conflict with symbols in any +other addon, unless your addon is meant to serve as an alternative to the +conflicting addon, in which case conflicting symbol names is okay (since you +will presumably never build with both addons enabled). + +9. When choosing names for your addon files, avoid source filenames that already +exist within BLIS. For example, don't name one of your files `bli_obj.c` +since that file would compile into `bli_obj.o`, which will have already been +placed into the library by the build system. + +10. Similarly, avoid header filenames that already exist within BLIS or C99. +For example, don't name one of your header files `bli_obj.h` since that file +already exists in BLIS. Also, don't name one of your header files `math.h` +since that name would conflict with the `math.h` defined by C99. (This also +means you shouldn't name your addon `math` since normally that name would +require that you provide a `math.h` header inside the addon directory.) + +If you follow these rules, you will be much more likely to have a pleasant +experience integrating your BLIS addon into the larger framework. + +## Caveats + +Notice that the BLIS addons are limited in what they can accomplish. Generally +speaking, addons cannot change existing implementations within BLIS. Instead, +addons aim to provide a way to quickly augment BLIS with additional bundles of +code that extend BLIS's set of functionality in some interesting way. If you +want to define new BLAS-like functions, but don't know where to start, creating +a new addon is an appropriate place to start experimenting. If you want to +change or refactor existing BLIS code, an addon is probably not suited for your +needs. + +Another important limitation is the fact that the build system currently uses +"framework `CFLAGS`" when compiling the addon source files. These are the same +`CFLAGS` used when compiling general framework source code, +``` +# Example framework CFLAGS used by 'haswell' sub-configuration +-O2 -Wall -Wno-unused-function -Wfatal-errors -fPIC -std=c99 +-D_POSIX_C_SOURCE=200112L -Iinclude/haswell -I./frame/3/ +-I./frame/1m/ -I./frame/1f/ -I./frame/1/ -I./frame/include +-DBLIS_VERSION_STRING=\"0.8.1-195\" -fvisibility=hidden +``` +which are likely more general-purpose than the `CFLAGS` used for, say, +optimized kernels or even reference kernels: +``` +# Example optimized kernel CFLAGS used by 'haswell' sub-configuration +-O3 -fomit-frame-pointer -mavx2 -mfma -mfpmath=sse -march=haswell -Wall +-Wno-unused-function -Wfatal-errors -fPIC -std=c99 -D_POSIX_C_SOURCE=200112L +-Iinclude/haswell -I./frame/3/ -I./frame/1m/ -I./frame/1f/ -I./frame/1/ +-I./frame/include -DBLIS_VERSION_STRING=\"0.8.1-195\" -fvisibility=hidden +``` +(To see precisely which flags are being employed for any given file, enable +verbosity at compile-time via `make V=1`.) Compiling addons with these more +versatile `CFLAGS` compiler options means that we only need to compile one +instance of each addon source file, even when targeting multiple +configurations (for example, via `./configure x86_64`). However, it also means +that addons are not ideal for microkernels, as they sometimes need additional +compiler flags in order to +yield the highest performance. If you have a new microkernel you would like to +use within an addon, you can always develop it within that addon. However, +once it is stable and ready for use by others, it's best to move the kernel(s) +to the appropriate microarchitecture-specific subdirectory of the `kernels` +directory the kernel(s). This will allow the kernel to be compiled with the +appropriate microarchitecture-specific compiler flags. +Please see the +[Configuration Guide](ConfigurationHowTo) +for more details, and when in doubt, please don't be shy about seeking +guidance from BLIS developers by opening a +[new issue](https://github.com/flame/blis/issues) or sending a message to the +[blis-devel](http://groups.google.com/d/forum/blis-devel) mailing list. + +Notwithstanding these limitations, hopefully you still find BLIS addons +useful! + +## Known issues + +* None yet. + +## Conclusion + +If you encounter any problems, please open +a new [issue on GitHub](https://github.com/flame/blis/issues). + +If you are unsure about how something works, you can still open an issue. Or, you +can send a message to +[blis-devel](https://groups.google.com/d/forum/blis-devel) mailing list. + diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index 86f23df6e..5a4c8a15d 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -231,8 +231,9 @@ #endif #endif -#define BLIS_EXPORT_BLIS BLIS_EXPORT -#define BLIS_EXPORT_BLAS BLIS_EXPORT +#define BLIS_EXPORT_BLIS BLIS_EXPORT +#define BLIS_EXPORT_BLAS BLIS_EXPORT +#define BLIS_EXPORT_ADDON BLIS_EXPORT // -- STATIC INLINE FUNCTIONS -------------------------------------------------- diff --git a/frame/include/blis.h b/frame/include/blis.h index b374e8539..98ebee878 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -196,6 +196,14 @@ extern "C" { #include "bli_util.h" +// -- addon definitions -- + +// NOTE: These definitions should not be included much earlier since an addon +// may wish to utilize other types and definitions provided by BLIS. + +#include "bli_addon.h" + + // -- sandbox implementation -- #include "bli_sbox.h"