diff --git a/LICENSE b/LICENSE index 5a5aa21d9..aac8ba88e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,8 +1,6 @@ Copyright (C) 2017, Advanced Micro Devices, Inc. -Copyright (C) 2014, The University of Texas at Austin - Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/Makefile b/Makefile index d74eba889..6a1bab97f 100644 --- a/Makefile +++ b/Makefile @@ -85,9 +85,6 @@ TESTSUITE_CONF_GEN := input.general TESTSUITE_CONF_OPS := input.operations TESTSUITE_OUT_FILE := output.testsuite -# The name of the file where the version string is stored. -VERSION_FILE := version - # The name of the "special" directories, which contain source code that # use non-standard compiler flags. NOOPT_DIR := noopt @@ -141,7 +138,6 @@ BASE_LIB_PATH := ./$(LIB_DIR)/$(CONFIG_NAME) # Construct the architecture-version string, which will be used to name the # library upon installation. -VERSION := $(shell cat $(DIST_PATH)/$(VERSION_FILE)) VERS_CONF := $(VERSION)-$(CONFIG_NAME) # --- Library names --- diff --git a/README.md b/README.md index b0ba6d345..7f35e3cad 100644 --- a/README.md +++ b/README.md @@ -313,7 +313,7 @@ This project and its associated research was partially sponsored by grants from [Microsoft](http://www.microsoft.com/), [Intel](http://www.intel.com/), [Texas Instruments](http://www.ti.com/), and [AMD](http://www.amd.com/), as well as grants from the [National Science Foundation](http://www.nsf.gov/) (Awards -CCF-0917167 ACI-1148125/1340293, and CCF-1320112). +CCF-0917167, ACI-1148125/1340293, CCF-1320112, and ACI-1550493). 
_Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of diff --git a/build/config.mk.in b/build/config.mk.in index e7a3f3235..ef2ccfc70 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -36,6 +36,10 @@ ifndef CONFIG_MK_INCLUDED CONFIG_MK_INCLUDED := yes +# The version string. This could be the official string or a custom +# string forced at configure-time. +VERSION := @version@ + # The name of the configuration sub-directory. CONFIG_NAME := @config_name@ diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index 9d1b51d0a..8539e1d29 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index 6d09af5cc..56dd3074e 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index 57c9899a0..07f6792db 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -59,7 +59,7 @@ CVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunr # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 0546a474f..582354e96 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := 
ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index f52d1dd67..94808d466 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index 053e11cbb..c4c47467e 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index 053e11cbb..c4c47467e 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index 8d07f2177..eec2f5a56 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -83,7 +83,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index 4353d65cf..91f65f811 100644 --- a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -58,7 +58,8 @@ CVECFLAGS := # --- Determine the archiver and related flags --- AR := emar -ARFLAGS := cru +RANLIB := emranlib +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) 
diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 8c739607a..4a4e6e494 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -83,7 +83,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 104abafe2..a3db40981 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -94,7 +94,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index 8fd9fb65a..89ca32929 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 8e7738b44..e82811357 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index b5c3f159c..93cd1f2c8 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index 765344f79..f35ffdfff 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -72,7 
+72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index f75b9ec55..89bcca269 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -81,7 +81,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index d91df8b68..7bf48d2a4 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -83,7 +83,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index d98452553..e563d9308 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/configure b/configure index 7aabc5b78..9edfaa98b 100755 --- a/configure +++ b/configure @@ -123,6 +123,12 @@ print_usage() echo " compatibility layer. This automatically enables the" echo " BLAS compatibility layer as well." echo " " + echo " --force-version=STRING" + echo " " + echo " Force configure to use an arbitrary version string" + echo " STRING. This option may be useful when repackaging" + echo " custom versions of BLIS by outside organizations." + echo " " echo " -h, --help Output this information and quit." 
echo " " echo " Environment Variables:" @@ -232,6 +238,7 @@ main() blas2blis_int_type_size=32 enable_blas2blis='yes' enable_cblas='no' + force_version='no' # The path to the auto-detection script. auto_detect_sh="${build_dirpath}/auto-detect/auto-detect.sh" @@ -247,14 +254,6 @@ main() dummy_file='_blis_dir_detect.tmp' - # Check whether we need to update the version file. - ${update_version_file_sh} -o "${script_name}" "${version_filepath}" - - - # Query which version of BLIS this is. - version=$(cat ${version_filepath}) - - # Process our command line options. while getopts ":hp:d:t:qi:b:-:" opt; do case $opt in @@ -323,6 +322,9 @@ main() disable-cblas) enable_cblas='no' ;; + force-version=*) + force_version=${OPTARG#*=} + ;; *) print_usage ;; @@ -375,10 +377,27 @@ main() done + # Check whether we need to update the version file. + ${update_version_file_sh} -o "${script_name}" "${version_filepath}" + + + # Query which version of BLIS this is. + version=$(cat ${version_filepath}) + + # Initial message. echo "${script_name}: starting configuration of BLIS ${version}." + # Check if the user requested a custom version string. + if [ "x${force_version}" = "xno" ]; then + echo "${script_name}: configuring with official version string." + else + echo "${script_name}: configuring with custom version string '${force_version}'." + version="${force_version}" + fi + + # Set config_name based on the number of arguments leftover (after command # line option processing). if [ $# = "0" ]; then @@ -574,6 +593,7 @@ main() # to config_mk_out. 
echo "${script_name}: creating ${config_mk_out_path} from ${config_mk_in_path}" cat "${config_mk_in_path}" \ + | sed "s/@version@/${version}/g" \ | sed "s/@config_name@/${config_name}/g" \ | sed "s/@dist_path@/${dist_path_esc}/g" \ | sed "s/@CC@/${cc_esc}/g" \ diff --git a/frame/1/bli_l1v_cntx.c b/frame/1/bli_l1v_cntx.c index 149c20320..243a3d062 100644 --- a/frame/1/bli_l1v_cntx.c +++ b/frame/1/bli_l1v_cntx.c @@ -43,7 +43,7 @@ \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( addv, BLIS_ADDV_KER ) @@ -70,7 +70,7 @@ GENFRONT( swapv, BLIS_SWAPV_KER ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(dep1,_cntx_init)( dt, cntx ); \ @@ -84,7 +84,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv ) @@ -95,7 +95,7 @@ GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -106,7 +106,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( axpyv, BLIS_AXPYV_KER, addv ) @@ -118,7 +118,7 @@ GENFRONT( scalv, BLIS_SCALV_KER, setv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(dep1,_cntx_init)( dt, cntx ); \ @@ -130,7 +130,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( scal2v, BLIS_SCAL2V_KER, setv, copyv ) diff --git a/frame/1/other/packv/bli_packv_cntl.c b/frame/1/other/packv/bli_packv_cntl.c index 13f90a429..b81a6e5d1 100644 --- a/frame/1/other/packv/bli_packv_cntl.c +++ b/frame/1/other/packv/bli_packv_cntl.c @@ -47,7 +47,7 @@ void bli_packv_cntl_init( void ) void bli_packv_cntl_finalize( void ) { - bli_cntl_obj_free( packv_cntl ); + bli_cntl_free_node( packv_cntl ); } packv_t* bli_packv_cntl_obj_create( impl_t impl_type, @@ -105,7 +105,7 @@ cntl_t* bli_packv_cntl_obj_create // that no blocksize partitioning is performed. bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. 
- cntl = bli_cntl_obj_create + cntl = bli_cntl_create_node ( BLIS_NO_PART, var_func, diff --git a/frame/1/other/scalv/bli_scalv_cntl.c b/frame/1/other/scalv/bli_scalv_cntl.c index 9edb6162c..c75977fa1 100644 --- a/frame/1/other/scalv/bli_scalv_cntl.c +++ b/frame/1/other/scalv/bli_scalv_cntl.c @@ -44,7 +44,7 @@ void bli_scalv_cntl_init() void bli_scalv_cntl_finalize() { - bli_cntl_obj_free( scalv_cntl ); + bli_cntl_free_node( scalv_cntl ); } diff --git a/frame/1/other/unpackv/bli_unpackv_cntl.c b/frame/1/other/unpackv/bli_unpackv_cntl.c index 1e1ab93fb..52858fc0b 100644 --- a/frame/1/other/unpackv/bli_unpackv_cntl.c +++ b/frame/1/other/unpackv/bli_unpackv_cntl.c @@ -44,7 +44,7 @@ void bli_unpackv_cntl_init() void bli_unpackv_cntl_finalize() { - bli_cntl_obj_free( unpackv_cntl ); + bli_cntl_free_node( unpackv_cntl ); } unpackv_t* bli_unpackv_cntl_obj_create( impl_t impl_type, diff --git a/frame/1d/bli_l1d_cntx.c b/frame/1d/bli_l1d_cntx.c index 443dc20f7..f22631a5d 100644 --- a/frame/1d/bli_l1d_cntx.c +++ b/frame/1d/bli_l1d_cntx.c @@ -43,7 +43,7 @@ \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( addd, addv ) diff --git a/frame/1f/bli_l1f_cntx.c b/frame/1f/bli_l1f_cntx.c index 58ca4a07c..8e786f2ed 100644 --- a/frame/1f/bli_l1f_cntx.c +++ b/frame/1f/bli_l1f_cntx.c @@ -43,7 +43,7 @@ \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -54,7 +54,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv ) @@ -65,7 +65,7 @@ GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ @@ -77,7 +77,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv ) @@ -88,7 +88,7 @@ GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -105,7 +105,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv ) @@ -116,7 +116,7 @@ GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ @@ -135,7 +135,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( dotxf, BLIS_DOTXF_KER, dotv, dotxv ) diff --git a/frame/1m/bli_l1m_cntx.c b/frame/1m/bli_l1m_cntx.c index 7eb3dcd4c..d7ede7c91 100644 --- a/frame/1m/bli_l1m_cntx.c +++ b/frame/1m/bli_l1m_cntx.c @@ -43,7 +43,7 @@ \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( addm, addv ) @@ -66,7 +66,7 @@ GENFRONT( subm, subv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ @@ -75,7 +75,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( copym, copyv, setv ) diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index 67b01fffb..6effbb522 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -34,7 +34,7 @@ #include "blis.h" -cntl_t* bli_packm_cntl_obj_create +cntl_t* bli_packm_cntl_create_node ( void* var_func, void* packm_var_func, @@ -69,8 +69,9 @@ cntl_t* bli_packm_cntl_obj_create // that no blocksize partitioning is performed. 
bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. - cntl = bli_cntl_obj_create + cntl = bli_cntl_create_node ( + BLIS_NOID, BLIS_NO_PART, var_func, params, diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 057a512ed..ab22e8621 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -80,7 +80,7 @@ typedef struct packm_params_s packm_params_t; // ----------------------------------------------------------------------------- -cntl_t* bli_packm_cntl_obj_create +cntl_t* bli_packm_cntl_create_node ( void* var_func, void* packm_var_func, diff --git a/frame/1m/packm/bli_packm_cntx.c b/frame/1m/packm/bli_packm_cntx.c index 2f4e0b030..75fa24d67 100644 --- a/frame/1m/packm/bli_packm_cntx.c +++ b/frame/1m/packm/bli_packm_cntx.c @@ -41,7 +41,7 @@ void bli_packm_cntx_init( num_t dt, cntx_t* cntx ) { - bli_cntx_obj_create( cntx ); + bli_cntx_create( cntx ); // Initialize the context with kernels that may be needed for the // current operation. @@ -57,5 +57,5 @@ void bli_packm_cntx_init( num_t dt, cntx_t* cntx ) void bli_packm_cntx_finalize( cntx_t* cntx ) { - bli_cntx_obj_free( cntx ); + bli_cntx_free( cntx ); } diff --git a/frame/1m/scalm/bli_scalm_cntl.c b/frame/1m/scalm/bli_scalm_cntl.c index f6008a9a3..24c12bc9e 100644 --- a/frame/1m/scalm/bli_scalm_cntl.c +++ b/frame/1m/scalm/bli_scalm_cntl.c @@ -34,7 +34,7 @@ #include "blis.h" -cntl_t* bli_scalm_cntl_obj_create +cntl_t* bli_scalm_cntl_create_node ( void* var_func, cntl_t* sub_node @@ -46,8 +46,9 @@ cntl_t* bli_scalm_cntl_obj_create // that no blocksize partitioning is performed. bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. 
- cntl = bli_cntl_obj_create + cntl = bli_cntl_create_node ( + BLIS_NOID, BLIS_NO_PART, var_func, NULL, diff --git a/frame/1m/scalm/bli_scalm_cntl.h b/frame/1m/scalm/bli_scalm_cntl.h index 4029a4f10..d6160dca8 100644 --- a/frame/1m/scalm/bli_scalm_cntl.h +++ b/frame/1m/scalm/bli_scalm_cntl.h @@ -33,7 +33,7 @@ */ -cntl_t* bli_scalm_cntl_obj_create +cntl_t* bli_scalm_cntl_create_node ( void* var_func, cntl_t* sub_node diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c index 2900cb3b8..852b0c81e 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.c +++ b/frame/1m/unpackm/bli_unpackm_cntl.c @@ -34,7 +34,7 @@ #include "blis.h" -cntl_t* bli_unpackm_cntl_obj_create +cntl_t* bli_unpackm_cntl_create_node ( void* var_func, void* unpackm_var_func, @@ -55,8 +55,9 @@ cntl_t* bli_unpackm_cntl_obj_create // that no blocksize partitioning is performed. bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. - cntl = bli_cntl_obj_create + cntl = bli_cntl_create_node ( + BLIS_NOID, BLIS_NO_PART, var_func, params, diff --git a/frame/1m/unpackm/bli_unpackm_cntl.h b/frame/1m/unpackm/bli_unpackm_cntl.h index 82d9727fc..96278d406 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.h +++ b/frame/1m/unpackm/bli_unpackm_cntl.h @@ -45,7 +45,7 @@ typedef struct unpackm_params_s unpackm_params_t; // ----------------------------------------------------------------------------- -cntl_t* bli_unpackm_cntl_obj_create +cntl_t* bli_unpackm_cntl_create_node ( void* var_func, void* unpackm_var_func, diff --git a/frame/2/bli_l2_cntx.c b/frame/2/bli_l2_cntx.c index 2207a0aec..dc5020c8d 100644 --- a/frame/2/bli_l2_cntx.c +++ b/frame/2/bli_l2_cntx.c @@ -45,7 +45,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. 
*/ \ @@ -127,7 +127,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. */ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( trmv ) GENFRONT( trsv ) @@ -139,7 +139,7 @@ GENFRONT( trsv ) void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. */ \ @@ -159,7 +159,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. */ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( ger ) @@ -173,7 +173,7 @@ GENFRONT( syr ) void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. */ \ @@ -211,7 +211,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. */ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( hemv ) @@ -224,7 +224,7 @@ GENFRONT( symv ) void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. */ \ @@ -246,7 +246,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. 
*/ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( her2 ) diff --git a/frame/2/gemv/other/bli_gemv_cntl.c b/frame/2/gemv/other/bli_gemv_cntl.c index ecedeaca4..4ccba4ff0 100644 --- a/frame/2/gemv/other/bli_gemv_cntl.c +++ b/frame/2/gemv/other/bli_gemv_cntl.c @@ -152,17 +152,17 @@ void bli_gemv_cntl_init() void bli_gemv_cntl_finalize() { - bli_cntl_obj_free( gemv_cntl_bs_ke_dot ); - bli_cntl_obj_free( gemv_cntl_bs_ke_axpy ); + bli_cntl_free_node( gemv_cntl_bs_ke_dot ); + bli_cntl_free_node( gemv_cntl_bs_ke_axpy ); - bli_cntl_obj_free( gemv_cntl_rp_bs_dot ); - bli_cntl_obj_free( gemv_cntl_rp_bs_axpy ); + bli_cntl_free_node( gemv_cntl_rp_bs_dot ); + bli_cntl_free_node( gemv_cntl_rp_bs_axpy ); - bli_cntl_obj_free( gemv_cntl_cp_bs_dot ); - bli_cntl_obj_free( gemv_cntl_cp_bs_axpy ); + bli_cntl_free_node( gemv_cntl_cp_bs_dot ); + bli_cntl_free_node( gemv_cntl_cp_bs_axpy ); - bli_cntl_obj_free( gemv_cntl_ge_dot ); - bli_cntl_obj_free( gemv_cntl_ge_axpy ); + bli_cntl_free_node( gemv_cntl_ge_dot ); + bli_cntl_free_node( gemv_cntl_ge_axpy ); } diff --git a/frame/2/ger/other/bli_ger_cntl.c b/frame/2/ger/other/bli_ger_cntl.c index 16565ef02..6e35b5f6f 100644 --- a/frame/2/ger/other/bli_ger_cntl.c +++ b/frame/2/ger/other/bli_ger_cntl.c @@ -145,17 +145,17 @@ void bli_ger_cntl_init() void bli_ger_cntl_finalize() { - bli_cntl_obj_free( ger_cntl_bs_ke_row ); - bli_cntl_obj_free( ger_cntl_bs_ke_col ); + bli_cntl_free_node( ger_cntl_bs_ke_row ); + bli_cntl_free_node( ger_cntl_bs_ke_col ); - bli_cntl_obj_free( ger_cntl_rp_bs_row ); - bli_cntl_obj_free( ger_cntl_rp_bs_col ); + bli_cntl_free_node( ger_cntl_rp_bs_row ); + bli_cntl_free_node( ger_cntl_rp_bs_col ); - bli_cntl_obj_free( ger_cntl_cp_bs_row ); - bli_cntl_obj_free( ger_cntl_cp_bs_col ); + bli_cntl_free_node( ger_cntl_cp_bs_row ); + bli_cntl_free_node( ger_cntl_cp_bs_col ); - bli_cntl_obj_free( ger_cntl_ge_row ); - bli_cntl_obj_free( ger_cntl_ge_col ); + bli_cntl_free_node( ger_cntl_ge_row ); + 
bli_cntl_free_node( ger_cntl_ge_col ); } diff --git a/frame/2/hemv/other/bli_hemv_cntl.c b/frame/2/hemv/other/bli_hemv_cntl.c index 8505f615c..4bed7b012 100644 --- a/frame/2/hemv/other/bli_hemv_cntl.c +++ b/frame/2/hemv/other/bli_hemv_cntl.c @@ -108,10 +108,10 @@ void bli_hemv_cntl_init() void bli_hemv_cntl_finalize() { - bli_cntl_obj_free( hemv_cntl_bs_ke_lrow_ucol ); - bli_cntl_obj_free( hemv_cntl_bs_ke_lcol_urow ); - bli_cntl_obj_free( hemv_cntl_ge_lrow_ucol ); - bli_cntl_obj_free( hemv_cntl_ge_lcol_urow ); + bli_cntl_free_node( hemv_cntl_bs_ke_lrow_ucol ); + bli_cntl_free_node( hemv_cntl_bs_ke_lcol_urow ); + bli_cntl_free_node( hemv_cntl_ge_lrow_ucol ); + bli_cntl_free_node( hemv_cntl_ge_lcol_urow ); } diff --git a/frame/2/her/other/bli_her_cntl.c b/frame/2/her/other/bli_her_cntl.c index 932306c21..28ed63f12 100644 --- a/frame/2/her/other/bli_her_cntl.c +++ b/frame/2/her/other/bli_her_cntl.c @@ -97,10 +97,10 @@ void bli_her_cntl_init() void bli_her_cntl_finalize() { - bli_cntl_obj_free( her_cntl_bs_ke_lrow_ucol ); - bli_cntl_obj_free( her_cntl_bs_ke_lcol_urow ); - bli_cntl_obj_free( her_cntl_ge_lrow_ucol ); - bli_cntl_obj_free( her_cntl_ge_lcol_urow ); + bli_cntl_free_node( her_cntl_bs_ke_lrow_ucol ); + bli_cntl_free_node( her_cntl_bs_ke_lcol_urow ); + bli_cntl_free_node( her_cntl_ge_lrow_ucol ); + bli_cntl_free_node( her_cntl_ge_lcol_urow ); } diff --git a/frame/2/her2/other/bli_her2_cntl.c b/frame/2/her2/other/bli_her2_cntl.c index 4a0f5d0f8..199e74c3c 100644 --- a/frame/2/her2/other/bli_her2_cntl.c +++ b/frame/2/her2/other/bli_her2_cntl.c @@ -101,10 +101,10 @@ void bli_her2_cntl_init() void bli_her2_cntl_finalize() { - bli_cntl_obj_free( her2_cntl_bs_ke_lrow_ucol ); - bli_cntl_obj_free( her2_cntl_bs_ke_lcol_urow ); - bli_cntl_obj_free( her2_cntl_ge_lrow_ucol ); - bli_cntl_obj_free( her2_cntl_ge_lcol_urow ); + bli_cntl_free_node( her2_cntl_bs_ke_lrow_ucol ); + bli_cntl_free_node( her2_cntl_bs_ke_lcol_urow ); + bli_cntl_free_node( her2_cntl_ge_lrow_ucol ); + 
bli_cntl_free_node( her2_cntl_ge_lcol_urow ); } diff --git a/frame/2/trmv/other/bli_trmv_cntl.c b/frame/2/trmv/other/bli_trmv_cntl.c index 5fbf872aa..fff406365 100644 --- a/frame/2/trmv/other/bli_trmv_cntl.c +++ b/frame/2/trmv/other/bli_trmv_cntl.c @@ -98,10 +98,10 @@ void bli_trmv_cntl_init() void bli_trmv_cntl_finalize() { - bli_cntl_obj_free( trmv_cntl_bs_ke_nrow_tcol ); - bli_cntl_obj_free( trmv_cntl_bs_ke_ncol_trow ); - bli_cntl_obj_free( trmv_cntl_ge_nrow_tcol ); - bli_cntl_obj_free( trmv_cntl_ge_ncol_trow ); + bli_cntl_free_node( trmv_cntl_bs_ke_nrow_tcol ); + bli_cntl_free_node( trmv_cntl_bs_ke_ncol_trow ); + bli_cntl_free_node( trmv_cntl_ge_nrow_tcol ); + bli_cntl_free_node( trmv_cntl_ge_ncol_trow ); } diff --git a/frame/2/trsv/other/bli_trsv_cntl.c b/frame/2/trsv/other/bli_trsv_cntl.c index 71de48d3c..9eedb5a9f 100644 --- a/frame/2/trsv/other/bli_trsv_cntl.c +++ b/frame/2/trsv/other/bli_trsv_cntl.c @@ -101,10 +101,10 @@ void bli_trsv_cntl_init() void bli_trsv_cntl_finalize() { - bli_cntl_obj_free( trsv_cntl_bs_ke_nrow_tcol ); - bli_cntl_obj_free( trsv_cntl_bs_ke_ncol_trow ); - bli_cntl_obj_free( trsv_cntl_ge_nrow_tcol ); - bli_cntl_obj_free( trsv_cntl_ge_ncol_trow ); + bli_cntl_free_node( trsv_cntl_bs_ke_nrow_tcol ); + bli_cntl_free_node( trsv_cntl_bs_ke_ncol_trow ); + bli_cntl_free_node( trsv_cntl_ge_nrow_tcol ); + bli_cntl_free_node( trsv_cntl_ge_ncol_trow ); } diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 630cf03a5..d25f5f924 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -43,10 +43,11 @@ dim_t bli_l3_determine_kc obj_t* a, obj_t* b, bszid_t bszid, - cntx_t* cntx + cntx_t* cntx, + cntl_t* cntl ) { - opid_t family = bli_cntx_family( cntx ); + opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index 8f9f7ad80..02250efc0 100644 --- 
a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -32,6 +32,18 @@ */ +dim_t bli_l3_determine_kc + ( + dir_t direct, + dim_t i, + dim_t dim, + obj_t* a, + obj_t* b, + bszid_t bszid, + cntx_t* cntx, + cntl_t* cntl + ); + #undef GENPROT #define GENPROT( opname ) \ @@ -47,8 +59,6 @@ dim_t PASTEMAC0(opname) \ cntx_t* cntx \ ); -GENPROT( l3_determine_kc ) - GENPROT( gemm_determine_kc ) GENPROT( herk_determine_kc ) GENPROT( trmm_determine_kc ) diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index 4fe3fe7f5..db821b811 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -37,10 +37,10 @@ void bli_l3_cntl_create_if ( + opid_t family, obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx, cntl_t* cntl_orig, cntl_t** cntl_use ) @@ -49,8 +49,6 @@ void bli_l3_cntl_create_if // tree as a function of the operation family. if ( cntl_orig == NULL ) { - opid_t family = bli_cntx_get_family( cntx ); - if ( family == BLIS_GEMM || family == BLIS_HERK || family == BLIS_TRMM ) @@ -73,6 +71,10 @@ void bli_l3_cntl_create_if // instead (so that threads can use its local tree as a place to // cache things like pack mem_t entries). *cntl_use = bli_cntl_copy( cntl_orig ); + + // Recursively set the family fields of the newly copied control tree + // nodes. + bli_cntl_mark_family( family, *cntl_use ); } } @@ -81,7 +83,6 @@ void bli_l3_cntl_free_if obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx, cntl_t* cntl_orig, cntl_t* cntl_use, thrinfo_t* thread @@ -91,7 +92,7 @@ void bli_l3_cntl_free_if // been created, so we now must free it. 
if ( cntl_orig == NULL ) { - opid_t family = bli_cntx_get_family( cntx ); + opid_t family = bli_cntl_family( cntl_use ); if ( family == BLIS_GEMM || family == BLIS_HERK || diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h index dc0aeb869..3bdd8b43f 100644 --- a/frame/3/bli_l3_cntl.h +++ b/frame/3/bli_l3_cntl.h @@ -39,10 +39,10 @@ void bli_l3_cntl_create_if ( + opid_t family, obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx, cntl_t* cntl_orig, cntl_t** cntl_use ); @@ -52,7 +52,6 @@ void bli_l3_cntl_free_if obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx, cntl_t* cntl_orig, cntl_t* cntl_use, thrinfo_t* thread diff --git a/frame/3/bli_l3_cntx.c b/frame/3/bli_l3_cntx.c index 161e68160..a8441fa79 100644 --- a/frame/3/bli_l3_cntx.c +++ b/frame/3/bli_l3_cntx.c @@ -41,7 +41,7 @@ void bli_gemm_cntx_init( num_t dt, cntx_t* cntx ) { // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -76,7 +76,7 @@ void bli_gemm_cntx_finalize( cntx_t* cntx ) void bli_trsm_cntx_init( num_t dt, cntx_t* cntx ) { // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c index 993501541..f1c661007 100644 --- a/frame/3/bli_l3_direct.c +++ b/frame/3/bli_l3_direct.c @@ -39,11 +39,11 @@ dir_t bli_l3_direct obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx + cntl_t* cntl ) { // Query the operation family. 
- opid_t family = bli_cntx_family( cntx ); + opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c ); else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c ); diff --git a/frame/3/bli_l3_direct.h b/frame/3/bli_l3_direct.h index 7b88ba51f..021dfde74 100644 --- a/frame/3/bli_l3_direct.h +++ b/frame/3/bli_l3_direct.h @@ -37,7 +37,7 @@ dir_t bli_l3_direct obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx + cntl_t* cntl ); // ----------------------------------------------------------------------------- diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c index 28fb1f857..82383f93a 100644 --- a/frame/3/bli_l3_packm.c +++ b/frame/3/bli_l3_packm.c @@ -115,12 +115,13 @@ void bli_l3_packm // buffer, then a block has already been acquired from the memory // broker and cached in the control tree. - // BUT, we need to make sure that the mem_t object is not associated - // with a block that is too small given the size of the packed matrix - // that we need, according to the return value from packm_init(). + // As a sanity check, we should make sure that the mem_t object isn't + // associated with a block that is too small compared to the size of + // the packed matrix buffer that is needed, according to the return + // value from packm_init(). siz_t cntl_mem_size = bli_mem_size( cntl_mem_p ); - if ( size_needed < cntl_mem_size ) + if ( cntl_mem_size < size_needed ) { if ( bli_thread_am_ochief( thread ) ) { diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c index f908bbb64..a14c543d8 100644 --- a/frame/3/bli_l3_prune.c +++ b/frame/3/bli_l3_prune.c @@ -40,11 +40,11 @@ void bli_l3_prune_unref_mparts_m obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx + cntl_t* cntl ) { // Query the operation family. - opid_t family = bli_cntx_family( cntx ); + opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm. 
else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c ); @@ -61,11 +61,11 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \ obj_t* a, \ obj_t* b, \ obj_t* c, \ - cntx_t* cntx \ + cntl_t* cntl \ ) \ { \ /* Query the operation family. */ \ - opid_t family = bli_cntx_family( cntx ); \ + opid_t family = bli_cntl_family( cntl ); \ \ if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \ else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \ diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h index 13d661ff1..6905e21f4 100644 --- a/frame/3/bli_l3_prune.h +++ b/frame/3/bli_l3_prune.h @@ -41,7 +41,7 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \ obj_t* a, \ obj_t* b, \ obj_t* c, \ - cntx_t* cntx \ + cntl_t* cntl \ ); GENPROT( m ) diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 1a5693d8c..8fc062da2 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -53,10 +53,10 @@ void bli_gemm_blk_var1 dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_m( a, b, c, cntx ); + bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. bli_thread_get_range_mdim diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index a65f8a20a..ff2a570db 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -53,10 +53,10 @@ void bli_gemm_blk_var2 dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. 
- bli_l3_prune_unref_mparts_n( a, b, c, cntx ); + bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. bli_thread_get_range_ndim diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 0148428df..64ab573da 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -53,10 +53,10 @@ void bli_gemm_blk_var3 dim_t k_trans; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_k( a, b, c, cntx ); + bli_l3_prune_unref_mparts_k( a, b, c, cntl ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); @@ -66,7 +66,7 @@ void bli_gemm_blk_var3 { // Determine the current algorithmic blocksize. b_alg = bli_l3_determine_kc( direct, i, k_trans, a, b, - bli_cntl_bszid( cntl ), cntx ); + bli_cntl_bszid( cntl ), cntx, cntl ); // Acquire partitions for A1 and B1. bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, @@ -109,7 +109,7 @@ void bli_gemm_blk_var3 // row-panel of C, and thus beta is applied to all of C exactly once. // Thus, for neither trmm nor trmm3 should we reset the scalar on C // after the first iteration. - if ( bli_cntx_get_family( cntx ) != BLIS_TRMM ) + if ( bli_cntl_family( cntl ) != BLIS_TRMM ) if ( i == 0 ) bli_obj_scalar_reset( c ); } } diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 775ca2544..b17ce10ac 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -56,22 +56,24 @@ cntl_t* bli_gemmbp_cntl_create else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; // Create two nodes for the macro-kernel. 
- cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node ( + family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); - cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_create_node ( + family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, gemm_cntl_bu_ke ); // Create a node for packing matrix A. - cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( bli_gemm_packa, // pack the left-hand operand bli_packm_blk_var1, @@ -86,15 +88,16 @@ cntl_t* bli_gemmbp_cntl_create ); // Create a node for partitioning the m dimension by MC. - cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_create_node ( + family, BLIS_MC, bli_gemm_blk_var1, gemm_cntl_packa ); // Create a node for packing matrix B. - cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, @@ -109,16 +112,18 @@ cntl_t* bli_gemmbp_cntl_create ); // Create a node for partitioning the k dimension by KC. - cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node ( + family, BLIS_KC, bli_gemm_blk_var3, gemm_cntl_packb ); // Create a node for partitioning the n dimension by NC. - cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node ( + family, BLIS_NC, bli_gemm_blk_var2, gemm_cntl_mm_op @@ -141,15 +146,17 @@ cntl_t* bli_gemmpb_cntl_create //else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; // Create two nodes for the macro-kernel. 
- cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_create_node ( + family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); - cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_create_node ( + family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, gemm_cntl_ub_ke @@ -157,7 +164,7 @@ cntl_t* bli_gemmpb_cntl_create // Create a node for packing matrix A (which is really the right-hand // operand "B"). - cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, @@ -172,8 +179,9 @@ cntl_t* bli_gemmpb_cntl_create ); // Create a node for partitioning the n dimension by MC. - cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_create_node ( + family, BLIS_MC, bli_gemm_blk_var2, gemm_cntl_packb @@ -181,7 +189,7 @@ cntl_t* bli_gemmpb_cntl_create // Create a node for packing matrix B (which is really the left-hand // operand "A"). - cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( bli_gemm_packa, // pack the left-hand operand bli_packm_blk_var1, @@ -196,16 +204,18 @@ cntl_t* bli_gemmpb_cntl_create ); // Create a node for partitioning the k dimension by KC. - cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node ( + family, BLIS_KC, bli_gemm_blk_var3, gemm_cntl_packa ); // Create a node for partitioning the m dimension by NC. 
- cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node ( + family, BLIS_NC, bli_gemm_blk_var1, gemm_cntl_mm_op @@ -227,13 +237,14 @@ void bli_gemm_cntl_free // ----------------------------------------------------------------------------- -cntl_t* bli_gemm_cntl_obj_create +cntl_t* bli_gemm_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node ) { - return bli_cntl_obj_create( bszid, var_func, NULL, sub_node ); + return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 6da6cd768..3b643e1fc 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -59,8 +59,9 @@ void bli_gemm_cntl_free // ----------------------------------------------------------------------------- -cntl_t* bli_gemm_cntl_obj_create +cntl_t* bli_gemm_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index d4b0bde6e..508f1264d 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -46,70 +46,68 @@ void bli_gemm_front cntl_t* cntl ) { + #ifdef BLIS_SMALL_MATRIX_ENABLE gint_t status = bli_gemm_small_matrix(alpha, a, b, beta, c, cntx, cntl); if(BLIS_SUCCESS != status) #endif { - obj_t a_local; - obj_t b_local; - obj_t c_local; + obj_t a_local; + obj_t b_local; + obj_t c_local; - // Check parameters. - if ( bli_error_checking_is_enabled() ) - bli_gemm_check( alpha, a, b, beta, c, cntx ); + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_gemm_check( alpha, a, b, beta, c, cntx ); - // If alpha is zero, scale by beta and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - return; - } + // If alpha is zero, scale by beta and return. 
+ if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) + { + bli_scalm( beta, c ); + return; + } - // Reinitialize the memory allocator to accommodate the blocksizes - // in the current context. - bli_memsys_reinit( cntx ); + // Reinitialize the memory allocator to accommodate the blocksizes + // in the current context. + bli_memsys_reinit( cntx ); - // Alias A, B, and C in case we need to apply transformations. - bli_obj_alias_to( *a, a_local ); - bli_obj_alias_to( *b, b_local ); - bli_obj_alias_to( *c, c_local ); + // Alias A, B, and C in case we need to apply transformations. + bli_obj_alias_to( *a, a_local ); + bli_obj_alias_to( *b, b_local ); + bli_obj_alias_to( *c, c_local ); - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_swap( a_local, b_local ); + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( bli_cntx_l3_ukr_eff_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + { + bli_obj_swap( a_local, b_local ); - bli_obj_induce_trans( a_local ); - bli_obj_induce_trans( b_local ); - bli_obj_induce_trans( c_local ); - } + bli_obj_induce_trans( a_local ); + bli_obj_induce_trans( b_local ); + bli_obj_induce_trans( c_local ); + } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_GEMM, cntx ); + // Record the threading for each level within the context. 
+ bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx, + bli_obj_length( c_local ), + bli_obj_width( c_local ), + bli_obj_width( a_local ) ); - // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx, - bli_obj_length( c_local ), - bli_obj_width( c_local ), - bli_obj_width( a_local ) ); - - // Invoke the internal back-end via the thread handler. - bli_l3_thread_decorator - ( - bli_gemm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - cntx, - cntl - ); + // Invoke the internal back-end via the thread handler. + bli_l3_thread_decorator + ( + bli_gemm_int, + BLIS_GEMM, // operation family id + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl + ); } - } diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 340aa7edc..8d7f8d635 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -89,9 +89,6 @@ void bli_hemm_front bli_obj_swap( a_local, b_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_GEMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -102,6 +99,7 @@ void bli_hemm_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_GEMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index c6851d2a4..e203d59ba 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -107,9 +107,6 @@ void bli_her2k_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_HERK, cntx ); - // Record the threading for each level within the context. 
bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -122,6 +119,7 @@ void bli_her2k_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &a_local, &bh_local, @@ -134,6 +132,7 @@ void bli_her2k_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id &alpha_conj, &b_local, &ah_local, diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 642be0d99..227b97d5d 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -87,9 +87,6 @@ void bli_herk_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_HERK, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -100,6 +97,7 @@ void bli_herk_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &a_local, &ah_local, diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 57aa11f73..a01ed15cf 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -88,9 +88,6 @@ void bli_symm_front bli_obj_swap( a_local, b_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_GEMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -101,6 +98,7 @@ void bli_symm_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_GEMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index f64a765e5..459cdbdd0 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -88,9 +88,6 @@ void bli_syr2k_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. 
- bli_cntx_set_family( BLIS_HERK, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -103,6 +100,7 @@ void bli_syr2k_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &a_local, &bt_local, @@ -115,6 +113,7 @@ void bli_syr2k_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &b_local, &at_local, diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index 42d135659..eba91cfd9 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -81,9 +81,6 @@ void bli_syrk_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_HERK, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -94,6 +91,7 @@ void bli_syrk_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &a_local, &at_local, diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index b44ddfcff..75549e2d0 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -131,9 +131,6 @@ void bli_trmm_front bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_TRMM, cntx ); - // Record the threading for each level within the context. 
bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx, bli_obj_length( c_local ), @@ -144,6 +141,7 @@ void bli_trmm_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_TRMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index e672f7af3..f89b6ad96 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -130,9 +130,6 @@ void bli_trmm3_front bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_TRMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx, bli_obj_length( c_local ), @@ -143,6 +140,7 @@ void bli_trmm3_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_TRMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index a731d8265..67b046952 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -53,10 +53,10 @@ void bli_trsm_blk_var1 dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_m( a, b, c, cntx ); + bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. bli_thread_get_range_mdim diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index a133f0bb0..48e4b4f1c 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -53,10 +53,10 @@ void bli_trsm_blk_var2 dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). 
- direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_n( a, b, c, cntx ); + bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. bli_thread_get_range_ndim diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 7b428c8ef..d4e809c50 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -53,10 +53,10 @@ void bli_trsm_blk_var3 dim_t k_trans; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_k( a, b, c, cntx ); + bli_l3_prune_unref_mparts_k( a, b, c, cntl ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 78bd5eeb9..e05fc3d20 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -50,23 +50,27 @@ cntl_t* bli_trsm_l_cntl_create { void* macro_kernel_p = bli_trsm_xx_ker_var2; + const opid_t family = BLIS_TRSM; + // Create two nodes for the macro-kernel. - cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( + family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); - cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( + family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, trsm_cntl_bu_ke ); // Create a node for packing matrix A. 
- cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create + cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( bli_trsm_packa, bli_packm_blk_var1, @@ -81,15 +85,16 @@ cntl_t* bli_trsm_l_cntl_create ); // Create a node for partitioning the m dimension by MC. - cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( + family, BLIS_MC, bli_trsm_blk_var1, trsm_cntl_packa ); // Create a node for packing matrix B. - cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create + cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( bli_trsm_packb, bli_packm_blk_var1, @@ -104,16 +109,18 @@ cntl_t* bli_trsm_l_cntl_create ); // Create a node for partitioning the k dimension by KC. - cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( + family, BLIS_KC, bli_trsm_blk_var3, trsm_cntl_packb ); // Create a node for partitioning the n dimension by NC. - cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( + family, BLIS_NC, bli_trsm_blk_var2, trsm_cntl_mm_op @@ -129,23 +136,27 @@ cntl_t* bli_trsm_r_cntl_create { void* macro_kernel_p = bli_trsm_xx_ker_var2; + const opid_t family = BLIS_TRSM; + // Create two nodes for the macro-kernel. - cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( + family, BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); - cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( + family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, trsm_cntl_bu_ke ); // Create a node for packing matrix A. 
- cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create + cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( bli_trsm_packa, bli_packm_blk_var1, @@ -160,15 +171,16 @@ cntl_t* bli_trsm_r_cntl_create ); // Create a node for partitioning the m dimension by MC. - cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( + family, BLIS_MC, bli_trsm_blk_var1, trsm_cntl_packa ); // Create a node for packing matrix B. - cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create + cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( bli_trsm_packb, bli_packm_blk_var1, @@ -183,16 +195,18 @@ cntl_t* bli_trsm_r_cntl_create ); // Create a node for partitioning the k dimension by KC. - cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( + family, BLIS_KC, bli_trsm_blk_var3, trsm_cntl_packb ); // Create a node for partitioning the n dimension by NC. - cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( + family, BLIS_NC, bli_trsm_blk_var2, trsm_cntl_mm_op @@ -212,13 +226,14 @@ void bli_trsm_cntl_free // ----------------------------------------------------------------------------- -cntl_t* bli_trsm_cntl_obj_create +cntl_t* bli_trsm_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node ) { - return bli_cntl_obj_create( bszid, var_func, NULL, sub_node ); + return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index 6dbe9adce..cfd20cad3 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -55,8 +55,9 @@ void bli_trsm_cntl_free // ----------------------------------------------------------------------------- -cntl_t* bli_trsm_cntl_obj_create +cntl_t* bli_trsm_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node diff --git a/frame/3/trsm/bli_trsm_front.c 
b/frame/3/trsm/bli_trsm_front.c index 42bda8a51..47cff8b48 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -122,9 +122,6 @@ void bli_trsm_front bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_TRSM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx, bli_obj_length( c_local ), @@ -135,6 +132,7 @@ void bli_trsm_front bli_l3_thread_decorator ( bli_trsm_int, + BLIS_TRSM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/trsm/old/bli_trsm_cntl.c b/frame/3/trsm/old/bli_trsm_cntl.c index 3a83faafd..de018d64a 100644 --- a/frame/3/trsm/old/bli_trsm_cntl.c +++ b/frame/3/trsm/old/bli_trsm_cntl.c @@ -64,7 +64,7 @@ void bli_trsm_cntl_init() // Create control tree objects for packm operations (left side). trsm_l_packa_cntl = - bli_packm_cntl_obj_create( BLIS_BLOCKED, + bli_packm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, // IMPORTANT: n dim multiple must be mr to // support right and bottom-right edge cases @@ -78,7 +78,7 @@ void bli_trsm_cntl_init() trsm_l_packb_cntl = - bli_packm_cntl_obj_create( BLIS_BLOCKED, + bli_packm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, // IMPORTANT: m dim multiple must be mr since // B_pack is updated (ie: serves as C) in trsm @@ -93,7 +93,7 @@ void bli_trsm_cntl_init() // Create control tree objects for packm operations (right side). trsm_r_packa_cntl = - bli_packm_cntl_obj_create( BLIS_BLOCKED, + bli_packm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_NR, BLIS_MR, @@ -105,7 +105,7 @@ void bli_trsm_cntl_init() trsm_r_packb_cntl = - bli_packm_cntl_obj_create( BLIS_BLOCKED, + bli_packm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, // pack panels of B compactly BLIS_MR, BLIS_MR, @@ -119,7 +119,7 @@ void bli_trsm_cntl_init() // Create control tree object for lowest-level block-panel kernel. 
trsm_cntl_bp_ke = - bli_trsm_cntl_obj_create( BLIS_UNB_OPT, + bli_trsm_cntl_create_node( BLIS_UNB_OPT, BLIS_VARIANT2, 0, // bszid_t not used by macro-kernel NULL, NULL, NULL, NULL, @@ -129,7 +129,7 @@ void bli_trsm_cntl_init() // problem (left side). trsm_l_cntl_op_bp = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_MC, NULL, @@ -144,7 +144,7 @@ void bli_trsm_cntl_init() // rank-k (outer panel) updates (left side). trsm_l_cntl_mm_op = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT3, BLIS_KC, NULL, @@ -159,7 +159,7 @@ void bli_trsm_cntl_init() // general problems (left side). trsm_l_cntl_vl_mm = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_NC, NULL, @@ -174,7 +174,7 @@ void bli_trsm_cntl_init() // problem (right side). trsm_r_cntl_op_bp = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_MC, NULL, @@ -189,7 +189,7 @@ void bli_trsm_cntl_init() // rank-k (outer panel) updates (right side). trsm_r_cntl_mm_op = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT3, BLIS_KC, NULL, @@ -204,7 +204,7 @@ void bli_trsm_cntl_init() // general problems (right side). 
trsm_r_cntl_vl_mm = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_NC, NULL, @@ -222,22 +222,22 @@ void bli_trsm_cntl_init() void bli_trsm_cntl_finalize() { - bli_cntl_obj_free( trsm_l_packa_cntl ); - bli_cntl_obj_free( trsm_l_packb_cntl ); - bli_cntl_obj_free( trsm_r_packa_cntl ); - bli_cntl_obj_free( trsm_r_packb_cntl ); + bli_cntl_free_node( trsm_l_packa_cntl ); + bli_cntl_free_node( trsm_l_packb_cntl ); + bli_cntl_free_node( trsm_r_packa_cntl ); + bli_cntl_free_node( trsm_r_packb_cntl ); - bli_cntl_obj_free( trsm_cntl_bp_ke ); + bli_cntl_free_node( trsm_cntl_bp_ke ); - bli_cntl_obj_free( trsm_l_cntl_op_bp ); - bli_cntl_obj_free( trsm_l_cntl_mm_op ); - bli_cntl_obj_free( trsm_l_cntl_vl_mm ); - bli_cntl_obj_free( trsm_r_cntl_op_bp ); - bli_cntl_obj_free( trsm_r_cntl_mm_op ); - bli_cntl_obj_free( trsm_r_cntl_vl_mm ); + bli_cntl_free_node( trsm_l_cntl_op_bp ); + bli_cntl_free_node( trsm_l_cntl_mm_op ); + bli_cntl_free_node( trsm_l_cntl_vl_mm ); + bli_cntl_free_node( trsm_r_cntl_op_bp ); + bli_cntl_free_node( trsm_r_cntl_mm_op ); + bli_cntl_free_node( trsm_r_cntl_vl_mm ); } -trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, +trsm_t* bli_trsm_cntl_create_node( impl_t impl_type, varnum_t var_num, bszid_t bszid, scalm_t* sub_scalm, diff --git a/frame/3/trsm/old/bli_trsm_cntl.h b/frame/3/trsm/old/bli_trsm_cntl.h index 651cc8599..bcdd1dfc7 100644 --- a/frame/3/trsm/old/bli_trsm_cntl.h +++ b/frame/3/trsm/old/bli_trsm_cntl.h @@ -51,7 +51,7 @@ typedef struct trsm_s trsm_t; void bli_trsm_cntl_init( void ); void bli_trsm_cntl_finalize( void ); -trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, +trsm_t* bli_trsm_cntl_create_node( impl_t impl_type, varnum_t var_num, bszid_t bszid, scalm_t* sub_scalm, diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 0f8e38688..6d27c52d5 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -35,7 +35,7 @@ #include "blis.h" -blksz_t* bli_blksz_obj_create 
+blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, @@ -47,16 +47,39 @@ blksz_t* bli_blksz_obj_create b = ( blksz_t* ) bli_malloc_intl( sizeof(blksz_t) ); - bli_blksz_obj_init( b, - b_s, be_s, - b_d, be_d, - b_c, be_c, - b_z, be_z ); + bli_blksz_init_ed + ( + b, + b_s, be_s, + b_d, be_d, + b_c, be_c, + b_z, be_z + ); return b; } -void bli_blksz_obj_init +blksz_t* bli_blksz_create + ( + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, + dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z + ) +{ + blksz_t* b; + + b = ( blksz_t* ) bli_malloc_intl( sizeof(blksz_t) ); + + bli_blksz_init + ( + b, + b_s, b_d, b_c, b_z, + be_s, be_d, be_c, be_z + ); + + return b; +} + +void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, @@ -75,7 +98,45 @@ void bli_blksz_obj_init b->e[BLIS_DCOMPLEX] = be_z; } -void bli_blksz_obj_free +void bli_blksz_init + ( + blksz_t* b, + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, + dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z + ) +{ + b->v[BLIS_FLOAT] = b_s; + b->v[BLIS_DOUBLE] = b_d; + b->v[BLIS_SCOMPLEX] = b_c; + b->v[BLIS_DCOMPLEX] = b_z; + + // Interpret a zero as a request for the default value. + b->e[BLIS_FLOAT] = ( be_s == 0 ? b_s : be_s ); + b->e[BLIS_DOUBLE] = ( be_d == 0 ? b_d : be_d ); + b->e[BLIS_SCOMPLEX] = ( be_c == 0 ? b_c : be_c ); + b->e[BLIS_DCOMPLEX] = ( be_z == 0 ? b_z : be_z ); +} + +void bli_blksz_init_easy + ( + blksz_t* b, + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z + ) +{ + b->v[BLIS_FLOAT] = b_s; + b->v[BLIS_DOUBLE] = b_d; + b->v[BLIS_SCOMPLEX] = b_c; + b->v[BLIS_DCOMPLEX] = b_z; + + // Here we assume the maximum blocksize values can be the same as the + // default values. + b->e[BLIS_FLOAT] = b_s; + b->e[BLIS_DOUBLE] = b_d; + b->e[BLIS_SCOMPLEX] = b_c; + b->e[BLIS_DCOMPLEX] = b_z; +} + +void bli_blksz_free ( blksz_t* b ) @@ -302,6 +363,11 @@ dim_t bli_determine_blocksize_b_sub // chunk that will correspond to the blocksize we are computing now. 
dim_left_now = dim - i; + // Sanity check: if dim_left_now is zero, then we can return zero + // without going any further. + if ( dim_left_now == 0 ) + return 0; + dim_at_edge = dim_left_now % b_alg; // If dim_left_now is a multiple of b_alg, we can safely return b_alg diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index cfe2023e1..abd066f88 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -50,15 +50,6 @@ *(max) = bli_blksz_get_max( dt, b ); \ } -#define bli_blksz_get_def_for_obj( obj, b ) \ -\ - bli_blksz_get_def( bli_obj_datatype( *(obj) ), b ) - -#define bli_blksz_get_max_for_obj( obj, b ) \ -\ - bli_blksz_get_max( bli_obj_datatype( *(obj) ), b ) - - // blksz_t modification #define bli_blksz_set_def( val, dt, b ) \ @@ -85,8 +76,11 @@ #define bli_blksz_copy_dt( dt_src, b_src, \ dt_dst, b_dst ) \ { \ - (b_dst)->v[ dt_dst ] = (b_src)->v[ dt_src ]; \ - (b_dst)->e[ dt_dst ] = (b_src)->e[ dt_src ]; \ + const dim_t v_src = bli_blksz_get_def( dt_src, b_src ); \ + const dim_t e_src = bli_blksz_get_max( dt_src, b_src ); \ +\ + bli_blksz_set_def( v_src, dt_dst, b_dst ); \ + bli_blksz_set_max( e_src, dt_dst, b_dst ); \ } #define bli_blksz_scale_def( num, den, dt, b ) \ @@ -109,7 +103,7 @@ // ----------------------------------------------------------------------------- -blksz_t* bli_blksz_obj_create +blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, @@ -117,7 +111,13 @@ blksz_t* bli_blksz_obj_create dim_t b_z, dim_t be_z ); -void bli_blksz_obj_init +blksz_t* bli_blksz_create + ( + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, + dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z + ); + +void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, @@ -126,7 +126,20 @@ void bli_blksz_obj_init dim_t b_z, dim_t be_z ); -void bli_blksz_obj_free +void bli_blksz_init + ( + blksz_t* b, + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, + dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z + ); + +void bli_blksz_init_easy + ( + blksz_t* b, + 
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z + ); + +void bli_blksz_free ( blksz_t* b ); diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index cac290da9..90b2634a5 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -34,8 +34,9 @@ #include "blis.h" -cntl_t* bli_cntl_obj_create +cntl_t* bli_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, void* params, @@ -48,6 +49,7 @@ cntl_t* bli_cntl_obj_create // Allocate the cntl_t struct. cntl = bli_malloc_intl( sizeof( cntl_t ) ); + bli_cntl_set_family( family, cntl ); bli_cntl_set_bszid( bszid, cntl ); bli_cntl_set_var_func( var_func, cntl ); bli_cntl_set_params( params, cntl ); @@ -63,7 +65,7 @@ cntl_t* bli_cntl_obj_create return cntl; } -void bli_cntl_obj_free +void bli_cntl_free_node ( cntl_t* cntl ) @@ -71,7 +73,7 @@ void bli_cntl_obj_free bli_free_intl( cntl ); } -void bli_cntl_obj_clear +void bli_cntl_clear_node ( cntl_t* cntl ) @@ -141,7 +143,7 @@ void bli_cntl_free_w_thrinfo } // Free the current node. - bli_cntl_obj_free( cntl ); + bli_cntl_free_node( cntl ); } void bli_cntl_free_wo_thrinfo @@ -177,7 +179,7 @@ void bli_cntl_free_wo_thrinfo } // Free the current node. - bli_cntl_obj_free( cntl ); + bli_cntl_free_node( cntl ); } // ----------------------------------------------------------------------------- @@ -189,10 +191,11 @@ cntl_t* bli_cntl_copy { // Make a copy of the current node. Notice that the source node // should NOT have any allocated/cached mem_t entries, and that - // bli_cntl_obj_create() creates a node with a cleared mem_t + // bli_cntl_create_node() creates a node with a cleared mem_t // field. - cntl_t* cntl_copy = bli_cntl_obj_create + cntl_t* cntl_copy = bli_cntl_create_node ( + bli_cntl_family( cntl ), bli_cntl_bszid( cntl ), bli_cntl_var_func( cntl ), NULL, NULL @@ -234,3 +237,23 @@ cntl_t* bli_cntl_copy return cntl_copy; } +void bli_cntl_mark_family + ( + opid_t family, + cntl_t* cntl + ) +{ + // Set the family of the root node. 
+ bli_cntl_set_family( family, cntl ); + + // Continue as long as the current node has a valid child. + while ( bli_cntl_sub_node( cntl ) != NULL ) + { + // Move down the tree to the child node. + cntl = bli_cntl_sub_node( cntl ); + + // Set the family of the current node. + bli_cntl_set_family( family, cntl ); + } +} + diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index fd0413f4f..332a6cd70 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -39,6 +39,7 @@ struct cntl_s { // Basic fields (usually required). + opid_t family; bszid_t bszid; void* var_func; struct cntl_s* sub_node; @@ -57,20 +58,21 @@ typedef struct cntl_s cntl_t; // -- Control tree prototypes -- -cntl_t* bli_cntl_obj_create +cntl_t* bli_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, void* params, cntl_t* sub_node ); -void bli_cntl_obj_free +void bli_cntl_free_node ( cntl_t* cntl ); -void bli_cntl_obj_clear +void bli_cntl_clear_node ( cntl_t* cntl ); @@ -99,10 +101,20 @@ cntl_t* bli_cntl_copy cntl_t* cntl ); +void bli_cntl_mark_family + ( + opid_t family, + cntl_t* cntl + ); + // ----------------------------------------------------------------------------- // cntl_t query (fields only) +#define bli_cntl_family( cntl ) \ +\ + ( cntl->family ) + #define bli_cntl_bszid( cntl ) \ \ ( cntl->bszid ) @@ -139,6 +151,11 @@ cntl_t* bli_cntl_copy // cntl_t modification +#define bli_cntl_set_family( family0, cntl ) \ +{ \ + cntl->family = family0; \ +} + #define bli_cntl_set_bszid( bszid0, cntl ) \ { \ cntl->bszid = bszid0; \ diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 673987bfd..d4c4487ed 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -39,14 +39,14 @@ // NOTE: Since these functions currently do nothing, they are defined // as empty macros in bli_cntx. 
// -void bli_cntx_obj_create( cntx_t* cntx ) +void bli_cntx_create( cntx_t* cntx ) { // Since cntx_t objects contain statically-allocated arrays, // we don't need to do anything in order to create the cntx_t // instance. } -void bli_cntx_obj_free( cntx_t* cntx ) +void bli_cntx_free( cntx_t* cntx ) { // Just as we don't need to do anything in order to create a // cntx_t instance, we don't need to do anything to destory @@ -54,7 +54,7 @@ void bli_cntx_obj_free( cntx_t* cntx ) } #endif -void bli_cntx_obj_clear( cntx_t* cntx ) +void bli_cntx_clear( cntx_t* cntx ) { // Fill the entire cntx_t structure with zeros. memset( ( void* )cntx, 0, sizeof( cntx ) ); @@ -108,8 +108,11 @@ void bli_cntx_init( cntx_t* cntx ) // ----------------------------------------------------------------------------- -blksz_t* bli_cntx_get_blksz( bszid_t bs_id, - cntx_t* cntx ) +blksz_t* bli_cntx_get_blksz + ( + bszid_t bs_id, + cntx_t* cntx + ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; @@ -142,8 +145,11 @@ dim_t bli_cntx_get_blksz_max_dt( num_t dt, } #endif -blksz_t* bli_cntx_get_bmult( bszid_t bs_id, - cntx_t* cntx ) +blksz_t* bli_cntx_get_bmult + ( + bszid_t bs_id, + cntx_t* cntx + ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); @@ -166,8 +172,11 @@ dim_t bli_cntx_get_bmult_dt( num_t dt, } #endif -func_t* bli_cntx_get_l3_ukr( l3ukr_t ukr_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l3_ukr + ( + l3ukr_t ukr_id, + cntx_t* cntx + ) { func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); @@ -210,8 +219,11 @@ void* bli_cntx_get_l3_ukr_dt( num_t dt, } #endif -func_t* bli_cntx_get_l3_vir_ukr( l3ukr_t ukr_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l3_vir_ukr + ( + l3ukr_t ukr_id, + cntx_t* cntx + ) { func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* l3_vir_ukr = &l3_vir_ukrs[ ukr_id ]; @@ -235,8 +247,11 @@ void* bli_cntx_get_l3_vir_ukr_dt( 
num_t dt, } #endif -func_t* bli_cntx_get_l3_nat_ukr( l3ukr_t ukr_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l3_nat_ukr + ( + l3ukr_t ukr_id, + cntx_t* cntx + ) { func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* l3_nat_ukr = &l3_nat_ukrs[ ukr_id ]; @@ -260,8 +275,11 @@ void* bli_cntx_get_l3_nat_ukr_dt( num_t dt, } #endif -func_t* bli_cntx_get_l1f_ker( l1fkr_t ker_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l1f_ker + ( + l1fkr_t ker_id, + cntx_t* cntx + ) { func_t* l1f_kers = bli_cntx_l1f_kers_buf( cntx ); func_t* l1f_ker = &l1f_kers[ ker_id ]; @@ -283,8 +301,11 @@ void* bli_cntx_get_l1f_ker_dt( num_t dt, } #endif -func_t* bli_cntx_get_l1v_ker( l1vkr_t ker_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l1v_ker + ( + l1vkr_t ker_id, + cntx_t* cntx + ) { func_t* l1v_kers = bli_cntx_l1v_kers_buf( cntx ); func_t* l1v_ker = &l1v_kers[ ker_id ]; @@ -306,8 +327,11 @@ void* bli_cntx_get_l1v_ker_dt( num_t dt, } #endif -mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, - cntx_t* cntx ) +mbool_t* bli_cntx_get_l3_nat_ukr_prefs + ( + l3ukr_t ukr_id, + cntx_t* cntx + ) { mbool_t* l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* l3_nat_ukrs_pref = &l3_nat_ukrs_prefs[ ukr_id ]; @@ -316,12 +340,30 @@ mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, return l3_nat_ukrs_pref; } -func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ) +func_t* bli_cntx_get_packm_ker + ( + l1mkr_t ker_id, + cntx_t* cntx + ) { - func_t* packm_ukrs = bli_cntx_packm_ukrs( cntx ); + func_t* packm_kers = bli_cntx_packm_kers_buf( cntx ); + func_t* packm_ker = &packm_kers[ ker_id ]; // Return the address of the func_t that contains the packm ukernels. - return packm_ukrs; + return packm_ker; +} + +func_t* bli_cntx_get_unpackm_ker + ( + l1mkr_t ker_id, + cntx_t* cntx + ) +{ + func_t* unpackm_kers = bli_cntx_unpackm_kers_buf( cntx ); + func_t* unpackm_ker = &unpackm_kers[ ker_id ]; + + // Return the address of the func_t that contains the unpackm ukernels. 
+ return unpackm_ker; } #if 0 @@ -360,7 +402,11 @@ dim_t bli_cntx_get_num_threads( cntx_t* cntx ) bli_cntx_ir_way( cntx ); } -dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ) +dim_t bli_cntx_get_num_threads_in + ( + cntx_t* cntx, + cntl_t* cntl + ) { dim_t n_threads_in = 1; @@ -384,14 +430,6 @@ dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ) // ----------------------------------------------------------------------------- -#if 1 -// -// NOTE: This function is disabled because: -// - we currently do not have any need to set a context direclty with -// blksz_t objects -// - it may be broken; it needs to be synced up with the corresponding -// function in bli_gks.c. -// void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { /* Example prototypes: @@ -454,8 +492,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // Here, we query the variable argument list for: // - the bszid_t of the blocksize we're about to process, // - the address of the blksz_t object, and - // - the bszid_t of the multiple we need to associate with - // the blksz_t object. + // - the bszid_t of the multiple + // that we need to associate with the blksz_t object. bszid_t bs_id = va_arg( args, bszid_t ); blksz_t* blksz = va_arg( args, blksz_t* ); bszid_t bm_id = va_arg( args, bszid_t ); @@ -473,9 +511,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { // Here, we query the variable argument list for: // - the bszid_t of the blocksize we're about to process, - // - the address of the blksz_t object, and - // - the bszid_t of the multiple we need to associate with - // the blksz_t object. + // - the address of the blksz_t object, + // - the bszid_t of the multiple, and // - the scalars we wish to apply to the real blocksizes to // come up with the induced complex blocksizes (for default // and maximum blocksizes). @@ -536,6 +573,7 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // location within the context's blksz_t array. 
Do the same // for the blocksize multiple id. //cntx_blkszs[ bs_id ] = *blksz; + //bli_blksz_copy_smart( blksz, cntx_blksz ); bli_blksz_copy( blksz, cntx_blksz ); // Copy the blocksize multiple id into the context. @@ -624,14 +662,16 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bli_free_intl( dsclrs ); bli_free_intl( msclrs ); } -#endif // ----------------------------------------------------------------------------- -void bli_cntx_set_blksz( bszid_t bs_id, - blksz_t* blksz, - bszid_t mult_id, - cntx_t* cntx ) +void bli_cntx_set_blksz + ( + bszid_t bs_id, + blksz_t* blksz, + bszid_t mult_id, + cntx_t* cntx + ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); @@ -645,20 +685,111 @@ void bli_cntx_set_blksz( bszid_t bs_id, bmults[ bs_id ] = mult_id; } -void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, - func_t* func, - cntx_t* cntx ) -{ - func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); +// ----------------------------------------------------------------------------- - // Copy the function object into the specified location within - // the context's virtual level-3 ukernel array. - l3_vir_ukrs[ ukr_id ] = *func; +void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ) +{ + /* Example prototypes: + + void bli_cntx_set_l3_nat_ukrs + ( + dim_t n_ukrs, + l3ukr_t ukr0_id, num_t dt0, void* ukr0_fp, bool_t pref0, + l3ukr_t ukr1_id, num_t dt1, void* ukr1_fp, bool_t pref1, + l3ukr_t ukr2_id, num_t dt2, void* ukr2_fp, bool_t pref2, + ... + cntx_t* cntx + ); + */ + va_list args; + dim_t i; + + // Allocate some temporary local arrays. + l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ) ); + num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ) ); + void** ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void* ) ); + bool_t* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool_t ) ); + + // -- Begin variable argument section -- + + // Initialize variable argument environment. 
+ va_start( args, n_ukrs ); + + // Process n_ukrs tuples. + for ( i = 0; i < n_ukrs; ++i ) + { + // Here, we query the variable argument list for: + // - the l3ukr_t of the kernel we're about to process, + // - the datatype of the kernel, + // - the kernel function pointer, and + // - the kernel function storage preference + // that we need to store to the context. + const l3ukr_t ukr_id = va_arg( args, l3ukr_t ); + const num_t ukr_dt = va_arg( args, num_t ); + void* ukr_fp = va_arg( args, void* ); + const bool_t ukr_pref = va_arg( args, bool_t ); + + // Store the values in our temporary arrays. + ukr_ids[ i ] = ukr_id; + ukr_dts[ i ] = ukr_dt; + ukr_fps[ i ] = ukr_fp; + ukr_prefs[ i ] = ukr_pref; + } + + // The last argument should be the context pointer. + cntx_t* cntx = va_arg( args, cntx_t* ); + + // Shutdown variable argument environment and clean up stack. + va_end( args ); + + // -- End variable argument section -- + + // Query the context for the addresses of: + // - the l3 native ukernel func_t array + // - the l3 native ukernel preferences array + func_t* cntx_l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); + mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); + + // Now that we have the context address, we want to copy the values + // from the temporary buffers into the corresponding buffers in the + // context. + + // Process each blocksize id tuple provided. + for ( i = 0; i < n_ukrs; ++i ) + { + // Read the current blocksize id, blksz_t* pointer, blocksize + // multiple id, and blocksize scalar. + const l3ukr_t ukr_id = ukr_ids[ i ]; + const num_t ukr_dt = ukr_dts[ i ]; + void* ukr_fp = ukr_fps[ i ]; + const bool_t ukr_pref = ukr_prefs[ i ]; + + // Index into the func_t and mbool_t for the current kernel id + // being processed. + func_t* ukrs = &cntx_l3_nat_ukrs[ ukr_id ]; + mbool_t* prefs = &cntx_l3_nat_ukrs_prefs[ ukr_id ]; + + // Store the ukernel function pointer and preference values into + // the context. 
+ bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); + bli_mbool_set_dt( ukr_pref, ukr_dt, prefs ); + } + + // Free the temporary local arrays. + bli_free_intl( ukr_ids ); + bli_free_intl( ukr_dts ); + bli_free_intl( ukr_fps ); + bli_free_intl( ukr_prefs ); } -void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, - func_t* func, - cntx_t* cntx ) +// ----------------------------------------------------------------------------- + +void bli_cntx_set_l3_nat_ukr + ( + l3ukr_t ukr_id, + func_t* func, + cntx_t* cntx + ) { func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); @@ -667,9 +798,12 @@ void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, l3_nat_ukrs[ ukr_id ] = *func; } -void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, - mbool_t* prefs, - cntx_t* cntx ) +void bli_cntx_set_l3_nat_ukr_prefs + ( + l3ukr_t ukr_id, + mbool_t* prefs, + cntx_t* cntx + ) { mbool_t* l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); @@ -678,9 +812,26 @@ void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, l3_nat_ukrs_prefs[ ukr_id ] = *prefs; } -void bli_cntx_set_l1f_ker( l1fkr_t ker_id, - func_t* func, - cntx_t* cntx ) +void bli_cntx_set_l3_vir_ukr + ( + l3ukr_t ukr_id, + func_t* func, + cntx_t* cntx + ) +{ + func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); + + // Copy the function object into the specified location within + // the context's virtual level-3 ukernel array. 
+ l3_vir_ukrs[ ukr_id ] = *func; +} + +void bli_cntx_set_l1f_ker + ( + l1fkr_t ker_id, + func_t* func, + cntx_t* cntx + ) { func_t* l1f_kers = bli_cntx_l1f_kers_buf( cntx ); @@ -689,9 +840,12 @@ void bli_cntx_set_l1f_ker( l1fkr_t ker_id, l1f_kers[ ker_id ] = *func; } -void bli_cntx_set_l1v_ker( l1vkr_t ker_id, - func_t* func, - cntx_t* cntx ) +void bli_cntx_set_l1v_ker + ( + l1vkr_t ker_id, + func_t* func, + cntx_t* cntx + ) { func_t* l1v_kers = bli_cntx_l1v_kers_buf( cntx ); @@ -700,43 +854,154 @@ void bli_cntx_set_l1v_ker( l1vkr_t ker_id, l1v_kers[ ker_id ] = *func; } -void bli_cntx_set_packm_ukr( func_t* func, - cntx_t* cntx ) -{ - func_t* packm_ukrs = bli_cntx_packm_ukrs( cntx ); +// ----------------------------------------------------------------------------- - // Copy the function object into the context's packm ukernel object. - *packm_ukrs = *func; +void bli_cntx_set_packm_kers( dim_t n_kers, ... ) +{ + /* Example prototypes: + + void bli_cntx_set_packm_kers + ( + dim_t n_ukrs, + l1mkr_t ker0_id, num_t ker0_dt, void* ker0_fp, + l1mkr_t ker1_id, num_t ker1_dt, void* ker1_fp, + l1mkr_t ker2_id, num_t ker2_dt, void* ker2_fp, + ... + cntx_t* cntx + ); + */ + va_list args; + dim_t i; + + // Allocate some temporary local arrays. + l1mkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ) ); + num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ) ); + void** ker_fps = bli_malloc_intl( n_kers * sizeof( void* ) ); + + // -- Begin variable argument section -- + + // Initialize variable argument environment. + va_start( args, n_kers ); + + // Process n_kers tuples. + for ( i = 0; i < n_kers; ++i ) + { + // Here, we query the variable argument list for: + // - the l1mkr_t of the kernel we're about to process, + // - the datatype of the kernel, and + // - the kernel function pointer + // that we need to store to the context. 
+ const l1mkr_t ker_id = va_arg( args, l1mkr_t ); + const num_t ker_dt = va_arg( args, num_t ); + void* ker_fp = va_arg( args, void* ); + + // Store the values in our temporary arrays. + ker_ids[ i ] = ker_id; + ker_dts[ i ] = ker_dt; + ker_fps[ i ] = ker_fp; + } + + // The last argument should be the context pointer. + cntx_t* cntx = va_arg( args, cntx_t* ); + + // Shutdown variable argument environment and clean up stack. + va_end( args ); + + // -- End variable argument section -- + + // Query the context for the address of: + // - the packm kernels func_t array + func_t* cntx_packm_kers = bli_cntx_packm_kers_buf( cntx ); + + // Now that we have the context address, we want to copy the values + // from the temporary buffers into the corresponding buffers in the + // context. + + // Process each blocksize id tuple provided. + for ( i = 0; i < n_kers; ++i ) + { + // Read the current blocksize id, blksz_t* pointer, blocksize + // multiple id, and blocksize scalar. + const l1mkr_t ker_id = ker_ids[ i ]; + const num_t ker_dt = ker_dts[ i ]; + void* ker_fp = ker_fps[ i ]; + + // Index into the func_t and mbool_t for the current kernel id + // being processed. + func_t* kers = &cntx_packm_kers[ ker_id ]; + + // Store the ukernel function pointer and preference values into + // the context. + bli_func_set_dt( ker_fp, ker_dt, kers ); + } + + // Free the temporary local arrays. + bli_free_intl( ker_ids ); + bli_free_intl( ker_dts ); + bli_free_intl( ker_fps ); } -void bli_cntx_set_ind_method( ind_t method, - cntx_t* cntx ) +// ----------------------------------------------------------------------------- + +void bli_cntx_set_packm_ker + ( + l1mkr_t ker_id, + func_t* func, + cntx_t* cntx + ) +{ + func_t* packm_kers = bli_cntx_packm_kers_buf( cntx ); + + // Copy the function object into the specified location within + // the context's packm kernel array. 
+ packm_kers[ ker_id ] = *func; +} + +// ----------------------------------------------------------------------------- + +void bli_cntx_set_ind_method + ( + ind_t method, + cntx_t* cntx + ) { bli_cntx_set_method( method, cntx ); } -void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a, - pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_ab_blockpanel + ( + pack_t schema_a, + pack_t schema_b, + cntx_t* cntx + ) { bli_cntx_set_schema_a_block( schema_a, cntx ); bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void bli_cntx_set_pack_schema_a_block( pack_t schema_a, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_a_block + ( + pack_t schema_a, + cntx_t* cntx + ) { bli_cntx_set_schema_a_block( schema_a, cntx ); } -void bli_cntx_set_pack_schema_b_panel( pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_b_panel + ( + pack_t schema_b, + cntx_t* cntx + ) { bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void bli_cntx_set_pack_schema_c_panel( pack_t schema_c, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_c_panel + ( + pack_t schema_c, + cntx_t* cntx + ) { bli_cntx_set_schema_c_panel( schema_c, cntx ); } @@ -749,17 +1014,24 @@ void bli_cntx_set_ukr_anti_pref( bool_t anti_pref, } #endif -void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, - dim_t m, dim_t n, dim_t k ) +void bli_cntx_set_thrloop_from_env + ( + opid_t l3_op, + side_t side, + cntx_t* cntx, + dim_t m, + dim_t n, + dim_t k + ) { dim_t jc, pc, ic, jr, ir; #ifdef BLIS_ENABLE_MULTITHREADING - int nthread = bli_env_read_nway( "BLIS_NUM_THREADS", -1 ); + int nthread = bli_thread_get_env( "BLIS_NUM_THREADS", -1 ); if ( nthread == -1 ) - nthread = bli_env_read_nway( "OMP_NUM_THREADS", -1 ); + nthread = bli_thread_get_env( "OMP_NUM_THREADS", -1 ); if ( nthread < 1 ) nthread = 1; @@ -786,10 +1058,10 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, pc = 1; - dim_t jc_env = bli_env_read_nway( "BLIS_JC_NT", -1 ); - dim_t ic_env = 
bli_env_read_nway( "BLIS_IC_NT", -1 ); - dim_t jr_env = bli_env_read_nway( "BLIS_JR_NT", -1 ); - dim_t ir_env = bli_env_read_nway( "BLIS_IR_NT", -1 ); + dim_t jc_env = bli_thread_get_env( "BLIS_JC_NT", -1 ); + dim_t ic_env = bli_thread_get_env( "BLIS_IC_NT", -1 ); + dim_t jr_env = bli_thread_get_env( "BLIS_JR_NT", -1 ); + dim_t ir_env = bli_thread_get_env( "BLIS_IR_NT", -1 ); if (jc_env != -1 || ic_env != -1 || jr_env != -1 || ir_env != -1) { @@ -882,9 +1154,12 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, // ----------------------------------------------------------------------------- -bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt + ( + num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx + ) { mbool_t* ukrs_prefs = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); bool_t ukr_prefs = bli_mbool_get_dt( dt, ukrs_prefs ); @@ -894,9 +1169,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, return ukr_prefs == TRUE; } -bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt + ( + num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx + ) { mbool_t* ukrs_prefs = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); bool_t ukr_prefs = bli_mbool_get_dt( dt, ukrs_prefs ); @@ -906,16 +1184,22 @@ bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, return ukr_prefs == FALSE; } -bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_prefers_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { return !bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx ); } -bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { const num_t dt = bli_obj_datatype( *obj ); const bool_t 
ukr_prefers_rows @@ -930,9 +1214,12 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, return r_val; } -bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); @@ -942,9 +1229,12 @@ bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, return r_val; } -bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx ); @@ -956,9 +1246,12 @@ bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, // ----------------------------------------------------------------------------- -bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_prefers_rows_dt + ( + num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx + ) { // Reference the ukr storage preferences of the corresponding real // micro-kernel for induced methods. @@ -968,9 +1261,12 @@ bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } -bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_prefers_cols_dt + ( + num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx + ) { // Reference the ukr storage preferences of the corresponding real // micro-kernel for induced methods. 
@@ -980,16 +1276,22 @@ bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt, return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } -bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_prefers_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { return !bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); } -bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_dislikes_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { num_t dt = bli_obj_datatype( *obj ); @@ -1005,9 +1307,12 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, return r_val; } -bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_eff_prefers_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx ); @@ -1017,9 +1322,12 @@ bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, return r_val; } -bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); @@ -1108,23 +1416,6 @@ void bli_cntx_print( cntx_t* cntx ) ); } - { - func_t* ukr = bli_cntx_get_packm_ukr( cntx ); - - printf( "packm ker : %16p %16p %16p %16p\n", - bli_func_get_dt( BLIS_FLOAT, ukr ), - bli_func_get_dt( BLIS_DOUBLE, ukr ), - bli_func_get_dt( BLIS_SCOMPLEX, ukr ), - bli_func_get_dt( BLIS_DCOMPLEX, ukr ) - ); - } - - { - ind_t family = bli_cntx_get_family( cntx ); - - printf( "oper family : %lu\n", ( guint_t )family ); - } - { ind_t method = bli_cntx_get_ind_method( cntx ); diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index a76cdd329..3167d1bf4 100644 --- a/frame/base/bli_cntx.h +++ 
b/frame/base/bli_cntx.h @@ -36,6 +36,9 @@ #ifndef BLIS_CNTX_H #define BLIS_CNTX_H +//#include "bli_cntx_init.h" + + // Context object type (defined in bli_type_defs.h) /* @@ -51,9 +54,9 @@ typedef struct cntx_s func_t* l1f_kers; func_t* l1v_kers; - func_t packm_ukrs; + func_t* packm_kers; + func_t* unpackm_kers; - opid_t family; ind_t method; pack_t schema_a; pack_t schema_b; @@ -99,17 +102,13 @@ typedef struct cntx_s \ ( (cntx)->l1v_kers ) -#define bli_cntx_packm_ukrs_buf( cntx ) \ +#define bli_cntx_packm_kers_buf( cntx ) \ \ - (&((cntx)->packm_ukrs) ) + ( (cntx)->packm_kers ) -#define bli_cntx_packm_ukrs( cntx ) \ +#define bli_cntx_unpackm_kers_buf( cntx ) \ \ - (&((cntx)->packm_ukrs) ) - -#define bli_cntx_family( cntx ) \ -\ - ( (cntx)->family ) + ( (cntx)->unpackm_kers ) #define bli_cntx_method( cntx ) \ \ @@ -202,16 +201,6 @@ typedef struct cntx_s (cntx_p)->l1v_kers = _l1v_kers; \ } -#define bli_cntx_set_packm_ukrs( _packm_ukrs, cntx_p ) \ -{ \ - (cntx_p)->packm_ukrs = _packm_ukrs; \ -} - -#define bli_cntx_set_family( _family, cntx_p ) \ -{ \ - (cntx_p)->family = _family; \ -} - #define bli_cntx_set_method( _method, cntx_p ) \ { \ (cntx_p)->method = _method; \ @@ -285,7 +274,8 @@ typedef struct cntx_s ( \ (dt), \ &(( \ - bli_cntx_method( (cntx) ) != BLIS_NAT \ + bli_cntx_method( (cntx) ) != BLIS_NAT && \ + bli_is_complex( dt ) \ ? 
bli_cntx_l3_vir_ukrs_buf( (cntx) ) \ : bli_cntx_l3_nat_ukrs_buf( (cntx) ) \ )[ ukr_id ]) \ @@ -326,10 +316,6 @@ typedef struct cntx_s (dt), (&(bli_cntx_l3_nat_ukrs_prefs_buf( (cntx) ))[ ukr_id ]) \ ) -#define bli_cntx_get_family( cntx ) \ -\ - bli_cntx_family( cntx ) - #define bli_cntx_get_ind_method( cntx ) \ \ bli_cntx_method( cntx ) @@ -357,9 +343,9 @@ typedef struct cntx_s // create/free -//void bli_cntx_obj_create( cntx_t* cntx ); -//void bli_cntx_obj_free( cntx_t* cntx ); -void bli_cntx_obj_clear( cntx_t* cntx ); +//void bli_cntx_create( cntx_t* cntx ); +//void bli_cntx_free( cntx_t* cntx ); +void bli_cntx_clear( cntx_t* cntx ); void bli_cntx_init( cntx_t* cntx ); // get functions @@ -380,7 +366,7 @@ func_t* bli_cntx_get_l1f_ker( l1fkr_t ker_id, cntx_t* cntx ); func_t* bli_cntx_get_l1v_ker( l1vkr_t ker_id, cntx_t* cntx ); -func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); +//func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); //dim_t bli_cntx_get_blksz_def_dt( num_t dt, // bszid_t bs_id, @@ -409,6 +395,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); //void* bli_cntx_get_l1v_ker_dt( num_t dt, // l1vkr_t ker_id, // cntx_t* cntx ); +func_t* bli_cntx_get_packm_ker( l1mkr_t ker_id, + cntx_t* cntx ); +func_t* bli_cntx_get_unpackm_ker( l1mkr_t ker_id, + cntx_t* cntx ); //ind_t bli_cntx_get_ind_method( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx ); @@ -425,18 +415,34 @@ void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ); -void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, - func_t* func, - cntx_t* cntx ); + +void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... 
); + void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ); +void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, + mbool_t* prefs, + cntx_t* cntx ); + +void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, + func_t* func, + cntx_t* cntx ); + void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ); + void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ); + +void bli_cntx_set_packm_kers( dim_t n_kers, ... ); + +void bli_cntx_set_packm_ker( l1mkr_t ker_id, + func_t* func, + cntx_t* cntx ); + void bli_cntx_set_packm_ukr( func_t* func, cntx_t* cntx ); void bli_cntx_set_ind_method( ind_t method, @@ -507,11 +513,11 @@ void bli_cntx_print( cntx_t* cntx ); // Preprocess out these calls entirely, since they are currently just empty // functions that do nothing. #if 0 - #define bli_cntx_obj_create( cntx ) { bli_cntx_obj_clear( cntx ); } - #define bli_cntx_obj_free( cntx ) { bli_cntx_obj_clear( cntx ); } + #define bli_cntx_create( cntx ) { bli_cntx_clear( cntx ); } + #define bli_cntx_free( cntx ) { bli_cntx_clear( cntx ); } #else - #define bli_cntx_obj_create( cntx ) { ; } - #define bli_cntx_obj_free( cntx ) { ; } + #define bli_cntx_create( cntx ) { ; } + #define bli_cntx_free( cntx ) { ; } #endif // These macros initialize/finalize a local context if the given context diff --git a/frame/base/bli_func.c b/frame/base/bli_func.c index 75be26085..d098b4c9d 100644 --- a/frame/base/bli_func.c +++ b/frame/base/bli_func.c @@ -35,37 +35,57 @@ #include "blis.h" -func_t* bli_func_obj_create( void* ptr_s, - void* ptr_d, - void* ptr_c, - void* ptr_z ) +func_t* bli_func_create + ( + void* ptr_s, + void* ptr_d, + void* ptr_c, + void* ptr_z + ) { func_t* f; f = ( func_t* ) bli_malloc_intl( sizeof(func_t) ); - bli_func_obj_init( f, - ptr_s, - ptr_d, - ptr_c, - ptr_z ); + bli_func_init + ( + f, + ptr_s, + ptr_d, + ptr_c, + ptr_z + ); return f; } -void bli_func_obj_init( func_t* f, - void* ptr_s, - void* ptr_d, - void* ptr_c, - void* ptr_z ) +void 
bli_func_init + ( + func_t* f, + void* ptr_s, + void* ptr_d, + void* ptr_c, + void* ptr_z + ) { - f->ptr[BLIS_BITVAL_FLOAT_TYPE] = ptr_s; - f->ptr[BLIS_BITVAL_DOUBLE_TYPE] = ptr_d; - f->ptr[BLIS_BITVAL_SCOMPLEX_TYPE] = ptr_c; - f->ptr[BLIS_BITVAL_DCOMPLEX_TYPE] = ptr_z; + bli_func_set_dt( ptr_s, BLIS_FLOAT, f ); + bli_func_set_dt( ptr_d, BLIS_DOUBLE, f ); + bli_func_set_dt( ptr_c, BLIS_SCOMPLEX, f ); + bli_func_set_dt( ptr_z, BLIS_DCOMPLEX, f ); } -void bli_func_obj_free( func_t* f ) +void bli_func_init_null + ( + func_t* f + ) +{ + bli_func_set_dt( NULL, BLIS_FLOAT, f ); + bli_func_set_dt( NULL, BLIS_DOUBLE, f ); + bli_func_set_dt( NULL, BLIS_SCOMPLEX, f ); + bli_func_set_dt( NULL, BLIS_DCOMPLEX, f ); +} + +void bli_func_free( func_t* f ) { bli_free_intl( f ); } @@ -75,7 +95,7 @@ void bli_func_obj_free( func_t* f ) bool_t bli_func_is_null_dt( num_t dt, func_t* f ) { - return ( f->ptr[ dt ] == NULL ); + return ( bli_func_get_dt( dt, f ) == NULL ); } bool_t bli_func_is_null( func_t* f ) @@ -87,7 +107,7 @@ bool_t bli_func_is_null( func_t* f ) // return FALSE. Otherwise, if they are all null, return TRUE. 
for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) { - if ( f->ptr[ dt ] != NULL ) + if ( bli_func_get_dt( dt, f ) != NULL ) { r_val = FALSE; break; diff --git a/frame/base/bli_func.h b/frame/base/bli_func.h index 56b221be9..2bfc2ad20 100644 --- a/frame/base/bli_func.h +++ b/frame/base/bli_func.h @@ -49,18 +49,29 @@ // ----------------------------------------------------------------------------- -func_t* bli_func_obj_create( void* ptr_s, - void* ptr_d, - void* ptr_c, - void* ptr_z ); +func_t* bli_func_create + ( + void* ptr_s, + void* ptr_d, + void* ptr_c, + void* ptr_z + ); -void bli_func_obj_init( func_t* f, - void* ptr_s, - void* ptr_d, - void* ptr_c, - void* ptr_z ); +void bli_func_init + ( + func_t* f, + void* ptr_s, + void* ptr_d, + void* ptr_c, + void* ptr_z + ); -void bli_func_obj_free( func_t* f ); +void bli_func_init_null + ( + func_t* f + ); + +void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 2ada1556e..4d819babe 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -74,12 +74,6 @@ static blksz_t bli_gks_blkszs[BLIS_NUM_BLKSZS] = /* df */ { { BLIS_DEFAULT_DF_S, BLIS_DEFAULT_DF_C, BLIS_DEFAULT_DF_D, BLIS_DEFAULT_DF_Z, }, { BLIS_DEFAULT_DF_S, BLIS_DEFAULT_DF_C, BLIS_DEFAULT_DF_D, BLIS_DEFAULT_DF_Z, } }, -/* xf */ { { BLIS_DEFAULT_XF_S, BLIS_DEFAULT_XF_C, BLIS_DEFAULT_XF_D, BLIS_DEFAULT_XF_Z, }, - { BLIS_DEFAULT_XF_S, BLIS_DEFAULT_XF_C, BLIS_DEFAULT_XF_D, BLIS_DEFAULT_XF_Z, } - }, -/* vf */ { { BLIS_DEFAULT_VF_S, BLIS_DEFAULT_VF_C, BLIS_DEFAULT_VF_D, BLIS_DEFAULT_VF_Z, }, - { BLIS_DEFAULT_VF_S, BLIS_DEFAULT_VF_C, BLIS_DEFAULT_VF_D, BLIS_DEFAULT_VF_Z, } - }, }; // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_mbool.c b/frame/base/bli_mbool.c index 46ba531bc..6906622d1 100644 --- a/frame/base/bli_mbool.c +++ b/frame/base/bli_mbool.c @@ -35,29 +35,38 @@ #include 
"blis.h" -mbool_t* bli_mbool_obj_create( bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z ) +mbool_t* bli_mbool_create + ( + bool_t b_s, + bool_t b_d, + bool_t b_c, + bool_t b_z + ) { mbool_t* b; b = ( mbool_t* ) bli_malloc_intl( sizeof(mbool_t) ); - bli_mbool_obj_init( b, - b_s, - b_d, - b_c, - b_z ); + bli_mbool_init + ( + b, + b_s, + b_d, + b_c, + b_z + ); return b; } -void bli_mbool_obj_init( mbool_t* b, - bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z ) +void bli_mbool_init + ( + mbool_t* b, + bool_t b_s, + bool_t b_d, + bool_t b_c, + bool_t b_z + ) { bli_mbool_set_dt( b_s, BLIS_FLOAT, b ); bli_mbool_set_dt( b_d, BLIS_DOUBLE, b ); @@ -65,7 +74,7 @@ void bli_mbool_obj_init( mbool_t* b, bli_mbool_set_dt( b_z, BLIS_DCOMPLEX, b ); } -void bli_mbool_obj_free( mbool_t* b ) +void bli_mbool_free( mbool_t* b ) { bli_free_intl( b ); } diff --git a/frame/base/bli_mbool.h b/frame/base/bli_mbool.h index 5d5f47828..181543413 100644 --- a/frame/base/bli_mbool.h +++ b/frame/base/bli_mbool.h @@ -49,16 +49,22 @@ // ----------------------------------------------------------------------------- -mbool_t* bli_mbool_obj_create( bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z ); +mbool_t* bli_mbool_create + ( + bool_t b_s, + bool_t b_d, + bool_t b_c, + bool_t b_z + ); -void bli_mbool_obj_init( mbool_t* b, - bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z ); +void bli_mbool_init + ( + mbool_t* b, + bool_t b_s, + bool_t b_d, + bool_t b_c, + bool_t b_z + ); -void bli_mbool_obj_free( mbool_t* b ); +void bli_mbool_free( mbool_t* b ); diff --git a/frame/base/bli_membrk.c b/frame/base/bli_membrk.c index 33a998de1..210c04be1 100644 --- a/frame/base/bli_membrk.c +++ b/frame/base/bli_membrk.c @@ -44,6 +44,7 @@ void bli_membrk_init bli_mutex_init( bli_membrk_mutex( membrk ) ); bli_membrk_init_pools( cntx, membrk ); bli_membrk_set_malloc_fp( bli_malloc_pool, membrk ); + bli_membrk_set_free_fp( bli_free_pool, membrk ); } void bli_membrk_finalize diff --git 
a/frame/compat/f2c/bla_lsame.c b/frame/compat/f2c/bla_lsame.c index 04f8caad0..7b109ab43 100644 --- a/frame/compat/f2c/bla_lsame.c +++ b/frame/compat/f2c/bla_lsame.c @@ -41,7 +41,12 @@ -lf2c -lm (in that order) */ -bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, ftnlen ca_len, ftnlen cb_len) + +#ifdef LAPACK_ILP64 +long PASTEF770(lsame)(char *ca, char *cb, long ca_len, long cb_len) +#else +int PASTEF770(lsame)(char *ca, char *cb, int ca_len, int cb_len) +#endif { /* System generated locals */ bla_logical ret_val; @@ -115,11 +120,11 @@ bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, f /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */ /* upper case 'Z'. */ - if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta + if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta >= 162 && inta <= 169)) { inta += 64; } - if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb + if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb >= 162 && intb <= 169)) { intb += 64; } diff --git a/frame/compat/f2c/bla_lsame.h b/frame/compat/f2c/bla_lsame.h index 7e2f92389..e8f63f488 100644 --- a/frame/compat/f2c/bla_lsame.h +++ b/frame/compat/f2c/bla_lsame.h @@ -34,6 +34,10 @@ #ifdef BLIS_ENABLE_BLAS2BLIS -bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, ftnlen ca_len, ftnlen cb_len); +#ifdef LAPACK_ILP64 +long PASTEF770(lsame)(char *ca, char *cb, long ca_len, long cb_len); +#else +int PASTEF770(lsame)(char *ca, char *cb, int ca_len, int cb_len); +#endif #endif diff --git a/frame/include/bli_system.h b/frame/include/bli_system.h index 05139136b..5f54605d8 100644 --- a/frame/include/bli_system.h +++ b/frame/include/bli_system.h @@ -41,6 +41,7 @@ #include #include #include +#include // Determine if we are on a 64-bit or 32-bit architecture #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ @@ 
-66,6 +67,8 @@ #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) #define BLIS_OS_BSD 1 +#elif defined(EMSCRIPTEN) +#define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 1a120d5da..517a17b13 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -438,7 +438,7 @@ typedef enum BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, - BLIS_DT_HI = BLIS_DCOMPLEX, + BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum @@ -482,7 +482,7 @@ typedef enum BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R, + BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start @@ -511,7 +511,7 @@ typedef enum BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, - BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE, + BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; @@ -590,7 +590,7 @@ typedef enum BLIS_4M1B, BLIS_4M1A, BLIS_1M, - BLIS_NAT, + BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) @@ -613,7 +613,7 @@ typedef enum BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, - BLIS_XPBYV_KER, + BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 @@ -625,19 +625,93 @@ typedef enum BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, - BLIS_DOTXAXPYF_KER, + BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 +typedef enum +{ + BLIS_PACKM_0XK_KER = 0, + BLIS_PACKM_1XK_KER = 1, + BLIS_PACKM_2XK_KER = 2, + 
BLIS_PACKM_3XK_KER = 3, + BLIS_PACKM_4XK_KER = 4, + BLIS_PACKM_5XK_KER = 5, + BLIS_PACKM_6XK_KER = 6, + BLIS_PACKM_7XK_KER = 7, + BLIS_PACKM_8XK_KER = 8, + BLIS_PACKM_9XK_KER = 9, + BLIS_PACKM_10XK_KER = 10, + BLIS_PACKM_11XK_KER = 11, + BLIS_PACKM_12XK_KER = 12, + BLIS_PACKM_13XK_KER = 13, + BLIS_PACKM_14XK_KER = 14, + BLIS_PACKM_15XK_KER = 15, + BLIS_PACKM_16XK_KER = 16, + BLIS_PACKM_17XK_KER = 17, + BLIS_PACKM_18XK_KER = 18, + BLIS_PACKM_19XK_KER = 19, + BLIS_PACKM_20XK_KER = 20, + BLIS_PACKM_21XK_KER = 21, + BLIS_PACKM_22XK_KER = 22, + BLIS_PACKM_23XK_KER = 23, + BLIS_PACKM_24XK_KER = 24, + BLIS_PACKM_25XK_KER = 25, + BLIS_PACKM_26XK_KER = 26, + BLIS_PACKM_27XK_KER = 27, + BLIS_PACKM_28XK_KER = 28, + BLIS_PACKM_29XK_KER = 29, + BLIS_PACKM_30XK_KER = 30, + BLIS_PACKM_31XK_KER = 31, + + BLIS_UNPACKM_0XK_KER = 0, + BLIS_UNPACKM_1XK_KER = 1, + BLIS_UNPACKM_2XK_KER = 2, + BLIS_UNPACKM_3XK_KER = 3, + BLIS_UNPACKM_4XK_KER = 4, + BLIS_UNPACKM_5XK_KER = 5, + BLIS_UNPACKM_6XK_KER = 6, + BLIS_UNPACKM_7XK_KER = 7, + BLIS_UNPACKM_8XK_KER = 8, + BLIS_UNPACKM_9XK_KER = 9, + BLIS_UNPACKM_10XK_KER = 10, + BLIS_UNPACKM_11XK_KER = 11, + BLIS_UNPACKM_12XK_KER = 12, + BLIS_UNPACKM_13XK_KER = 13, + BLIS_UNPACKM_14XK_KER = 14, + BLIS_UNPACKM_15XK_KER = 15, + BLIS_UNPACKM_16XK_KER = 16, + BLIS_UNPACKM_17XK_KER = 17, + BLIS_UNPACKM_18XK_KER = 18, + BLIS_UNPACKM_19XK_KER = 19, + BLIS_UNPACKM_20XK_KER = 20, + BLIS_UNPACKM_21XK_KER = 21, + BLIS_UNPACKM_22XK_KER = 22, + BLIS_UNPACKM_23XK_KER = 23, + BLIS_UNPACKM_24XK_KER = 24, + BLIS_UNPACKM_25XK_KER = 25, + BLIS_UNPACKM_26XK_KER = 26, + BLIS_UNPACKM_27XK_KER = 27, + BLIS_UNPACKM_28XK_KER = 28, + BLIS_UNPACKM_29XK_KER = 29, + BLIS_UNPACKM_30XK_KER = 30, + BLIS_UNPACKM_31XK_KER = 31 + +} l1mkr_t; + +#define BLIS_NUM_PACKM_KERS 32 +#define BLIS_NUM_UNPACKM_KERS 32 + + typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, - BLIS_TRSM_U_UKR, + BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 @@ 
-648,7 +722,7 @@ typedef enum BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, - BLIS_NOTAPPLIC_UKERNEL, + BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 @@ -662,7 +736,7 @@ typedef enum BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, - BLIS_PR_IDX, + BLIS_PR_IDX } thridx_t; #endif @@ -683,7 +757,7 @@ typedef enum // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in -// bli_ind_query.c to index into arrays. +// bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_HEMM, @@ -696,7 +770,7 @@ typedef enum BLIS_TRMM, BLIS_TRSM, - BLIS_NOID, + BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 10 @@ -714,16 +788,14 @@ typedef enum BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension - BLIS_1F, // level-1f global fusing factor BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor - BLIS_VF, // level-1v vector fusing factor - BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable. + BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. } bszid_t; -#define BLIS_NUM_BLKSZS 13 +#define BLIS_NUM_BLKSZS 11 // @@ -784,6 +856,7 @@ typedef struct mem_s struct cntl_s { // Basic fields (usually required). 
+ opid_t family; bszid_t bszid; void* var_func; struct cntl_s* sub_node; @@ -971,9 +1044,9 @@ typedef struct cntx_s func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; - func_t packm_ukrs; + func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; + func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; - opid_t family; ind_t method; pack_t schema_a_block; pack_t schema_b_panel; @@ -992,7 +1065,7 @@ typedef struct cntx_s typedef enum { BLIS_NO_ERROR_CHECKING = 0, - BLIS_FULL_ERROR_CHECKING, + BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum diff --git a/frame/ind/cntx/bli_gemmind_cntx.c b/frame/ind/cntx/bli_gemmind_cntx.c index 5b7a70c3c..03a4d4d91 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.c +++ b/frame/ind/cntx/bli_gemmind_cntx.c @@ -122,7 +122,7 @@ void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3M1; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -170,7 +170,7 @@ void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3M2; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -218,7 +218,7 @@ void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3M3; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -279,7 +279,7 @@ void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3MH; // Clear the context fields. 
- bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -343,7 +343,7 @@ void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_4M1A; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -391,7 +391,7 @@ void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_4M1B; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -439,7 +439,7 @@ void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_4MH; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -524,7 +524,7 @@ void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx ) const ind_t method = BLIS_1M; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. diff --git a/frame/ind/cntx/bli_trsmind_cntx.c b/frame/ind/cntx/bli_trsmind_cntx.c index 96f9add60..d3127b81f 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.c +++ b/frame/ind/cntx/bli_trsmind_cntx.c @@ -41,7 +41,7 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3M1; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. 
@@ -89,7 +89,7 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_4M1A; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -137,7 +137,7 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_1M; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index f45827efd..a06f49523 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -36,19 +36,82 @@ void* bli_thrcomm_bcast ( - thrcomm_t* communicator, + thrcomm_t* comm, dim_t id, void* to_send ) { - if ( communicator == NULL || communicator->n_threads == 1 ) return to_send; + if ( comm == NULL || comm->n_threads == 1 ) return to_send; - if ( id == 0 ) communicator->sent_object = to_send; + if ( id == 0 ) comm->sent_object = to_send; - bli_thrcomm_barrier( communicator, id ); - void* object = communicator->sent_object; - bli_thrcomm_barrier( communicator, id ); + bli_thrcomm_barrier( comm, id ); + void* object = comm->sent_object; + bli_thrcomm_barrier( comm, id ); return object; } +// Use __sync_* builtins (assumed available) if __atomic_* ones are not present. 
+#ifndef __ATOMIC_RELAXED + +#define __ATOMIC_RELAXED +#define __ATOMIC_ACQUIRE +#define __ATOMIC_RELEASE +#define __ATOMIC_ACQ_REL + +#define __atomic_load_n(ptr, constraint) \ + __sync_fetch_and_add(ptr, 0) +#define __atomic_add_fetch(ptr, value, constraint) \ + __sync_add_and_fetch(ptr, value) +#define __atomic_fetch_add(ptr, value, constraint) \ + __sync_fetch_and_add(ptr, value) +#define __atomic_fetch_xor(ptr, value, constraint) \ + __sync_fetch_and_xor(ptr, value) + +#endif + +void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id ) +{ + // Return early if the comm is NULL or if there is only one + // thread participating. + if ( comm == NULL || comm->n_threads == 1 ) return; + + // Read the "sense" variable. This variable is akin to a unique ID for + // the current barrier. The first n-1 threads will spin on this variable + // until it changes. The sense variable gets incremented by the last + // thread to enter the barrier, just before it exits. But it turns out + // that you don't need many unique IDs before you can wrap around. In + // fact, if everything else is working, a binary variable is sufficient, + // which is what we do here (i.e., 0 is incremented to 1, which is then + // decremented back to 0, and so forth). + bool_t orig_sense = __atomic_load_n( &comm->barrier_sense, __ATOMIC_RELAXED ); + + // Register ourselves (the current thread) as having arrived by + // incrementing the barrier_threads_arrived variable. We must perform + // this increment (and a subsequent read) atomically. + dim_t my_threads_arrived = + __atomic_add_fetch( &comm->barrier_threads_arrived, 1, __ATOMIC_ACQ_REL ); + + // If the current thread was the last thread to have arrived, then + // it will take actions that effectively end and reset the barrier. + if ( my_threads_arrived == comm->n_threads ) + { + // Reset the variable tracking the number of threads that have arrived + // to zero (which returns the barrier to the "empty" state).
Then + // atomically toggle the barrier sense variable. This will signal to + // the other threads (which are spinning in the branch below) that it + // is now safe to exit the barrier. + comm->barrier_threads_arrived = 0; + __atomic_fetch_xor( &comm->barrier_sense, 1, __ATOMIC_RELEASE ); + } + else + { + // If the current thread is NOT the last thread to have arrived, then + // it spins on the sense variable until that sense variable changes at + // which time these threads will exit the barrier. + while ( __atomic_load_n( &comm->barrier_sense, __ATOMIC_ACQUIRE ) == orig_sense ) + ; // Empty loop body. + } +} + diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index 593f8d7fa..59fbc6576 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -49,11 +49,13 @@ // Thread communicator prototypes. thrcomm_t* bli_thrcomm_create( dim_t n_threads ); -void bli_thrcomm_free( thrcomm_t* communicator ); -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads ); -void bli_thrcomm_cleanup( thrcomm_t* communicator ); -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t thread_id ); -void* bli_thrcomm_bcast( thrcomm_t* communicator, dim_t inside_id, void* to_send ); +void bli_thrcomm_free( thrcomm_t* comm ); +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads ); +void bli_thrcomm_cleanup( thrcomm_t* comm ); +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t thread_id ); +void* bli_thrcomm_bcast( thrcomm_t* comm, dim_t inside_id, void* to_send ); + +void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id ); #endif diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 0882d1659..131f70973 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -44,63 +44,66 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads ) return comm; } -void bli_thrcomm_free( thrcomm_t* communicator ) +void bli_thrcomm_free( thrcomm_t* comm ) { - if ( communicator == NULL ) return; -
bli_thrcomm_cleanup( communicator ); - bli_free_intl( communicator ); + if ( comm == NULL ) return; + bli_thrcomm_cleanup( comm ); + bli_free_intl( comm ); } #ifndef BLIS_TREE_BARRIER -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) { - if ( communicator == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - communicator->barrier_sense = 0; - communicator->barrier_threads_arrived = 0; + if ( comm == NULL ) return; + comm->sent_object = NULL; + comm->n_threads = n_threads; + comm->barrier_sense = 0; + comm->barrier_threads_arrived = 0; } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { - if ( communicator == NULL ) return; + if ( comm == NULL ) return; } //'Normal' barrier for openmp //barrier routine taken from art of multicore programming -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) { - if( communicator == NULL || communicator->n_threads == 1 ) +#if 0 + if ( comm == NULL || comm->n_threads == 1 ) return; - bool_t my_sense = communicator->barrier_sense; + bool_t my_sense = comm->barrier_sense; dim_t my_threads_arrived; _Pragma( "omp atomic capture" ) - my_threads_arrived = ++(communicator->barrier_threads_arrived); + my_threads_arrived = ++(comm->barrier_threads_arrived); - if ( my_threads_arrived == communicator->n_threads ) + if ( my_threads_arrived == comm->n_threads ) { - communicator->barrier_threads_arrived = 0; - communicator->barrier_sense = !communicator->barrier_sense; + comm->barrier_threads_arrived = 0; + comm->barrier_sense = !comm->barrier_sense; } else { - volatile bool_t* listener = &communicator->barrier_sense; + volatile bool_t* listener = &comm->barrier_sense; while ( *listener == my_sense ) {} } +#endif + bli_thrcomm_barrier_atomic( comm, t_id ); } #else -void bli_thrcomm_init( thrcomm_t* communicator, 
dim_t n_threads) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) { - if ( communicator == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - communicator->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads ); - bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, communicator->barriers, 0 ); + if ( comm == NULL ) return; + comm->sent_object = NULL; + comm->n_threads = n_threads; + comm->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads ); + bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, comm->barriers, 0 ); } //Tree barrier used for Intel Xeon Phi @@ -145,14 +148,14 @@ barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_ return me; } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - for ( dim_t i = 0; i < communicator->n_threads; i++ ) + if ( comm == NULL ) return; + for ( dim_t i = 0; i < comm->n_threads; i++ ) { - bli_thrcomm_tree_barrier_free( communicator->barriers[i] ); + bli_thrcomm_tree_barrier_free( comm->barriers[i] ); } - bli_free_intl( communicator->barriers ); + bli_free_intl( comm->barriers ); } void bli_thrcomm_tree_barrier_free( barrier_t* barrier ) @@ -204,6 +207,7 @@ void bli_thrcomm_tree_barrier( barrier_t* barack ) void bli_l3_thread_decorator ( l3int_t func, + opid_t family, obj_t* alpha, obj_t* a, obj_t* b, @@ -231,7 +235,7 @@ void bli_l3_thread_decorator thrinfo_t* thread; // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -249,7 +253,7 @@ void bli_l3_thread_decorator ); // Free the control tree, if one was created locally. 
- bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); #ifdef PRINT_THRINFO threads[id] = thread; diff --git a/frame/thread/bli_thrcomm_openmp.h b/frame/thread/bli_thrcomm_openmp.h index 6808b9772..435845b16 100644 --- a/frame/thread/bli_thrcomm_openmp.h +++ b/frame/thread/bli_thrcomm_openmp.h @@ -60,11 +60,12 @@ struct thrcomm_s #else struct thrcomm_s { - void* sent_object; - dim_t n_threads; + void* sent_object; + dim_t n_threads; - volatile bool_t barrier_sense; - dim_t barrier_threads_arrived; + //volatile bool_t barrier_sense; + bool_t barrier_sense; + dim_t barrier_threads_arrived; }; #endif diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 230b63905..540e161c8 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -43,81 +43,84 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads ) return comm; } -void bli_thrcomm_free( thrcomm_t* communicator ) +void bli_thrcomm_free( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - bli_thrcomm_cleanup( communicator ); - bli_free_intl( communicator ); + if ( comm == NULL ) return; + bli_thrcomm_cleanup( comm ); + bli_free_intl( comm ); } #ifdef BLIS_USE_PTHREAD_BARRIER -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) { - if ( communicator == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - pthread_barrier_init( &communicator->barrier, NULL, n_threads ); + if ( comm == NULL ) return; + comm->sent_object = NULL; + comm->n_threads = n_threads; + pthread_barrier_init( &comm->barrier, NULL, n_threads ); } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - pthread_barrier_destroy( &communicator->barrier ); + if ( comm == NULL ) return; + pthread_barrier_destroy( 
&comm->barrier ); } -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) { - pthread_barrier_wait( &communicator->barrier ); + pthread_barrier_wait( &comm->barrier ); } #else -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) { - if ( communicator == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - communicator->sense = 0; - communicator->threads_arrived = 0; + if ( comm == NULL ) return; + comm->sent_object = NULL; + comm->n_threads = n_threads; + comm->barrier_sense = 0; + comm->barrier_threads_arrived = 0; -#ifdef BLIS_USE_PTHREAD_MUTEX - pthread_mutex_init( &communicator->mutex, NULL ); -#endif +//#ifdef BLIS_USE_PTHREAD_MUTEX +// pthread_mutex_init( &comm->mutex, NULL ); +//#endif } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { -#ifdef BLIS_USE_PTHREAD_MUTEX - if ( communicator == NULL ) return; - pthread_mutex_destroy( &communicator->mutex ); -#endif +//#ifdef BLIS_USE_PTHREAD_MUTEX +// if ( comm == NULL ) return; +// pthread_mutex_destroy( &comm->mutex ); +//#endif } -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) { - if ( communicator == NULL || communicator->n_threads == 1 ) return; - bool_t my_sense = communicator->sense; +#if 0 + if ( comm == NULL || comm->n_threads == 1 ) return; + bool_t my_sense = comm->sense; dim_t my_threads_arrived; #ifdef BLIS_USE_PTHREAD_MUTEX - pthread_mutex_lock( &communicator->mutex ); - my_threads_arrived = ++(communicator->threads_arrived); - pthread_mutex_unlock( &communicator->mutex ); + pthread_mutex_lock( &comm->mutex ); + my_threads_arrived = ++(comm->threads_arrived); + pthread_mutex_unlock( &comm->mutex ); #else - my_threads_arrived = __sync_add_and_fetch(&(communicator->threads_arrived), 1); + 
my_threads_arrived = __sync_add_and_fetch(&(comm->threads_arrived), 1); #endif - if ( my_threads_arrived == communicator->n_threads ) + if ( my_threads_arrived == comm->n_threads ) { - communicator->threads_arrived = 0; - communicator->sense = !communicator->sense; + comm->threads_arrived = 0; + comm->sense = !comm->sense; } else { - volatile bool_t* listener = &communicator->sense; + volatile bool_t* listener = &comm->sense; while( *listener == my_sense ) {} } +#endif + bli_thrcomm_barrier_atomic( comm, t_id ); } #endif @@ -129,6 +132,7 @@ void* bli_l3_thread_entry( void* data_void ); typedef struct thread_data { l3int_t func; + opid_t family; obj_t* alpha; obj_t* a; obj_t* b; @@ -145,6 +149,7 @@ void* bli_l3_thread_entry( void* data_void ) { thread_data_t* data = data_void; + opid_t family = data->family; obj_t* alpha = data->alpha; obj_t* a = data->a; obj_t* b = data->b; @@ -159,7 +164,7 @@ void* bli_l3_thread_entry( void* data_void ) thrinfo_t* thread; // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -177,7 +182,7 @@ void* bli_l3_thread_entry( void* data_void ) ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); // Free the current thread's thrinfo_t structure. bli_l3_thrinfo_free( thread ); @@ -188,6 +193,7 @@ void* bli_l3_thread_entry( void* data_void ) void bli_l3_thread_decorator ( l3int_t func, + opid_t family, obj_t* alpha, obj_t* a, obj_t* b, @@ -214,6 +220,7 @@ void bli_l3_thread_decorator { // Set up thread data for additional threads (beyond thread 0). 
datas[id].func = func; + datas[id].family = family; datas[id].alpha = alpha; datas[id].a = a; datas[id].b = b; diff --git a/frame/thread/bli_thrcomm_pthreads.h b/frame/thread/bli_thrcomm_pthreads.h index 1c807772d..286387bcf 100644 --- a/frame/thread/bli_thrcomm_pthreads.h +++ b/frame/thread/bli_thrcomm_pthreads.h @@ -54,12 +54,13 @@ struct thrcomm_s void* sent_object; dim_t n_threads; -#ifdef BLIS_USE_PTHREAD_MUTEX - pthread_mutex_t mutex; -#endif +//#ifdef BLIS_USE_PTHREAD_MUTEX +// pthread_mutex_t mutex; +//#endif - volatile bool_t sense; - volatile dim_t threads_arrived; + //volatile bool_t barrier_sense; + bool_t barrier_sense; + dim_t barrier_threads_arrived; }; #endif diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index c038f59a0..cb0bc2ae4 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -44,29 +44,29 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads ) return comm; } -void bli_thrcomm_free( thrcomm_t* communicator ) +void bli_thrcomm_free( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - bli_thrcomm_cleanup( communicator ); - bli_free_intl( communicator ); + if ( comm == NULL ) return; + bli_thrcomm_cleanup( comm ); + bli_free_intl( comm ); } -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads ) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads ) { - if ( communicator == NULL ) return; + if ( comm == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - communicator->barrier_sense = 0; - communicator->barrier_threads_arrived = 0; + comm->sent_object = NULL; + comm->n_threads = n_threads; + comm->barrier_sense = 0; + comm->barrier_threads_arrived = 0; } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { - if ( communicator == NULL ) return; + if ( comm == NULL ) return; } -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) +void bli_thrcomm_barrier( 
thrcomm_t* comm, dim_t t_id ) { return; } @@ -74,6 +74,7 @@ void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) void bli_l3_thread_decorator ( l3int_t func, + opid_t family, obj_t* alpha, obj_t* a, obj_t* b, @@ -94,7 +95,7 @@ void bli_l3_thread_decorator thrinfo_t* thread; // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); // Create the root node of the thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -112,7 +113,7 @@ void bli_l3_thread_decorator ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); // Free the current thread's thrinfo_t structure. bli_l3_thrinfo_free( thread ); diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 37ec94292..2d150c656 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -218,9 +218,10 @@ siz_t bli_thread_get_range_l2r dim_t* end ) { + num_t dt = bli_obj_datatype( *a ); dim_t m = bli_obj_length_after_trans( *a ); dim_t n = bli_obj_width_after_trans( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_get_range_sub( thr, n, bf, FALSE, start, end ); @@ -237,9 +238,10 @@ siz_t bli_thread_get_range_r2l dim_t* end ) { + num_t dt = bli_obj_datatype( *a ); dim_t m = bli_obj_length_after_trans( *a ); dim_t n = bli_obj_width_after_trans( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_get_range_sub( thr, n, bf, TRUE, start, end ); @@ -256,9 +258,10 @@ siz_t bli_thread_get_range_t2b dim_t* end ) { + num_t dt = bli_obj_datatype( *a ); dim_t m = bli_obj_length_after_trans( *a ); dim_t n = bli_obj_width_after_trans( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, 
bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_get_range_sub( thr, m, bf, FALSE, start, end ); @@ -275,9 +278,10 @@ siz_t bli_thread_get_range_b2t dim_t* end ) { + num_t dt = bli_obj_datatype( *a ); dim_t m = bli_obj_length_after_trans( *a ); dim_t n = bli_obj_width_after_trans( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_get_range_sub( thr, m, bf, TRUE, start, end ); @@ -649,7 +653,7 @@ siz_t bli_thread_get_range_mdim ) { bszid_t bszid = bli_cntl_bszid( cntl ); - opid_t family = bli_cntx_get_family( cntx ); + opid_t family = bli_cntl_family( cntl ); // This is part of trsm's current implementation, whereby right side // cases are implemented in left-side micro-kernels, which requires @@ -708,7 +712,7 @@ siz_t bli_thread_get_range_ndim ) { bszid_t bszid = bli_cntl_bszid( cntl ); - opid_t family = bli_cntx_get_family( cntx ); + opid_t family = bli_cntl_family( cntl ); // This is part of trsm's current implementation, whereby right side // cases are implemented in left-side micro-kernels, which requires @@ -771,11 +775,12 @@ siz_t bli_thread_get_range_weighted_l2r if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { + num_t dt = bli_obj_datatype( *a ); doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. 
if ( bli_obj_has_trans( *a ) ) @@ -820,11 +825,12 @@ siz_t bli_thread_get_range_weighted_r2l if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { + num_t dt = bli_obj_datatype( *a ); doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( *a ) ) @@ -871,11 +877,12 @@ siz_t bli_thread_get_range_weighted_t2b if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { + num_t dt = bli_obj_datatype( *a ); doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( *a ) ) @@ -922,11 +929,12 @@ siz_t bli_thread_get_range_weighted_b2t if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { + num_t dt = bli_obj_datatype( *a ); doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( *a ) ) @@ -1156,19 +1164,112 @@ void bli_partition_2x2( dim_t nthread, dim_t work1, dim_t work2, // ----------------------------------------------------------------------------- -// Some utilities -dim_t bli_env_read_nway( const char* env, dim_t fallback ) +dim_t bli_thread_get_env( const char* env, dim_t fallback ) { - dim_t num = fallback; - char* str = getenv( env ); + dim_t r_val; + char* str; + // Query the environment variable and store the result in str. 
+ str = getenv( env ); + + // Set the return value based on the string obtained from getenv(). if ( str != NULL ) - { - num = strtol( str, NULL, 10 ); - } - return num; + { + // If there was no error, convert the string to an integer and + // prepare to return that integer. + r_val = strtol( str, NULL, 10 ); + } + else + { + // If there was an error, use the "fallback" as the return value. + r_val = fallback; + } + + return r_val; } +dim_t bli_thread_get_jc_nt( void ) +{ + return bli_thread_get_env( "BLIS_JC_NT", 1 ); +} + +dim_t bli_thread_get_ic_nt( void ) +{ + return bli_thread_get_env( "BLIS_IC_NT", 1 ); +} + +dim_t bli_thread_get_jr_nt( void ) +{ + return bli_thread_get_env( "BLIS_JR_NT", 1 ); +} + +dim_t bli_thread_get_ir_nt( void ) +{ + return bli_thread_get_env( "BLIS_IR_NT", 1 ); +} + +dim_t bli_thread_get_num_threads( void ) +{ + return bli_thread_get_env( "BLIS_NUM_THREADS", 1 ); +} + +void bli_thread_set_env( const char* env, dim_t value ) +{ + dim_t r_val; + char value_str[32]; + const char* fs_32 = "%u"; + const char* fs_64 = "%lu"; + + // Convert the string to an integer, but vary the format specifier + // depending on the integer type size. + if ( bli_info_get_int_type_size() == 32 ) sprintf( value_str, fs_32, value ); + else sprintf( value_str, fs_64, value ); + + // Set the environment variable using the string we just wrote to via + // sprintf(). (The 'TRUE' argument means we want to overwrite the current + // value if the environment variable already exists.) + r_val = setenv( env, value_str, TRUE ); + + // Check the return value in case something went horribly wrong. + if ( r_val == -1 ) + { + char err_str[128]; + + // Query the human-readable error string corresponding to errno. + strerror_r( errno, err_str, 128 ); + + // Print the error message. 
+ bli_print_msg( err_str, __FILE__, __LINE__ ); + } +} + +void bli_thread_set_jc_nt( dim_t value ) +{ + bli_thread_set_env( "BLIS_JC_NT", value ); +} + +void bli_thread_set_ic_nt( dim_t value ) +{ + bli_thread_set_env( "BLIS_IC_NT", value ); +} + +void bli_thread_set_jr_nt( dim_t value ) +{ + bli_thread_set_env( "BLIS_JR_NT", value ); +} + +void bli_thread_set_ir_nt( dim_t value ) +{ + bli_thread_set_env( "BLIS_IR_NT", value ); +} + +void bli_thread_set_num_threads( dim_t value ) +{ + bli_thread_set_env( "BLIS_NUM_THREADS", value ); +} + +// ----------------------------------------------------------------------------- + dim_t bli_gcd( dim_t x, dim_t y ) { while ( y != 0 ) diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 1998253cf..a88d24bc0 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -155,6 +155,7 @@ typedef void (*l3int_t) void bli_l3_thread_decorator ( l3int_t func, + opid_t family, obj_t* alpha, obj_t* a, obj_t* b, @@ -164,6 +165,8 @@ void bli_l3_thread_decorator cntl_t* cntl ); +// ----------------------------------------------------------------------------- + // Factorization and partitioning prototypes typedef struct { @@ -178,8 +181,26 @@ dim_t bli_next_prime_factor(bli_prime_factors_t* factors); void bli_partition_2x2(dim_t nthread, dim_t work1, dim_t work2, dim_t* nt1, dim_t* nt2); -// Miscellaneous prototypes -dim_t bli_env_read_nway( const char* env, dim_t fallback ); +// ----------------------------------------------------------------------------- + +dim_t bli_thread_get_env( const char* env, dim_t fallback ); + +dim_t bli_thread_get_jc_nt( void ); +dim_t bli_thread_get_ic_nt( void ); +dim_t bli_thread_get_jr_nt( void ); +dim_t bli_thread_get_ir_nt( void ); +dim_t bli_thread_get_num_threads( void ); + +void bli_thread_set_env( const char* env, dim_t value ); + +void bli_thread_set_jc_nt( dim_t value ); +void bli_thread_set_ic_nt( dim_t value ); +void bli_thread_set_jr_nt( dim_t value ); +void 
bli_thread_set_ir_nt( dim_t value ); +void bli_thread_set_num_threads( dim_t value ); + +// ----------------------------------------------------------------------------- + dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c index 5bc2dd4ba..2088e030a 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c @@ -634,6 +634,8 @@ void bli_sgemm_asm_24x4 " \n\t" " \n\t" ".SDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1253,6 +1255,8 @@ void bli_dgemm_asm_12x4 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c index c92612b07..5eb0f0732 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c @@ -600,6 +600,8 @@ void bli_sgemm_asm_4x24 " \n\t" " \n\t" ".SDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1186,6 +1188,8 @@ void bli_dgemm_asm_4x12 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index bee1df996..78b294053 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -595,6 +595,8 @@ void bli_sgemm_asm_6x16 " \n\t" " \n\t" ".SDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -734,6 +736,8 @@ void bli_dgemm_asm_6x8 "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 + "prefetcht0 72 * 8(%%rax) \n\t" + " \n\t" "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 
\n\t" @@ -759,7 +763,7 @@ void bli_dgemm_asm_6x8 "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 2 - "prefetcht0 76 * 8(%%rax) \n\t" + "prefetcht0 80 * 8(%%rax) \n\t" " \n\t" "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" @@ -1173,6 +1177,8 @@ void bli_dgemm_asm_6x8 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1700,6 +1706,8 @@ void bli_cgemm_asm_3x8 " \n\t" " \n\t" ".CDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -2226,6 +2234,8 @@ void bli_zgemm_asm_3x4 " \n\t" " \n\t" ".ZDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c index f173947c3..9796e27ef 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c @@ -596,6 +596,8 @@ void bli_sgemm_asm_16x6 " \n\t" " \n\t" ".SDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1176,6 +1178,8 @@ void bli_dgemm_asm_8x6 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1703,6 +1707,8 @@ void bli_cgemm_asm_8x3 " \n\t" " \n\t" ".CDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -2229,6 +2235,8 @@ void bli_zgemm_asm_4x3 " \n\t" " \n\t" ".ZDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c b/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c index f8db398ca..f19f053fc 100644 --- a/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c +++ b/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c @@ -991,7 +991,9 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" ".SDONE: \n\t" - " \n\t" + " \n\t" + "vzeroupper \n\t" + " \n\t" : // output operands (none) : // input operands @@ -1658,6 +1660,8 @@ void bli_dgemm_asm_8x4 " \n\t" " 
\n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -2611,6 +2615,8 @@ void bli_cgemm_asm_8x4 " \n\t" " \n\t" ".CDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -3453,6 +3459,8 @@ void bli_zgemm_asm_4x4 " \n\t" " \n\t" ".ZDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 993c134b4..84552b569 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -1903,7 +1903,7 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia else does_inv_diag = TRUE; // Create a control tree node for the packing operation. - cntl_t* cntl = bli_packm_cntl_obj_create + cntl_t* cntl = bli_packm_cntl_create_node ( NULL, // func ptr is not referenced b/c we don't call via l3 _int(). bli_packm_blk_var1,