mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
resolving conflicts in bli_gemm_front.c and LICENSE
Change-Id: Id24ce53896d4c1c7ceccc3e004014a0ecceb5474
This commit is contained in:
2
LICENSE
2
LICENSE
@@ -1,8 +1,6 @@
|
||||
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
4
Makefile
4
Makefile
@@ -85,9 +85,6 @@ TESTSUITE_CONF_GEN := input.general
|
||||
TESTSUITE_CONF_OPS := input.operations
|
||||
TESTSUITE_OUT_FILE := output.testsuite
|
||||
|
||||
# The name of the file where the version string is stored.
|
||||
VERSION_FILE := version
|
||||
|
||||
# The name of the "special" directories, which contain source code that
|
||||
# use non-standard compiler flags.
|
||||
NOOPT_DIR := noopt
|
||||
@@ -141,7 +138,6 @@ BASE_LIB_PATH := ./$(LIB_DIR)/$(CONFIG_NAME)
|
||||
|
||||
# Construct the architecture-version string, which will be used to name the
|
||||
# library upon installation.
|
||||
VERSION := $(shell cat $(DIST_PATH)/$(VERSION_FILE))
|
||||
VERS_CONF := $(VERSION)-$(CONFIG_NAME)
|
||||
|
||||
# --- Library names ---
|
||||
|
||||
@@ -313,7 +313,7 @@ This project and its associated research was partially sponsored by grants from
|
||||
[Microsoft](http://www.microsoft.com/), [Intel](http://www.intel.com/), [Texas
|
||||
Instruments](http://www.ti.com/), and [AMD](http://www.amd.com/), as well as
|
||||
grants from the [National Science Foundation](http://www.nsf.gov/) (Awards
|
||||
CCF-0917167 ACI-1148125/1340293, and CCF-1320112).
|
||||
CCF-0917167, ACI-1148125/1340293, CCF-1320112, and ACI-1550493).
|
||||
|
||||
_Any opinions, findings and conclusions or recommendations expressed in this
|
||||
material are those of the author(s) and do not necessarily reflect the views of
|
||||
|
||||
@@ -36,6 +36,10 @@
|
||||
ifndef CONFIG_MK_INCLUDED
|
||||
CONFIG_MK_INCLUDED := yes
|
||||
|
||||
# The version string. This could be the official string or a custom
|
||||
# string forced at configure-time.
|
||||
VERSION := @version@
|
||||
|
||||
# The name of the configuration sub-directory.
|
||||
CONFIG_NAME := @config_name@
|
||||
|
||||
|
||||
@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -59,7 +59,7 @@ CVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunr
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -83,7 +83,7 @@ endif
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -58,7 +58,8 @@ CVECFLAGS :=
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := emar
|
||||
ARFLAGS := cru
|
||||
RANLIB := emranlib
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -83,7 +83,7 @@ endif
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -94,7 +94,7 @@ endif
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -81,7 +81,7 @@ endif
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -83,7 +83,7 @@ endif
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
|
||||
36
configure
vendored
36
configure
vendored
@@ -123,6 +123,12 @@ print_usage()
|
||||
echo " compatibility layer. This automatically enables the"
|
||||
echo " BLAS compatibility layer as well."
|
||||
echo " "
|
||||
echo " --force-version=STRING"
|
||||
echo " "
|
||||
echo " Force configure to use an arbitrary version string"
|
||||
echo " STRING. This option may be useful when repackaging"
|
||||
echo " custom versions of BLIS by outside organizations."
|
||||
echo " "
|
||||
echo " -h, --help Output this information and quit."
|
||||
echo " "
|
||||
echo " Environment Variables:"
|
||||
@@ -232,6 +238,7 @@ main()
|
||||
blas2blis_int_type_size=32
|
||||
enable_blas2blis='yes'
|
||||
enable_cblas='no'
|
||||
force_version='no'
|
||||
|
||||
# The path to the auto-detection script.
|
||||
auto_detect_sh="${build_dirpath}/auto-detect/auto-detect.sh"
|
||||
@@ -247,14 +254,6 @@ main()
|
||||
dummy_file='_blis_dir_detect.tmp'
|
||||
|
||||
|
||||
# Check whether we need to update the version file.
|
||||
${update_version_file_sh} -o "${script_name}" "${version_filepath}"
|
||||
|
||||
|
||||
# Query which version of BLIS this is.
|
||||
version=$(cat ${version_filepath})
|
||||
|
||||
|
||||
# Process our command line options.
|
||||
while getopts ":hp:d:t:qi:b:-:" opt; do
|
||||
case $opt in
|
||||
@@ -323,6 +322,9 @@ main()
|
||||
disable-cblas)
|
||||
enable_cblas='no'
|
||||
;;
|
||||
force-version=*)
|
||||
force_version=${OPTARG#*=}
|
||||
;;
|
||||
*)
|
||||
print_usage
|
||||
;;
|
||||
@@ -375,10 +377,27 @@ main()
|
||||
done
|
||||
|
||||
|
||||
# Check whether we need to update the version file.
|
||||
${update_version_file_sh} -o "${script_name}" "${version_filepath}"
|
||||
|
||||
|
||||
# Query which version of BLIS this is.
|
||||
version=$(cat ${version_filepath})
|
||||
|
||||
|
||||
# Initial message.
|
||||
echo "${script_name}: starting configuration of BLIS ${version}."
|
||||
|
||||
|
||||
# Check if the user requested a custom version string.
|
||||
if [ "x${force_version}" = "xno" ]; then
|
||||
echo "${script_name}: configuring with official version string."
|
||||
else
|
||||
echo "${script_name}: configuring with custom version string '${force_version}'."
|
||||
version="${force_version}"
|
||||
fi
|
||||
|
||||
|
||||
# Set config_name based on the number of arguments leftover (after command
|
||||
# line option processing).
|
||||
if [ $# = "0" ]; then
|
||||
@@ -574,6 +593,7 @@ main()
|
||||
# to config_mk_out.
|
||||
echo "${script_name}: creating ${config_mk_out_path} from ${config_mk_in_path}"
|
||||
cat "${config_mk_in_path}" \
|
||||
| sed "s/@version@/${version}/g" \
|
||||
| sed "s/@config_name@/${config_name}/g" \
|
||||
| sed "s/@dist_path@/${dist_path_esc}/g" \
|
||||
| sed "s/@CC@/${cc_esc}/g" \
|
||||
|
||||
@@ -43,7 +43,7 @@
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with the kernel associated with the current
|
||||
operation. */ \
|
||||
@@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
} \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( addv, BLIS_ADDV_KER )
|
||||
@@ -70,7 +70,7 @@ GENFRONT( swapv, BLIS_SWAPV_KER )
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(dep1,_cntx_init)( dt, cntx ); \
|
||||
@@ -84,7 +84,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
} \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv )
|
||||
@@ -95,7 +95,7 @@ GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv )
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
|
||||
@@ -106,7 +106,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
} \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( axpyv, BLIS_AXPYV_KER, addv )
|
||||
@@ -118,7 +118,7 @@ GENFRONT( scalv, BLIS_SCALV_KER, setv )
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(dep1,_cntx_init)( dt, cntx ); \
|
||||
@@ -130,7 +130,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
} \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( scal2v, BLIS_SCAL2V_KER, setv, copyv )
|
||||
|
||||
@@ -47,7 +47,7 @@ void bli_packv_cntl_init( void )
|
||||
|
||||
void bli_packv_cntl_finalize( void )
|
||||
{
|
||||
bli_cntl_obj_free( packv_cntl );
|
||||
bli_cntl_free_node( packv_cntl );
|
||||
}
|
||||
|
||||
packv_t* bli_packv_cntl_obj_create( impl_t impl_type,
|
||||
@@ -105,7 +105,7 @@ cntl_t* bli_packv_cntl_obj_create
|
||||
// that no blocksize partitioning is performed. bli_cntl_free() will rely
|
||||
// on this information to know how to step through the thrinfo_t tree in
|
||||
// sync with the cntl_t tree.
|
||||
cntl = bli_cntl_obj_create
|
||||
cntl = bli_cntl_create_node
|
||||
(
|
||||
BLIS_NO_PART,
|
||||
var_func,
|
||||
|
||||
@@ -44,7 +44,7 @@ void bli_scalv_cntl_init()
|
||||
|
||||
void bli_scalv_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( scalv_cntl );
|
||||
bli_cntl_free_node( scalv_cntl );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ void bli_unpackv_cntl_init()
|
||||
|
||||
void bli_unpackv_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( unpackv_cntl );
|
||||
bli_cntl_free_node( unpackv_cntl );
|
||||
}
|
||||
|
||||
unpackv_t* bli_unpackv_cntl_obj_create( impl_t impl_type,
|
||||
|
||||
@@ -43,7 +43,7 @@
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
|
||||
@@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( addd, addv )
|
||||
|
||||
@@ -43,7 +43,7 @@
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
|
||||
@@ -54,7 +54,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
} \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv )
|
||||
@@ -65,7 +65,7 @@ GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv )
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname1,_cntx_init)( dt, cntx ); \
|
||||
@@ -77,7 +77,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
} \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv )
|
||||
@@ -88,7 +88,7 @@ GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv )
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
|
||||
@@ -105,7 +105,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
} \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv )
|
||||
@@ -116,7 +116,7 @@ GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv )
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname1,_cntx_init)( dt, cntx ); \
|
||||
@@ -135,7 +135,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
} \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( dotxf, BLIS_DOTXF_KER, dotv, dotxv )
|
||||
|
||||
@@ -43,7 +43,7 @@
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
|
||||
@@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( addm, addv )
|
||||
@@ -66,7 +66,7 @@ GENFRONT( subm, subv )
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernel dependencies. */ \
|
||||
PASTEMAC(depname1,_cntx_init)( dt, cntx ); \
|
||||
@@ -75,7 +75,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
\
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( copym, copyv, setv )
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
cntl_t* bli_packm_cntl_obj_create
|
||||
cntl_t* bli_packm_cntl_create_node
|
||||
(
|
||||
void* var_func,
|
||||
void* packm_var_func,
|
||||
@@ -69,8 +69,9 @@ cntl_t* bli_packm_cntl_obj_create
|
||||
// that no blocksize partitioning is performed. bli_cntl_free() will rely
|
||||
// on this information to know how to step through the thrinfo_t tree in
|
||||
// sync with the cntl_t tree.
|
||||
cntl = bli_cntl_obj_create
|
||||
cntl = bli_cntl_create_node
|
||||
(
|
||||
BLIS_NOID,
|
||||
BLIS_NO_PART,
|
||||
var_func,
|
||||
params,
|
||||
|
||||
@@ -80,7 +80,7 @@ typedef struct packm_params_s packm_params_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_packm_cntl_obj_create
|
||||
cntl_t* bli_packm_cntl_create_node
|
||||
(
|
||||
void* var_func,
|
||||
void* packm_var_func,
|
||||
|
||||
@@ -41,7 +41,7 @@
|
||||
|
||||
void bli_packm_cntx_init( num_t dt, cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_obj_create( cntx );
|
||||
bli_cntx_create( cntx );
|
||||
|
||||
// Initialize the context with kernels that may be needed for the
|
||||
// current operation.
|
||||
@@ -57,5 +57,5 @@ void bli_packm_cntx_init( num_t dt, cntx_t* cntx )
|
||||
|
||||
void bli_packm_cntx_finalize( cntx_t* cntx )
|
||||
{
|
||||
bli_cntx_obj_free( cntx );
|
||||
bli_cntx_free( cntx );
|
||||
}
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
cntl_t* bli_scalm_cntl_obj_create
|
||||
cntl_t* bli_scalm_cntl_create_node
|
||||
(
|
||||
void* var_func,
|
||||
cntl_t* sub_node
|
||||
@@ -46,8 +46,9 @@ cntl_t* bli_scalm_cntl_obj_create
|
||||
// that no blocksize partitioning is performed. bli_cntl_free() will rely
|
||||
// on this information to know how to step through the thrinfo_t tree in
|
||||
// sync with the cntl_t tree.
|
||||
cntl = bli_cntl_obj_create
|
||||
cntl = bli_cntl_create_node
|
||||
(
|
||||
BLIS_NOID,
|
||||
BLIS_NO_PART,
|
||||
var_func,
|
||||
NULL,
|
||||
|
||||
@@ -33,7 +33,7 @@
|
||||
*/
|
||||
|
||||
|
||||
cntl_t* bli_scalm_cntl_obj_create
|
||||
cntl_t* bli_scalm_cntl_create_node
|
||||
(
|
||||
void* var_func,
|
||||
cntl_t* sub_node
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
cntl_t* bli_unpackm_cntl_obj_create
|
||||
cntl_t* bli_unpackm_cntl_create_node
|
||||
(
|
||||
void* var_func,
|
||||
void* unpackm_var_func,
|
||||
@@ -55,8 +55,9 @@ cntl_t* bli_unpackm_cntl_obj_create
|
||||
// that no blocksize partitioning is performed. bli_cntl_free() will rely
|
||||
// on this information to know how to step through the thrinfo_t tree in
|
||||
// sync with the cntl_t tree.
|
||||
cntl = bli_cntl_obj_create
|
||||
cntl = bli_cntl_create_node
|
||||
(
|
||||
BLIS_NOID,
|
||||
BLIS_NO_PART,
|
||||
var_func,
|
||||
params,
|
||||
|
||||
@@ -45,7 +45,7 @@ typedef struct unpackm_params_s unpackm_params_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_unpackm_cntl_obj_create
|
||||
cntl_t* bli_unpackm_cntl_create_node
|
||||
(
|
||||
void* var_func,
|
||||
void* unpackm_var_func,
|
||||
|
||||
@@ -45,7 +45,7 @@
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
/* Perform basic setup on the context. */ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernels employed by the current
|
||||
operation. */ \
|
||||
@@ -127,7 +127,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
/* Free the context and all memory allocated to it. */ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
GENFRONT( trmv )
|
||||
GENFRONT( trsv )
|
||||
@@ -139,7 +139,7 @@ GENFRONT( trsv )
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
/* Perform basic setup on the context. */ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernels employed by the current
|
||||
operation. */ \
|
||||
@@ -159,7 +159,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
/* Free the context and all memory allocated to it. */ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( ger )
|
||||
@@ -173,7 +173,7 @@ GENFRONT( syr )
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
/* Perform basic setup on the context. */ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernels employed by the current
|
||||
operation. */ \
|
||||
@@ -211,7 +211,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
/* Free the context and all memory allocated to it. */ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( hemv )
|
||||
@@ -224,7 +224,7 @@ GENFRONT( symv )
|
||||
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
{ \
|
||||
/* Perform basic setup on the context. */ \
|
||||
bli_cntx_obj_create( cntx ); \
|
||||
bli_cntx_create( cntx ); \
|
||||
\
|
||||
/* Initialize the context with kernels employed by the current
|
||||
operation. */ \
|
||||
@@ -246,7 +246,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
|
||||
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
|
||||
{ \
|
||||
/* Free the context and all memory allocated to it. */ \
|
||||
bli_cntx_obj_free( cntx ); \
|
||||
bli_cntx_free( cntx ); \
|
||||
}
|
||||
|
||||
GENFRONT( her2 )
|
||||
|
||||
@@ -152,17 +152,17 @@ void bli_gemv_cntl_init()
|
||||
|
||||
void bli_gemv_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( gemv_cntl_bs_ke_dot );
|
||||
bli_cntl_obj_free( gemv_cntl_bs_ke_axpy );
|
||||
bli_cntl_free_node( gemv_cntl_bs_ke_dot );
|
||||
bli_cntl_free_node( gemv_cntl_bs_ke_axpy );
|
||||
|
||||
bli_cntl_obj_free( gemv_cntl_rp_bs_dot );
|
||||
bli_cntl_obj_free( gemv_cntl_rp_bs_axpy );
|
||||
bli_cntl_free_node( gemv_cntl_rp_bs_dot );
|
||||
bli_cntl_free_node( gemv_cntl_rp_bs_axpy );
|
||||
|
||||
bli_cntl_obj_free( gemv_cntl_cp_bs_dot );
|
||||
bli_cntl_obj_free( gemv_cntl_cp_bs_axpy );
|
||||
bli_cntl_free_node( gemv_cntl_cp_bs_dot );
|
||||
bli_cntl_free_node( gemv_cntl_cp_bs_axpy );
|
||||
|
||||
bli_cntl_obj_free( gemv_cntl_ge_dot );
|
||||
bli_cntl_obj_free( gemv_cntl_ge_axpy );
|
||||
bli_cntl_free_node( gemv_cntl_ge_dot );
|
||||
bli_cntl_free_node( gemv_cntl_ge_axpy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -145,17 +145,17 @@ void bli_ger_cntl_init()
|
||||
|
||||
void bli_ger_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( ger_cntl_bs_ke_row );
|
||||
bli_cntl_obj_free( ger_cntl_bs_ke_col );
|
||||
bli_cntl_free_node( ger_cntl_bs_ke_row );
|
||||
bli_cntl_free_node( ger_cntl_bs_ke_col );
|
||||
|
||||
bli_cntl_obj_free( ger_cntl_rp_bs_row );
|
||||
bli_cntl_obj_free( ger_cntl_rp_bs_col );
|
||||
bli_cntl_free_node( ger_cntl_rp_bs_row );
|
||||
bli_cntl_free_node( ger_cntl_rp_bs_col );
|
||||
|
||||
bli_cntl_obj_free( ger_cntl_cp_bs_row );
|
||||
bli_cntl_obj_free( ger_cntl_cp_bs_col );
|
||||
bli_cntl_free_node( ger_cntl_cp_bs_row );
|
||||
bli_cntl_free_node( ger_cntl_cp_bs_col );
|
||||
|
||||
bli_cntl_obj_free( ger_cntl_ge_row );
|
||||
bli_cntl_obj_free( ger_cntl_ge_col );
|
||||
bli_cntl_free_node( ger_cntl_ge_row );
|
||||
bli_cntl_free_node( ger_cntl_ge_col );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -108,10 +108,10 @@ void bli_hemv_cntl_init()
|
||||
|
||||
void bli_hemv_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( hemv_cntl_bs_ke_lrow_ucol );
|
||||
bli_cntl_obj_free( hemv_cntl_bs_ke_lcol_urow );
|
||||
bli_cntl_obj_free( hemv_cntl_ge_lrow_ucol );
|
||||
bli_cntl_obj_free( hemv_cntl_ge_lcol_urow );
|
||||
bli_cntl_free_node( hemv_cntl_bs_ke_lrow_ucol );
|
||||
bli_cntl_free_node( hemv_cntl_bs_ke_lcol_urow );
|
||||
bli_cntl_free_node( hemv_cntl_ge_lrow_ucol );
|
||||
bli_cntl_free_node( hemv_cntl_ge_lcol_urow );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -97,10 +97,10 @@ void bli_her_cntl_init()
|
||||
|
||||
void bli_her_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( her_cntl_bs_ke_lrow_ucol );
|
||||
bli_cntl_obj_free( her_cntl_bs_ke_lcol_urow );
|
||||
bli_cntl_obj_free( her_cntl_ge_lrow_ucol );
|
||||
bli_cntl_obj_free( her_cntl_ge_lcol_urow );
|
||||
bli_cntl_free_node( her_cntl_bs_ke_lrow_ucol );
|
||||
bli_cntl_free_node( her_cntl_bs_ke_lcol_urow );
|
||||
bli_cntl_free_node( her_cntl_ge_lrow_ucol );
|
||||
bli_cntl_free_node( her_cntl_ge_lcol_urow );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -101,10 +101,10 @@ void bli_her2_cntl_init()
|
||||
|
||||
void bli_her2_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( her2_cntl_bs_ke_lrow_ucol );
|
||||
bli_cntl_obj_free( her2_cntl_bs_ke_lcol_urow );
|
||||
bli_cntl_obj_free( her2_cntl_ge_lrow_ucol );
|
||||
bli_cntl_obj_free( her2_cntl_ge_lcol_urow );
|
||||
bli_cntl_free_node( her2_cntl_bs_ke_lrow_ucol );
|
||||
bli_cntl_free_node( her2_cntl_bs_ke_lcol_urow );
|
||||
bli_cntl_free_node( her2_cntl_ge_lrow_ucol );
|
||||
bli_cntl_free_node( her2_cntl_ge_lcol_urow );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -98,10 +98,10 @@ void bli_trmv_cntl_init()
|
||||
|
||||
void bli_trmv_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( trmv_cntl_bs_ke_nrow_tcol );
|
||||
bli_cntl_obj_free( trmv_cntl_bs_ke_ncol_trow );
|
||||
bli_cntl_obj_free( trmv_cntl_ge_nrow_tcol );
|
||||
bli_cntl_obj_free( trmv_cntl_ge_ncol_trow );
|
||||
bli_cntl_free_node( trmv_cntl_bs_ke_nrow_tcol );
|
||||
bli_cntl_free_node( trmv_cntl_bs_ke_ncol_trow );
|
||||
bli_cntl_free_node( trmv_cntl_ge_nrow_tcol );
|
||||
bli_cntl_free_node( trmv_cntl_ge_ncol_trow );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -101,10 +101,10 @@ void bli_trsv_cntl_init()
|
||||
|
||||
void bli_trsv_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( trsv_cntl_bs_ke_nrow_tcol );
|
||||
bli_cntl_obj_free( trsv_cntl_bs_ke_ncol_trow );
|
||||
bli_cntl_obj_free( trsv_cntl_ge_nrow_tcol );
|
||||
bli_cntl_obj_free( trsv_cntl_ge_ncol_trow );
|
||||
bli_cntl_free_node( trsv_cntl_bs_ke_nrow_tcol );
|
||||
bli_cntl_free_node( trsv_cntl_bs_ke_ncol_trow );
|
||||
bli_cntl_free_node( trsv_cntl_ge_nrow_tcol );
|
||||
bli_cntl_free_node( trsv_cntl_ge_ncol_trow );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -43,10 +43,11 @@ dim_t bli_l3_determine_kc
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
bszid_t bszid,
|
||||
cntx_t* cntx
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
opid_t family = bli_cntx_family( cntx );
|
||||
opid_t family = bli_cntl_family( cntl );
|
||||
|
||||
if ( family == BLIS_GEMM )
|
||||
return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx );
|
||||
|
||||
@@ -32,6 +32,18 @@
|
||||
|
||||
*/
|
||||
|
||||
dim_t bli_l3_determine_kc
|
||||
(
|
||||
dir_t direct,
|
||||
dim_t i,
|
||||
dim_t dim,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
bszid_t bszid,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
@@ -47,8 +59,6 @@ dim_t PASTEMAC0(opname) \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
GENPROT( l3_determine_kc )
|
||||
|
||||
GENPROT( gemm_determine_kc )
|
||||
GENPROT( herk_determine_kc )
|
||||
GENPROT( trmm_determine_kc )
|
||||
|
||||
@@ -37,10 +37,10 @@
|
||||
|
||||
void bli_l3_cntl_create_if
|
||||
(
|
||||
opid_t family,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl_orig,
|
||||
cntl_t** cntl_use
|
||||
)
|
||||
@@ -49,8 +49,6 @@ void bli_l3_cntl_create_if
|
||||
// tree as a function of the operation family.
|
||||
if ( cntl_orig == NULL )
|
||||
{
|
||||
opid_t family = bli_cntx_get_family( cntx );
|
||||
|
||||
if ( family == BLIS_GEMM ||
|
||||
family == BLIS_HERK ||
|
||||
family == BLIS_TRMM )
|
||||
@@ -73,6 +71,10 @@ void bli_l3_cntl_create_if
|
||||
// instead (so that threads can use its local tree as a place to
|
||||
// cache things like pack mem_t entries).
|
||||
*cntl_use = bli_cntl_copy( cntl_orig );
|
||||
|
||||
// Recursively set the family fields of the newly copied control tree
|
||||
// nodes.
|
||||
bli_cntl_mark_family( family, *cntl_use );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,7 +83,6 @@ void bli_l3_cntl_free_if
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl_orig,
|
||||
cntl_t* cntl_use,
|
||||
thrinfo_t* thread
|
||||
@@ -91,7 +92,7 @@ void bli_l3_cntl_free_if
|
||||
// been created, so we now must free it.
|
||||
if ( cntl_orig == NULL )
|
||||
{
|
||||
opid_t family = bli_cntx_get_family( cntx );
|
||||
opid_t family = bli_cntl_family( cntl_use );
|
||||
|
||||
if ( family == BLIS_GEMM ||
|
||||
family == BLIS_HERK ||
|
||||
|
||||
@@ -39,10 +39,10 @@
|
||||
|
||||
void bli_l3_cntl_create_if
|
||||
(
|
||||
opid_t family,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl_orig,
|
||||
cntl_t** cntl_use
|
||||
);
|
||||
@@ -52,7 +52,6 @@ void bli_l3_cntl_free_if
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl_orig,
|
||||
cntl_t* cntl_use,
|
||||
thrinfo_t* thread
|
||||
|
||||
@@ -41,7 +41,7 @@
|
||||
void bli_gemm_cntx_init( num_t dt, cntx_t* cntx )
|
||||
{
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
@@ -76,7 +76,7 @@ void bli_gemm_cntx_finalize( cntx_t* cntx )
|
||||
void bli_trsm_cntx_init( num_t dt, cntx_t* cntx )
|
||||
{
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
|
||||
@@ -39,11 +39,11 @@ dir_t bli_l3_direct
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
// Query the operation family.
|
||||
opid_t family = bli_cntx_family( cntx );
|
||||
opid_t family = bli_cntl_family( cntl );
|
||||
|
||||
if ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c );
|
||||
else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c );
|
||||
|
||||
@@ -37,7 +37,7 @@ dir_t bli_l3_direct
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -115,12 +115,13 @@ void bli_l3_packm
|
||||
// buffer, then a block has already been acquired from the memory
|
||||
// broker and cached in the control tree.
|
||||
|
||||
// BUT, we need to make sure that the mem_t object is not associated
|
||||
// with a block that is too small given the size of the packed matrix
|
||||
// that we need, according to the return value from packm_init().
|
||||
// As a sanity check, we should make sure that the mem_t object isn't
|
||||
// associated with a block that is too small compared to the size of
|
||||
// the packed matrix buffer that is needed, according to the return
|
||||
// value from packm_init().
|
||||
siz_t cntl_mem_size = bli_mem_size( cntl_mem_p );
|
||||
|
||||
if ( size_needed < cntl_mem_size )
|
||||
if ( cntl_mem_size < size_needed )
|
||||
{
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
{
|
||||
|
||||
@@ -40,11 +40,11 @@ void bli_l3_prune_unref_mparts_m
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
// Query the operation family.
|
||||
opid_t family = bli_cntx_family( cntx );
|
||||
opid_t family = bli_cntl_family( cntl );
|
||||
|
||||
if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm.
|
||||
else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c );
|
||||
@@ -61,11 +61,11 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \
|
||||
obj_t* a, \
|
||||
obj_t* b, \
|
||||
obj_t* c, \
|
||||
cntx_t* cntx \
|
||||
cntl_t* cntl \
|
||||
) \
|
||||
{ \
|
||||
/* Query the operation family. */ \
|
||||
opid_t family = bli_cntx_family( cntx ); \
|
||||
opid_t family = bli_cntl_family( cntl ); \
|
||||
\
|
||||
if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \
|
||||
else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \
|
||||
|
||||
@@ -41,7 +41,7 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \
|
||||
obj_t* a, \
|
||||
obj_t* b, \
|
||||
obj_t* c, \
|
||||
cntx_t* cntx \
|
||||
cntl_t* cntl \
|
||||
);
|
||||
|
||||
GENPROT( m )
|
||||
|
||||
@@ -53,10 +53,10 @@ void bli_gemm_blk_var1
|
||||
dim_t my_start, my_end;
|
||||
|
||||
// Determine the direction in which to partition (forwards or backwards).
|
||||
direct = bli_l3_direct( a, b, c, cntx );
|
||||
direct = bli_l3_direct( a, b, c, cntl );
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_l3_prune_unref_mparts_m( a, b, c, cntx );
|
||||
bli_l3_prune_unref_mparts_m( a, b, c, cntl );
|
||||
|
||||
// Determine the current thread's subpartition range.
|
||||
bli_thread_get_range_mdim
|
||||
|
||||
@@ -53,10 +53,10 @@ void bli_gemm_blk_var2
|
||||
dim_t my_start, my_end;
|
||||
|
||||
// Determine the direction in which to partition (forwards or backwards).
|
||||
direct = bli_l3_direct( a, b, c, cntx );
|
||||
direct = bli_l3_direct( a, b, c, cntl );
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_l3_prune_unref_mparts_n( a, b, c, cntx );
|
||||
bli_l3_prune_unref_mparts_n( a, b, c, cntl );
|
||||
|
||||
// Determine the current thread's subpartition range.
|
||||
bli_thread_get_range_ndim
|
||||
|
||||
@@ -53,10 +53,10 @@ void bli_gemm_blk_var3
|
||||
dim_t k_trans;
|
||||
|
||||
// Determine the direction in which to partition (forwards or backwards).
|
||||
direct = bli_l3_direct( a, b, c, cntx );
|
||||
direct = bli_l3_direct( a, b, c, cntl );
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_l3_prune_unref_mparts_k( a, b, c, cntx );
|
||||
bli_l3_prune_unref_mparts_k( a, b, c, cntl );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
k_trans = bli_obj_width_after_trans( *a );
|
||||
@@ -66,7 +66,7 @@ void bli_gemm_blk_var3
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_l3_determine_kc( direct, i, k_trans, a, b,
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx, cntl );
|
||||
|
||||
// Acquire partitions for A1 and B1.
|
||||
bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
|
||||
@@ -109,7 +109,7 @@ void bli_gemm_blk_var3
|
||||
// row-panel of C, and thus beta is applied to all of C exactly once.
|
||||
// Thus, for neither trmm nor trmm3 should we reset the scalar on C
|
||||
// after the first iteration.
|
||||
if ( bli_cntx_get_family( cntx ) != BLIS_TRMM )
|
||||
if ( bli_cntl_family( cntl ) != BLIS_TRMM )
|
||||
if ( i == 0 ) bli_obj_scalar_reset( c );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,22 +56,24 @@ cntl_t* bli_gemmbp_cntl_create
|
||||
else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
|
||||
|
||||
// Create two nodes for the macro-kernel.
|
||||
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node
|
||||
(
|
||||
family, // the operation family
|
||||
BLIS_MR, // needed for bli_thrinfo_rgrow()
|
||||
NULL, // variant function pointer not used
|
||||
NULL // no sub-node; this is the leaf of the tree.
|
||||
);
|
||||
|
||||
cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
|
||||
macro_kernel_p,
|
||||
gemm_cntl_bu_ke
|
||||
);
|
||||
|
||||
// Create a node for packing matrix A.
|
||||
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
|
||||
(
|
||||
bli_gemm_packa, // pack the left-hand operand
|
||||
bli_packm_blk_var1,
|
||||
@@ -86,15 +88,16 @@ cntl_t* bli_gemmbp_cntl_create
|
||||
);
|
||||
|
||||
// Create a node for partitioning the m dimension by MC.
|
||||
cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_MC,
|
||||
bli_gemm_blk_var1,
|
||||
gemm_cntl_packa
|
||||
);
|
||||
|
||||
// Create a node for packing matrix B.
|
||||
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
|
||||
(
|
||||
bli_gemm_packb, // pack the right-hand operand
|
||||
bli_packm_blk_var1,
|
||||
@@ -109,16 +112,18 @@ cntl_t* bli_gemmbp_cntl_create
|
||||
);
|
||||
|
||||
// Create a node for partitioning the k dimension by KC.
|
||||
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_KC,
|
||||
bli_gemm_blk_var3,
|
||||
gemm_cntl_packb
|
||||
);
|
||||
|
||||
// Create a node for partitioning the n dimension by NC.
|
||||
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_NC,
|
||||
bli_gemm_blk_var2,
|
||||
gemm_cntl_mm_op
|
||||
@@ -141,15 +146,17 @@ cntl_t* bli_gemmpb_cntl_create
|
||||
//else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
|
||||
|
||||
// Create two nodes for the macro-kernel.
|
||||
cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_create_node
|
||||
(
|
||||
family, // the operation family
|
||||
BLIS_MR, // needed for bli_thrinfo_rgrow()
|
||||
NULL, // variant function pointer not used
|
||||
NULL // no sub-node; this is the leaf of the tree.
|
||||
);
|
||||
|
||||
cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
|
||||
macro_kernel_p,
|
||||
gemm_cntl_ub_ke
|
||||
@@ -157,7 +164,7 @@ cntl_t* bli_gemmpb_cntl_create
|
||||
|
||||
// Create a node for packing matrix A (which is really the right-hand
|
||||
// operand "B").
|
||||
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
|
||||
(
|
||||
bli_gemm_packb, // pack the right-hand operand
|
||||
bli_packm_blk_var1,
|
||||
@@ -172,8 +179,9 @@ cntl_t* bli_gemmpb_cntl_create
|
||||
);
|
||||
|
||||
// Create a node for partitioning the n dimension by MC.
|
||||
cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_MC,
|
||||
bli_gemm_blk_var2,
|
||||
gemm_cntl_packb
|
||||
@@ -181,7 +189,7 @@ cntl_t* bli_gemmpb_cntl_create
|
||||
|
||||
// Create a node for packing matrix B (which is really the left-hand
|
||||
// operand "A").
|
||||
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
|
||||
(
|
||||
bli_gemm_packa, // pack the left-hand operand
|
||||
bli_packm_blk_var1,
|
||||
@@ -196,16 +204,18 @@ cntl_t* bli_gemmpb_cntl_create
|
||||
);
|
||||
|
||||
// Create a node for partitioning the k dimension by KC.
|
||||
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_KC,
|
||||
bli_gemm_blk_var3,
|
||||
gemm_cntl_packa
|
||||
);
|
||||
|
||||
// Create a node for partitioning the m dimension by NC.
|
||||
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create
|
||||
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_NC,
|
||||
bli_gemm_blk_var1,
|
||||
gemm_cntl_mm_op
|
||||
@@ -227,13 +237,14 @@ void bli_gemm_cntl_free
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_gemm_cntl_obj_create
|
||||
cntl_t* bli_gemm_cntl_create_node
|
||||
(
|
||||
opid_t family,
|
||||
bszid_t bszid,
|
||||
void* var_func,
|
||||
cntl_t* sub_node
|
||||
)
|
||||
{
|
||||
return bli_cntl_obj_create( bszid, var_func, NULL, sub_node );
|
||||
return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node );
|
||||
}
|
||||
|
||||
|
||||
@@ -59,8 +59,9 @@ void bli_gemm_cntl_free
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_gemm_cntl_obj_create
|
||||
cntl_t* bli_gemm_cntl_create_node
|
||||
(
|
||||
opid_t family,
|
||||
bszid_t bszid,
|
||||
void* var_func,
|
||||
cntl_t* sub_node
|
||||
|
||||
@@ -46,70 +46,68 @@ void bli_gemm_front
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
|
||||
#ifdef BLIS_SMALL_MATRIX_ENABLE
|
||||
gint_t status = bli_gemm_small_matrix(alpha, a, b, beta, c, cntx, cntl);
|
||||
if(BLIS_SUCCESS != status)
|
||||
#endif
|
||||
{
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// If alpha is zero, scale by beta and return.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
// If alpha is zero, scale by beta and return.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// Reinitialize the memory allocator to accommodate the blocksizes
|
||||
// in the current context.
|
||||
bli_memsys_reinit( cntx );
|
||||
// Reinitialize the memory allocator to accommodate the blocksizes
|
||||
// in the current context.
|
||||
bli_memsys_reinit( cntx );
|
||||
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( *a, a_local );
|
||||
bli_obj_alias_to( *b, b_local );
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( *a, a_local );
|
||||
bli_obj_alias_to( *b, b_local );
|
||||
bli_obj_alias_to( *c, c_local );
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( a_local, b_local );
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_ukr_eff_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( a_local, b_local );
|
||||
|
||||
bli_obj_induce_trans( a_local );
|
||||
bli_obj_induce_trans( b_local );
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
bli_obj_induce_trans( a_local );
|
||||
bli_obj_induce_trans( b_local );
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Set the operation family id in the context.
|
||||
bli_cntx_set_family( BLIS_GEMM, cntx );
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
bli_obj_width( c_local ),
|
||||
bli_obj_width( a_local ) );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
bli_obj_width( c_local ),
|
||||
bli_obj_width( a_local ) );
|
||||
|
||||
// Invoke the internal back-end via the thread handler.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntx,
|
||||
cntl
|
||||
);
|
||||
// Invoke the internal back-end via the thread handler.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_GEMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntx,
|
||||
cntl
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -89,9 +89,6 @@ void bli_hemm_front
|
||||
bli_obj_swap( a_local, b_local );
|
||||
}
|
||||
|
||||
// Set the operation family id in the context.
|
||||
bli_cntx_set_family( BLIS_GEMM, cntx );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
@@ -102,6 +99,7 @@ void bli_hemm_front
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_GEMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
|
||||
@@ -107,9 +107,6 @@ void bli_her2k_front
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Set the operation family id in the context.
|
||||
bli_cntx_set_family( BLIS_HERK, cntx );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
@@ -122,6 +119,7 @@ void bli_her2k_front
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_HERK, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&bh_local,
|
||||
@@ -134,6 +132,7 @@ void bli_her2k_front
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_HERK, // operation family id
|
||||
&alpha_conj,
|
||||
&b_local,
|
||||
&ah_local,
|
||||
|
||||
@@ -87,9 +87,6 @@ void bli_herk_front
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Set the operation family id in the context.
|
||||
bli_cntx_set_family( BLIS_HERK, cntx );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
@@ -100,6 +97,7 @@ void bli_herk_front
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_HERK, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&ah_local,
|
||||
|
||||
@@ -88,9 +88,6 @@ void bli_symm_front
|
||||
bli_obj_swap( a_local, b_local );
|
||||
}
|
||||
|
||||
// Set the operation family id in the context.
|
||||
bli_cntx_set_family( BLIS_GEMM, cntx );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
@@ -101,6 +98,7 @@ void bli_symm_front
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_GEMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
|
||||
@@ -88,9 +88,6 @@ void bli_syr2k_front
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Set the operation family id in the context.
|
||||
bli_cntx_set_family( BLIS_HERK, cntx );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
@@ -103,6 +100,7 @@ void bli_syr2k_front
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_HERK, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&bt_local,
|
||||
@@ -115,6 +113,7 @@ void bli_syr2k_front
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_HERK, // operation family id
|
||||
alpha,
|
||||
&b_local,
|
||||
&at_local,
|
||||
|
||||
@@ -81,9 +81,6 @@ void bli_syrk_front
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Set the operation family id in the context.
|
||||
bli_cntx_set_family( BLIS_HERK, cntx );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
@@ -94,6 +91,7 @@ void bli_syrk_front
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_HERK, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&at_local,
|
||||
|
||||
@@ -131,9 +131,6 @@ void bli_trmm_front
|
||||
bli_obj_set_as_root( b_local );
|
||||
bli_obj_set_as_root( c_local );
|
||||
|
||||
// Set the operation family id in the context.
|
||||
bli_cntx_set_family( BLIS_TRMM, cntx );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
@@ -144,6 +141,7 @@ void bli_trmm_front
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_TRMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
|
||||
@@ -130,9 +130,6 @@ void bli_trmm3_front
|
||||
bli_obj_set_as_root( b_local );
|
||||
bli_obj_set_as_root( c_local );
|
||||
|
||||
// Set the operation family id in the context.
|
||||
bli_cntx_set_family( BLIS_TRMM, cntx );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
@@ -143,6 +140,7 @@ void bli_trmm3_front
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_TRMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
|
||||
@@ -53,10 +53,10 @@ void bli_trsm_blk_var1
|
||||
dim_t my_start, my_end;
|
||||
|
||||
// Determine the direction in which to partition (forwards or backwards).
|
||||
direct = bli_l3_direct( a, b, c, cntx );
|
||||
direct = bli_l3_direct( a, b, c, cntl );
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_l3_prune_unref_mparts_m( a, b, c, cntx );
|
||||
bli_l3_prune_unref_mparts_m( a, b, c, cntl );
|
||||
|
||||
// Determine the current thread's subpartition range.
|
||||
bli_thread_get_range_mdim
|
||||
|
||||
@@ -53,10 +53,10 @@ void bli_trsm_blk_var2
|
||||
dim_t my_start, my_end;
|
||||
|
||||
// Determine the direction in which to partition (forwards or backwards).
|
||||
direct = bli_l3_direct( a, b, c, cntx );
|
||||
direct = bli_l3_direct( a, b, c, cntl );
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_l3_prune_unref_mparts_n( a, b, c, cntx );
|
||||
bli_l3_prune_unref_mparts_n( a, b, c, cntl );
|
||||
|
||||
// Determine the current thread's subpartition range.
|
||||
bli_thread_get_range_ndim
|
||||
|
||||
@@ -53,10 +53,10 @@ void bli_trsm_blk_var3
|
||||
dim_t k_trans;
|
||||
|
||||
// Determine the direction in which to partition (forwards or backwards).
|
||||
direct = bli_l3_direct( a, b, c, cntx );
|
||||
direct = bli_l3_direct( a, b, c, cntl );
|
||||
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_l3_prune_unref_mparts_k( a, b, c, cntx );
|
||||
bli_l3_prune_unref_mparts_k( a, b, c, cntl );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
k_trans = bli_obj_width_after_trans( *a );
|
||||
|
||||
@@ -50,23 +50,27 @@ cntl_t* bli_trsm_l_cntl_create
|
||||
{
|
||||
void* macro_kernel_p = bli_trsm_xx_ker_var2;
|
||||
|
||||
const opid_t family = BLIS_TRSM;
|
||||
|
||||
// Create two nodes for the macro-kernel.
|
||||
cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node
|
||||
(
|
||||
family, // the operation family
|
||||
BLIS_MR, // needed for bli_thrinfo_rgrow()
|
||||
NULL, // variant function pointer not used
|
||||
NULL // no sub-node; this is the leaf of the tree.
|
||||
);
|
||||
|
||||
cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
|
||||
macro_kernel_p,
|
||||
trsm_cntl_bu_ke
|
||||
);
|
||||
|
||||
// Create a node for packing matrix A.
|
||||
cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
|
||||
(
|
||||
bli_trsm_packa,
|
||||
bli_packm_blk_var1,
|
||||
@@ -81,15 +85,16 @@ cntl_t* bli_trsm_l_cntl_create
|
||||
);
|
||||
|
||||
// Create a node for partitioning the m dimension by MC.
|
||||
cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_MC,
|
||||
bli_trsm_blk_var1,
|
||||
trsm_cntl_packa
|
||||
);
|
||||
|
||||
// Create a node for packing matrix B.
|
||||
cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
|
||||
(
|
||||
bli_trsm_packb,
|
||||
bli_packm_blk_var1,
|
||||
@@ -104,16 +109,18 @@ cntl_t* bli_trsm_l_cntl_create
|
||||
);
|
||||
|
||||
// Create a node for partitioning the k dimension by KC.
|
||||
cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_KC,
|
||||
bli_trsm_blk_var3,
|
||||
trsm_cntl_packb
|
||||
);
|
||||
|
||||
// Create a node for partitioning the n dimension by NC.
|
||||
cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_NC,
|
||||
bli_trsm_blk_var2,
|
||||
trsm_cntl_mm_op
|
||||
@@ -129,23 +136,27 @@ cntl_t* bli_trsm_r_cntl_create
|
||||
{
|
||||
void* macro_kernel_p = bli_trsm_xx_ker_var2;
|
||||
|
||||
const opid_t family = BLIS_TRSM;
|
||||
|
||||
// Create two nodes for the macro-kernel.
|
||||
cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_MR, // needed for bli_thrinfo_rgrow()
|
||||
NULL, // variant function pointer not used
|
||||
NULL // no sub-node; this is the leaf of the tree.
|
||||
);
|
||||
|
||||
cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
|
||||
macro_kernel_p,
|
||||
trsm_cntl_bu_ke
|
||||
);
|
||||
|
||||
// Create a node for packing matrix A.
|
||||
cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
|
||||
(
|
||||
bli_trsm_packa,
|
||||
bli_packm_blk_var1,
|
||||
@@ -160,15 +171,16 @@ cntl_t* bli_trsm_r_cntl_create
|
||||
);
|
||||
|
||||
// Create a node for partitioning the m dimension by MC.
|
||||
cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_MC,
|
||||
bli_trsm_blk_var1,
|
||||
trsm_cntl_packa
|
||||
);
|
||||
|
||||
// Create a node for packing matrix B.
|
||||
cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
|
||||
(
|
||||
bli_trsm_packb,
|
||||
bli_packm_blk_var1,
|
||||
@@ -183,16 +195,18 @@ cntl_t* bli_trsm_r_cntl_create
|
||||
);
|
||||
|
||||
// Create a node for partitioning the k dimension by KC.
|
||||
cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_KC,
|
||||
bli_trsm_blk_var3,
|
||||
trsm_cntl_packb
|
||||
);
|
||||
|
||||
// Create a node for partitioning the n dimension by NC.
|
||||
cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create
|
||||
cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node
|
||||
(
|
||||
family,
|
||||
BLIS_NC,
|
||||
bli_trsm_blk_var2,
|
||||
trsm_cntl_mm_op
|
||||
@@ -212,13 +226,14 @@ void bli_trsm_cntl_free
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_trsm_cntl_obj_create
|
||||
cntl_t* bli_trsm_cntl_create_node
|
||||
(
|
||||
opid_t family,
|
||||
bszid_t bszid,
|
||||
void* var_func,
|
||||
cntl_t* sub_node
|
||||
)
|
||||
{
|
||||
return bli_cntl_obj_create( bszid, var_func, NULL, sub_node );
|
||||
return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node );
|
||||
}
|
||||
|
||||
|
||||
@@ -55,8 +55,9 @@ void bli_trsm_cntl_free
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
cntl_t* bli_trsm_cntl_obj_create
|
||||
cntl_t* bli_trsm_cntl_create_node
|
||||
(
|
||||
opid_t family,
|
||||
bszid_t bszid,
|
||||
void* var_func,
|
||||
cntl_t* sub_node
|
||||
|
||||
@@ -122,9 +122,6 @@ void bli_trsm_front
|
||||
bli_obj_set_as_root( b_local );
|
||||
bli_obj_set_as_root( c_local );
|
||||
|
||||
// Set the operation family id in the context.
|
||||
bli_cntx_set_family( BLIS_TRSM, cntx );
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
@@ -135,6 +132,7 @@ void bli_trsm_front
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_trsm_int,
|
||||
BLIS_TRSM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
|
||||
@@ -64,7 +64,7 @@ void bli_trsm_cntl_init()
|
||||
// Create control tree objects for packm operations (left side).
|
||||
trsm_l_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
bli_packm_cntl_create_node( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: n dim multiple must be mr to
|
||||
// support right and bottom-right edge cases
|
||||
@@ -78,7 +78,7 @@ void bli_trsm_cntl_init()
|
||||
|
||||
trsm_l_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
bli_packm_cntl_create_node( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: m dim multiple must be mr since
|
||||
// B_pack is updated (ie: serves as C) in trsm
|
||||
@@ -93,7 +93,7 @@ void bli_trsm_cntl_init()
|
||||
// Create control tree objects for packm operations (right side).
|
||||
trsm_r_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
bli_packm_cntl_create_node( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
BLIS_NR,
|
||||
BLIS_MR,
|
||||
@@ -105,7 +105,7 @@ void bli_trsm_cntl_init()
|
||||
|
||||
trsm_r_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
bli_packm_cntl_create_node( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1, // pack panels of B compactly
|
||||
BLIS_MR,
|
||||
BLIS_MR,
|
||||
@@ -119,7 +119,7 @@ void bli_trsm_cntl_init()
|
||||
// Create control tree object for lowest-level block-panel kernel.
|
||||
trsm_cntl_bp_ke
|
||||
=
|
||||
bli_trsm_cntl_obj_create( BLIS_UNB_OPT,
|
||||
bli_trsm_cntl_create_node( BLIS_UNB_OPT,
|
||||
BLIS_VARIANT2,
|
||||
0, // bszid_t not used by macro-kernel
|
||||
NULL, NULL, NULL, NULL,
|
||||
@@ -129,7 +129,7 @@ void bli_trsm_cntl_init()
|
||||
// problem (left side).
|
||||
trsm_l_cntl_op_bp
|
||||
=
|
||||
bli_trsm_cntl_obj_create( BLIS_BLOCKED,
|
||||
bli_trsm_cntl_create_node( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
BLIS_MC,
|
||||
NULL,
|
||||
@@ -144,7 +144,7 @@ void bli_trsm_cntl_init()
|
||||
// rank-k (outer panel) updates (left side).
|
||||
trsm_l_cntl_mm_op
|
||||
=
|
||||
bli_trsm_cntl_obj_create( BLIS_BLOCKED,
|
||||
bli_trsm_cntl_create_node( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_KC,
|
||||
NULL,
|
||||
@@ -159,7 +159,7 @@ void bli_trsm_cntl_init()
|
||||
// general problems (left side).
|
||||
trsm_l_cntl_vl_mm
|
||||
=
|
||||
bli_trsm_cntl_obj_create( BLIS_BLOCKED,
|
||||
bli_trsm_cntl_create_node( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_NC,
|
||||
NULL,
|
||||
@@ -174,7 +174,7 @@ void bli_trsm_cntl_init()
|
||||
// problem (right side).
|
||||
trsm_r_cntl_op_bp
|
||||
=
|
||||
bli_trsm_cntl_obj_create( BLIS_BLOCKED,
|
||||
bli_trsm_cntl_create_node( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
BLIS_MC,
|
||||
NULL,
|
||||
@@ -189,7 +189,7 @@ void bli_trsm_cntl_init()
|
||||
// rank-k (outer panel) updates (right side).
|
||||
trsm_r_cntl_mm_op
|
||||
=
|
||||
bli_trsm_cntl_obj_create( BLIS_BLOCKED,
|
||||
bli_trsm_cntl_create_node( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
BLIS_KC,
|
||||
NULL,
|
||||
@@ -204,7 +204,7 @@ void bli_trsm_cntl_init()
|
||||
// general problems (right side).
|
||||
trsm_r_cntl_vl_mm
|
||||
=
|
||||
bli_trsm_cntl_obj_create( BLIS_BLOCKED,
|
||||
bli_trsm_cntl_create_node( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
BLIS_NC,
|
||||
NULL,
|
||||
@@ -222,22 +222,22 @@ void bli_trsm_cntl_init()
|
||||
|
||||
void bli_trsm_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( trsm_l_packa_cntl );
|
||||
bli_cntl_obj_free( trsm_l_packb_cntl );
|
||||
bli_cntl_obj_free( trsm_r_packa_cntl );
|
||||
bli_cntl_obj_free( trsm_r_packb_cntl );
|
||||
bli_cntl_free_node( trsm_l_packa_cntl );
|
||||
bli_cntl_free_node( trsm_l_packb_cntl );
|
||||
bli_cntl_free_node( trsm_r_packa_cntl );
|
||||
bli_cntl_free_node( trsm_r_packb_cntl );
|
||||
|
||||
bli_cntl_obj_free( trsm_cntl_bp_ke );
|
||||
bli_cntl_free_node( trsm_cntl_bp_ke );
|
||||
|
||||
bli_cntl_obj_free( trsm_l_cntl_op_bp );
|
||||
bli_cntl_obj_free( trsm_l_cntl_mm_op );
|
||||
bli_cntl_obj_free( trsm_l_cntl_vl_mm );
|
||||
bli_cntl_obj_free( trsm_r_cntl_op_bp );
|
||||
bli_cntl_obj_free( trsm_r_cntl_mm_op );
|
||||
bli_cntl_obj_free( trsm_r_cntl_vl_mm );
|
||||
bli_cntl_free_node( trsm_l_cntl_op_bp );
|
||||
bli_cntl_free_node( trsm_l_cntl_mm_op );
|
||||
bli_cntl_free_node( trsm_l_cntl_vl_mm );
|
||||
bli_cntl_free_node( trsm_r_cntl_op_bp );
|
||||
bli_cntl_free_node( trsm_r_cntl_mm_op );
|
||||
bli_cntl_free_node( trsm_r_cntl_vl_mm );
|
||||
}
|
||||
|
||||
trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type,
|
||||
trsm_t* bli_trsm_cntl_create_node( impl_t impl_type,
|
||||
varnum_t var_num,
|
||||
bszid_t bszid,
|
||||
scalm_t* sub_scalm,
|
||||
|
||||
@@ -51,7 +51,7 @@ typedef struct trsm_s trsm_t;
|
||||
|
||||
void bli_trsm_cntl_init( void );
|
||||
void bli_trsm_cntl_finalize( void );
|
||||
trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type,
|
||||
trsm_t* bli_trsm_cntl_create_node( impl_t impl_type,
|
||||
varnum_t var_num,
|
||||
bszid_t bszid,
|
||||
scalm_t* sub_scalm,
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
blksz_t* bli_blksz_obj_create
|
||||
blksz_t* bli_blksz_create_ed
|
||||
(
|
||||
dim_t b_s, dim_t be_s,
|
||||
dim_t b_d, dim_t be_d,
|
||||
@@ -47,16 +47,39 @@ blksz_t* bli_blksz_obj_create
|
||||
|
||||
b = ( blksz_t* ) bli_malloc_intl( sizeof(blksz_t) );
|
||||
|
||||
bli_blksz_obj_init( b,
|
||||
b_s, be_s,
|
||||
b_d, be_d,
|
||||
b_c, be_c,
|
||||
b_z, be_z );
|
||||
bli_blksz_init_ed
|
||||
(
|
||||
b,
|
||||
b_s, be_s,
|
||||
b_d, be_d,
|
||||
b_c, be_c,
|
||||
b_z, be_z
|
||||
);
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
void bli_blksz_obj_init
|
||||
blksz_t* bli_blksz_create
|
||||
(
|
||||
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z,
|
||||
dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
|
||||
)
|
||||
{
|
||||
blksz_t* b;
|
||||
|
||||
b = ( blksz_t* ) bli_malloc_intl( sizeof(blksz_t) );
|
||||
|
||||
bli_blksz_init
|
||||
(
|
||||
b,
|
||||
b_s, b_d, b_c, b_z,
|
||||
be_s, be_d, be_c, be_z
|
||||
);
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
void bli_blksz_init_ed
|
||||
(
|
||||
blksz_t* b,
|
||||
dim_t b_s, dim_t be_s,
|
||||
@@ -75,7 +98,45 @@ void bli_blksz_obj_init
|
||||
b->e[BLIS_DCOMPLEX] = be_z;
|
||||
}
|
||||
|
||||
void bli_blksz_obj_free
|
||||
void bli_blksz_init
|
||||
(
|
||||
blksz_t* b,
|
||||
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z,
|
||||
dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
|
||||
)
|
||||
{
|
||||
b->v[BLIS_FLOAT] = b_s;
|
||||
b->v[BLIS_DOUBLE] = b_d;
|
||||
b->v[BLIS_SCOMPLEX] = b_c;
|
||||
b->v[BLIS_DCOMPLEX] = b_z;
|
||||
|
||||
// Interpret a zero as a request for the default value.
|
||||
b->e[BLIS_FLOAT] = ( be_s == 0 ? b_s : be_s );
|
||||
b->e[BLIS_DOUBLE] = ( be_d == 0 ? b_d : be_d );
|
||||
b->e[BLIS_SCOMPLEX] = ( be_c == 0 ? b_c : be_c );
|
||||
b->e[BLIS_DCOMPLEX] = ( be_z == 0 ? b_z : be_z );
|
||||
}
|
||||
|
||||
void bli_blksz_init_easy
|
||||
(
|
||||
blksz_t* b,
|
||||
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z
|
||||
)
|
||||
{
|
||||
b->v[BLIS_FLOAT] = b_s;
|
||||
b->v[BLIS_DOUBLE] = b_d;
|
||||
b->v[BLIS_SCOMPLEX] = b_c;
|
||||
b->v[BLIS_DCOMPLEX] = b_z;
|
||||
|
||||
// Here we assume the maximum blocksize values can be the same as the
|
||||
// default values.
|
||||
b->e[BLIS_FLOAT] = b_s;
|
||||
b->e[BLIS_DOUBLE] = b_d;
|
||||
b->e[BLIS_SCOMPLEX] = b_c;
|
||||
b->e[BLIS_DCOMPLEX] = b_z;
|
||||
}
|
||||
|
||||
void bli_blksz_free
|
||||
(
|
||||
blksz_t* b
|
||||
)
|
||||
@@ -302,6 +363,11 @@ dim_t bli_determine_blocksize_b_sub
|
||||
// chunk that will correspond to the blocksize we are computing now.
|
||||
dim_left_now = dim - i;
|
||||
|
||||
// Sanity check: if dim_left_now is zero, then we can return zero
|
||||
// without going any further.
|
||||
if ( dim_left_now == 0 )
|
||||
return 0;
|
||||
|
||||
dim_at_edge = dim_left_now % b_alg;
|
||||
|
||||
// If dim_left_now is a multiple of b_alg, we can safely return b_alg
|
||||
|
||||
@@ -50,15 +50,6 @@
|
||||
*(max) = bli_blksz_get_max( dt, b ); \
|
||||
}
|
||||
|
||||
#define bli_blksz_get_def_for_obj( obj, b ) \
|
||||
\
|
||||
bli_blksz_get_def( bli_obj_datatype( *(obj) ), b )
|
||||
|
||||
#define bli_blksz_get_max_for_obj( obj, b ) \
|
||||
\
|
||||
bli_blksz_get_max( bli_obj_datatype( *(obj) ), b )
|
||||
|
||||
|
||||
// blksz_t modification
|
||||
|
||||
#define bli_blksz_set_def( val, dt, b ) \
|
||||
@@ -85,8 +76,11 @@
|
||||
// Copy the blocksize values stored for datatype dt_src in b_src into the
// entries for datatype dt_dst in b_dst. Both source values are read into
// temporaries before either destination entry is written, and each entry
// is written once, through the set accessors.
// NOTE(review): assumes bli_blksz_get_def/_get_max and
// bli_blksz_set_def/_set_max access the v[] and e[] fields respectively;
// their definitions are not visible here -- confirm.
#define bli_blksz_copy_dt( dt_src, b_src, \
                           dt_dst, b_dst ) \
{ \
	const dim_t v_src = bli_blksz_get_def( dt_src, b_src ); \
	const dim_t e_src = bli_blksz_get_max( dt_src, b_src ); \
\
	bli_blksz_set_def( v_src, dt_dst, b_dst ); \
	bli_blksz_set_max( e_src, dt_dst, b_dst ); \
}
|
||||
|
||||
#define bli_blksz_scale_def( num, den, dt, b ) \
|
||||
@@ -109,7 +103,7 @@
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
blksz_t* bli_blksz_obj_create
|
||||
blksz_t* bli_blksz_create_ed
|
||||
(
|
||||
dim_t b_s, dim_t be_s,
|
||||
dim_t b_d, dim_t be_d,
|
||||
@@ -117,7 +111,13 @@ blksz_t* bli_blksz_obj_create
|
||||
dim_t b_z, dim_t be_z
|
||||
);
|
||||
|
||||
void bli_blksz_obj_init
|
||||
blksz_t* bli_blksz_create
|
||||
(
|
||||
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z,
|
||||
dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
|
||||
);
|
||||
|
||||
void bli_blksz_init_ed
|
||||
(
|
||||
blksz_t* b,
|
||||
dim_t b_s, dim_t be_s,
|
||||
@@ -126,7 +126,20 @@ void bli_blksz_obj_init
|
||||
dim_t b_z, dim_t be_z
|
||||
);
|
||||
|
||||
void bli_blksz_obj_free
|
||||
void bli_blksz_init
|
||||
(
|
||||
blksz_t* b,
|
||||
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z,
|
||||
dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
|
||||
);
|
||||
|
||||
void bli_blksz_init_easy
|
||||
(
|
||||
blksz_t* b,
|
||||
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z
|
||||
);
|
||||
|
||||
void bli_blksz_free
|
||||
(
|
||||
blksz_t* b
|
||||
);
|
||||
|
||||
@@ -34,8 +34,9 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
cntl_t* bli_cntl_obj_create
|
||||
cntl_t* bli_cntl_create_node
|
||||
(
|
||||
opid_t family,
|
||||
bszid_t bszid,
|
||||
void* var_func,
|
||||
void* params,
|
||||
@@ -48,6 +49,7 @@ cntl_t* bli_cntl_obj_create
|
||||
// Allocate the cntl_t struct.
|
||||
cntl = bli_malloc_intl( sizeof( cntl_t ) );
|
||||
|
||||
bli_cntl_set_family( family, cntl );
|
||||
bli_cntl_set_bszid( bszid, cntl );
|
||||
bli_cntl_set_var_func( var_func, cntl );
|
||||
bli_cntl_set_params( params, cntl );
|
||||
@@ -63,7 +65,7 @@ cntl_t* bli_cntl_obj_create
|
||||
return cntl;
|
||||
}
|
||||
|
||||
void bli_cntl_obj_free
|
||||
void bli_cntl_free_node
|
||||
(
|
||||
cntl_t* cntl
|
||||
)
|
||||
@@ -71,7 +73,7 @@ void bli_cntl_obj_free
|
||||
bli_free_intl( cntl );
|
||||
}
|
||||
|
||||
void bli_cntl_obj_clear
|
||||
void bli_cntl_clear_node
|
||||
(
|
||||
cntl_t* cntl
|
||||
)
|
||||
@@ -141,7 +143,7 @@ void bli_cntl_free_w_thrinfo
|
||||
}
|
||||
|
||||
// Free the current node.
|
||||
bli_cntl_obj_free( cntl );
|
||||
bli_cntl_free_node( cntl );
|
||||
}
|
||||
|
||||
void bli_cntl_free_wo_thrinfo
|
||||
@@ -177,7 +179,7 @@ void bli_cntl_free_wo_thrinfo
|
||||
}
|
||||
|
||||
// Free the current node.
|
||||
bli_cntl_obj_free( cntl );
|
||||
bli_cntl_free_node( cntl );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
@@ -189,10 +191,11 @@ cntl_t* bli_cntl_copy
|
||||
{
|
||||
// Make a copy of the current node. Notice that the source node
|
||||
// should NOT have any allocated/cached mem_t entries, and that
|
||||
// bli_cntl_obj_create() creates a node with a cleared mem_t
|
||||
// bli_cntl_create_node() creates a node with a cleared mem_t
|
||||
// field.
|
||||
cntl_t* cntl_copy = bli_cntl_obj_create
|
||||
cntl_t* cntl_copy = bli_cntl_create_node
|
||||
(
|
||||
bli_cntl_family( cntl ),
|
||||
bli_cntl_bszid( cntl ),
|
||||
bli_cntl_var_func( cntl ),
|
||||
NULL, NULL
|
||||
@@ -234,3 +237,23 @@ cntl_t* bli_cntl_copy
|
||||
return cntl_copy;
|
||||
}
|
||||
|
||||
void bli_cntl_mark_family
|
||||
(
|
||||
opid_t family,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
// Set the family of the root node.
|
||||
bli_cntl_set_family( family, cntl );
|
||||
|
||||
// Continue as long as the current node has a valid child.
|
||||
while ( bli_cntl_sub_node( cntl ) != NULL )
|
||||
{
|
||||
// Move down the tree to the child node.
|
||||
cntl = bli_cntl_sub_node( cntl );
|
||||
|
||||
// Set the family of the current node.
|
||||
bli_cntl_set_family( family, cntl );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@
|
||||
struct cntl_s
|
||||
{
|
||||
// Basic fields (usually required).
|
||||
opid_t family;
|
||||
bszid_t bszid;
|
||||
void* var_func;
|
||||
struct cntl_s* sub_node;
|
||||
@@ -57,20 +58,21 @@ typedef struct cntl_s cntl_t;
|
||||
|
||||
// -- Control tree prototypes --
|
||||
|
||||
cntl_t* bli_cntl_obj_create
|
||||
cntl_t* bli_cntl_create_node
|
||||
(
|
||||
opid_t family,
|
||||
bszid_t bszid,
|
||||
void* var_func,
|
||||
void* params,
|
||||
cntl_t* sub_node
|
||||
);
|
||||
|
||||
void bli_cntl_obj_free
|
||||
void bli_cntl_free_node
|
||||
(
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
void bli_cntl_obj_clear
|
||||
void bli_cntl_clear_node
|
||||
(
|
||||
cntl_t* cntl
|
||||
);
|
||||
@@ -99,10 +101,20 @@ cntl_t* bli_cntl_copy
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
void bli_cntl_mark_family
|
||||
(
|
||||
opid_t family,
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// cntl_t query (fields only)
|
||||
|
||||
// Read the family field of a control tree node. The argument is
// parenthesized so that a pointer expression (e.g. p + 1) can be passed,
// not only a plain identifier.
#define bli_cntl_family( cntl ) \
\
	( (cntl)->family )
|
||||
|
||||
// Read the bszid field of a control tree node. The argument is
// parenthesized so that a pointer expression (e.g. p + 1) can be passed,
// not only a plain identifier.
#define bli_cntl_bszid( cntl ) \
\
	( (cntl)->bszid )
|
||||
@@ -139,6 +151,11 @@ cntl_t* bli_cntl_copy
|
||||
|
||||
// cntl_t modification
|
||||
|
||||
// Store family0 in the family field of a control tree node. Both macro
// arguments are parenthesized so that arbitrary expressions may be passed.
#define bli_cntl_set_family( family0, cntl ) \
{ \
	(cntl)->family = (family0); \
}
|
||||
|
||||
#define bli_cntl_set_bszid( bszid0, cntl ) \
|
||||
{ \
|
||||
cntl->bszid = bszid0; \
|
||||
|
||||
@@ -39,14 +39,14 @@
|
||||
// NOTE: Since these functions currently do nothing, they are defined
|
||||
// as empty macros in bli_cntx.
|
||||
//
|
||||
void bli_cntx_obj_create( cntx_t* cntx )
|
||||
void bli_cntx_create( cntx_t* cntx )
{
	// Intentionally empty: a cntx_t holds statically-allocated arrays,
	// so creating an instance requires no work here.
}
|
||||
|
||||
void bli_cntx_obj_free( cntx_t* cntx )
|
||||
void bli_cntx_free( cntx_t* cntx )
|
||||
{
|
||||
// Just as we don't need to do anything in order to create a
|
||||
// cntx_t instance, we don't need to do anything to destroy
|
||||
@@ -54,7 +54,7 @@ void bli_cntx_obj_free( cntx_t* cntx )
|
||||
}
|
||||
#endif
|
||||
|
||||
void bli_cntx_obj_clear( cntx_t* cntx )
|
||||
void bli_cntx_clear( cntx_t* cntx )
|
||||
{
|
||||
// Fill the entire cntx_t structure with zeros.
|
||||
memset( ( void* )cntx, 0, sizeof( cntx ) );
|
||||
@@ -108,8 +108,11 @@ void bli_cntx_init( cntx_t* cntx )
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
blksz_t* bli_cntx_get_blksz( bszid_t bs_id,
|
||||
cntx_t* cntx )
|
||||
blksz_t* bli_cntx_get_blksz
|
||||
(
|
||||
bszid_t bs_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
blksz_t* blkszs = bli_cntx_blkszs_buf( cntx );
|
||||
blksz_t* blksz = &blkszs[ bs_id ];
|
||||
@@ -142,8 +145,11 @@ dim_t bli_cntx_get_blksz_max_dt( num_t dt,
|
||||
}
|
||||
#endif
|
||||
|
||||
blksz_t* bli_cntx_get_bmult( bszid_t bs_id,
|
||||
cntx_t* cntx )
|
||||
blksz_t* bli_cntx_get_bmult
|
||||
(
|
||||
bszid_t bs_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
blksz_t* blkszs = bli_cntx_blkszs_buf( cntx );
|
||||
bszid_t* bmults = bli_cntx_bmults_buf( cntx );
|
||||
@@ -166,8 +172,11 @@ dim_t bli_cntx_get_bmult_dt( num_t dt,
|
||||
}
|
||||
#endif
|
||||
|
||||
func_t* bli_cntx_get_l3_ukr( l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
func_t* bli_cntx_get_l3_ukr
|
||||
(
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx );
|
||||
func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx );
|
||||
@@ -210,8 +219,11 @@ void* bli_cntx_get_l3_ukr_dt( num_t dt,
|
||||
}
|
||||
#endif
|
||||
|
||||
func_t* bli_cntx_get_l3_vir_ukr( l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
func_t* bli_cntx_get_l3_vir_ukr
|
||||
(
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx );
|
||||
func_t* l3_vir_ukr = &l3_vir_ukrs[ ukr_id ];
|
||||
@@ -235,8 +247,11 @@ void* bli_cntx_get_l3_vir_ukr_dt( num_t dt,
|
||||
}
|
||||
#endif
|
||||
|
||||
func_t* bli_cntx_get_l3_nat_ukr( l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
func_t* bli_cntx_get_l3_nat_ukr
|
||||
(
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx );
|
||||
func_t* l3_nat_ukr = &l3_nat_ukrs[ ukr_id ];
|
||||
@@ -260,8 +275,11 @@ void* bli_cntx_get_l3_nat_ukr_dt( num_t dt,
|
||||
}
|
||||
#endif
|
||||
|
||||
func_t* bli_cntx_get_l1f_ker( l1fkr_t ker_id,
|
||||
cntx_t* cntx )
|
||||
func_t* bli_cntx_get_l1f_ker
|
||||
(
|
||||
l1fkr_t ker_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
func_t* l1f_kers = bli_cntx_l1f_kers_buf( cntx );
|
||||
func_t* l1f_ker = &l1f_kers[ ker_id ];
|
||||
@@ -283,8 +301,11 @@ void* bli_cntx_get_l1f_ker_dt( num_t dt,
|
||||
}
|
||||
#endif
|
||||
|
||||
func_t* bli_cntx_get_l1v_ker( l1vkr_t ker_id,
|
||||
cntx_t* cntx )
|
||||
func_t* bli_cntx_get_l1v_ker
|
||||
(
|
||||
l1vkr_t ker_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
func_t* l1v_kers = bli_cntx_l1v_kers_buf( cntx );
|
||||
func_t* l1v_ker = &l1v_kers[ ker_id ];
|
||||
@@ -306,8 +327,11 @@ void* bli_cntx_get_l1v_ker_dt( num_t dt,
|
||||
}
|
||||
#endif
|
||||
|
||||
mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
mbool_t* bli_cntx_get_l3_nat_ukr_prefs
|
||||
(
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
mbool_t* l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );
|
||||
mbool_t* l3_nat_ukrs_pref = &l3_nat_ukrs_prefs[ ukr_id ];
|
||||
@@ -316,12 +340,30 @@ mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id,
|
||||
return l3_nat_ukrs_pref;
|
||||
}
|
||||
|
||||
func_t* bli_cntx_get_packm_ukr( cntx_t* cntx )
|
||||
func_t* bli_cntx_get_packm_ker
|
||||
(
|
||||
l1mkr_t ker_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
func_t* packm_ukrs = bli_cntx_packm_ukrs( cntx );
|
||||
func_t* packm_kers = bli_cntx_packm_kers_buf( cntx );
|
||||
func_t* packm_ker = &packm_kers[ ker_id ];
|
||||
|
||||
// Return the address of the func_t that contains the packm ukernels.
|
||||
return packm_ukrs;
|
||||
return packm_ker;
|
||||
}
|
||||
|
||||
func_t* bli_cntx_get_unpackm_ker
|
||||
(
|
||||
l1mkr_t ker_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
func_t* unpackm_kers = bli_cntx_unpackm_kers_buf( cntx );
|
||||
func_t* unpackm_ker = &unpackm_kers[ ker_id ];
|
||||
|
||||
// Return the address of the func_t that contains the unpackm ukernels.
|
||||
return unpackm_ker;
|
||||
}
|
||||
|
||||
#if 0
|
||||
@@ -360,7 +402,11 @@ dim_t bli_cntx_get_num_threads( cntx_t* cntx )
|
||||
bli_cntx_ir_way( cntx );
|
||||
}
|
||||
|
||||
dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl )
|
||||
dim_t bli_cntx_get_num_threads_in
|
||||
(
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
dim_t n_threads_in = 1;
|
||||
|
||||
@@ -384,14 +430,6 @@ dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl )
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
#if 1
|
||||
//
|
||||
// NOTE: This function is disabled because:
|
||||
// - we currently do not have any need to set a context direclty with
|
||||
// blksz_t objects
|
||||
// - it may be broken; it needs to be synced up with the corresponding
|
||||
// function in bli_gks.c.
|
||||
//
|
||||
void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
{
|
||||
/* Example prototypes:
|
||||
@@ -454,8 +492,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
// Here, we query the variable argument list for:
|
||||
// - the bszid_t of the blocksize we're about to process,
|
||||
// - the address of the blksz_t object, and
|
||||
// - the bszid_t of the multiple we need to associate with
|
||||
// the blksz_t object.
|
||||
// - the bszid_t of the multiple
|
||||
// that we need to associate with the blksz_t object.
|
||||
bszid_t bs_id = va_arg( args, bszid_t );
|
||||
blksz_t* blksz = va_arg( args, blksz_t* );
|
||||
bszid_t bm_id = va_arg( args, bszid_t );
|
||||
@@ -473,9 +511,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
{
|
||||
// Here, we query the variable argument list for:
|
||||
// - the bszid_t of the blocksize we're about to process,
|
||||
// - the address of the blksz_t object, and
|
||||
// - the bszid_t of the multiple we need to associate with
|
||||
// the blksz_t object.
|
||||
// - the address of the blksz_t object,
|
||||
// - the bszid_t of the multiple, and
|
||||
// - the scalars we wish to apply to the real blocksizes to
|
||||
// come up with the induced complex blocksizes (for default
|
||||
// and maximum blocksizes).
|
||||
@@ -536,6 +573,7 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
// location within the context's blksz_t array. Do the same
|
||||
// for the blocksize multiple id.
|
||||
//cntx_blkszs[ bs_id ] = *blksz;
|
||||
//bli_blksz_copy_smart( blksz, cntx_blksz );
|
||||
bli_blksz_copy( blksz, cntx_blksz );
|
||||
|
||||
// Copy the blocksize multiple id into the context.
|
||||
@@ -624,14 +662,16 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
|
||||
bli_free_intl( dsclrs );
|
||||
bli_free_intl( msclrs );
|
||||
}
|
||||
#endif
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_cntx_set_blksz( bszid_t bs_id,
|
||||
blksz_t* blksz,
|
||||
bszid_t mult_id,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_blksz
|
||||
(
|
||||
bszid_t bs_id,
|
||||
blksz_t* blksz,
|
||||
bszid_t mult_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
blksz_t* blkszs = bli_cntx_blkszs_buf( cntx );
|
||||
bszid_t* bmults = bli_cntx_bmults_buf( cntx );
|
||||
@@ -645,20 +685,111 @@ void bli_cntx_set_blksz( bszid_t bs_id,
|
||||
bmults[ bs_id ] = mult_id;
|
||||
}
|
||||
|
||||
void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx );
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// Copy the function object into the specified location within
|
||||
// the context's virtual level-3 ukernel array.
|
||||
l3_vir_ukrs[ ukr_id ] = *func;
|
||||
// Register n_ukrs level-3 native microkernels in a context.
//
// The variable arguments are n_ukrs tuples of
//   ( l3ukr_t id, num_t datatype, void* function pointer, bool_t preference )
// followed by one final cntx_t* argument. Because the context comes last,
// all tuples are first read into temporary arrays and only afterwards
// written into the context.
void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... )
{
	/* Example prototypes:

	   void bli_cntx_set_l3_nat_ukrs
	   (
	     dim_t   n_ukrs,
	     l3ukr_t ukr0_id, num_t dt0, void* ukr0_fp, bool_t pref0,
	     l3ukr_t ukr1_id, num_t dt1, void* ukr1_fp, bool_t pref1,
	     l3ukr_t ukr2_id, num_t dt2, void* ukr2_fp, bool_t pref2,
	     ...
	     cntx_t* cntx
	   );
	*/
	va_list args;
	dim_t   t;

	// Temporary per-tuple storage, released at the end of the function.
	l3ukr_t* ukr_ids   = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ) );
	num_t*   ukr_dts   = bli_malloc_intl( n_ukrs * sizeof( num_t   ) );
	void**   ukr_fps   = bli_malloc_intl( n_ukrs * sizeof( void*   ) );
	bool_t*  ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool_t  ) );

	// -- Begin variable argument section --

	va_start( args, n_ukrs );

	// Read each tuple in argument order: id, datatype, function pointer,
	// storage preference.
	// NOTE(review): va_arg requires each requested type to be compatible
	// with the (promoted) type the caller actually passed -- confirm at
	// the call sites, in particular for bool_t.
	for ( t = 0; t < n_ukrs; ++t )
	{
		ukr_ids[ t ]   = va_arg( args, l3ukr_t );
		ukr_dts[ t ]   = va_arg( args, num_t );
		ukr_fps[ t ]   = va_arg( args, void* );
		ukr_prefs[ t ] = va_arg( args, bool_t );
	}

	// The argument following the last tuple is the context pointer.
	cntx_t* cntx = va_arg( args, cntx_t* );

	va_end( args );

	// -- End variable argument section --

	// Destination arrays inside the context: the native level-3 ukernel
	// func_t array and the matching preference array.
	func_t*  cntx_l3_nat_ukrs       = bli_cntx_l3_nat_ukrs_buf( cntx );
	mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );

	// Write every recorded tuple into the context.
	for ( t = 0; t < n_ukrs; ++t )
	{
		const l3ukr_t ukr_id   = ukr_ids[ t ];
		const num_t   ukr_dt   = ukr_dts[ t ];
		void*         ukr_fp   = ukr_fps[ t ];
		const bool_t  ukr_pref = ukr_prefs[ t ];

		// Entries for this kernel id.
		func_t*  ukrs  = &cntx_l3_nat_ukrs[ ukr_id ];
		mbool_t* prefs = &cntx_l3_nat_ukrs_prefs[ ukr_id ];

		// Store the function pointer and the preference for datatype
		// ukr_dt.
		bli_func_set_dt( ukr_fp, ukr_dt, ukrs );
		bli_mbool_set_dt( ukr_pref, ukr_dt, prefs );
	}

	// Release the temporary arrays.
	bli_free_intl( ukr_ids );
	bli_free_intl( ukr_dts );
	bli_free_intl( ukr_fps );
	bli_free_intl( ukr_prefs );
}
|
||||
|
||||
void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx )
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_cntx_set_l3_nat_ukr
|
||||
(
|
||||
l3ukr_t ukr_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx );
|
||||
|
||||
@@ -667,9 +798,12 @@ void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id,
|
||||
l3_nat_ukrs[ ukr_id ] = *func;
|
||||
}
|
||||
|
||||
void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id,
|
||||
mbool_t* prefs,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_l3_nat_ukr_prefs
|
||||
(
|
||||
l3ukr_t ukr_id,
|
||||
mbool_t* prefs,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
mbool_t* l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );
|
||||
|
||||
@@ -678,9 +812,26 @@ void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id,
|
||||
l3_nat_ukrs_prefs[ ukr_id ] = *prefs;
|
||||
}
|
||||
|
||||
void bli_cntx_set_l1f_ker( l1fkr_t ker_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_l3_vir_ukr
|
||||
(
|
||||
l3ukr_t ukr_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx );
|
||||
|
||||
// Copy the function object into the specified location within
|
||||
// the context's virtual level-3 ukernel array.
|
||||
l3_vir_ukrs[ ukr_id ] = *func;
|
||||
}
|
||||
|
||||
void bli_cntx_set_l1f_ker
|
||||
(
|
||||
l1fkr_t ker_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
func_t* l1f_kers = bli_cntx_l1f_kers_buf( cntx );
|
||||
|
||||
@@ -689,9 +840,12 @@ void bli_cntx_set_l1f_ker( l1fkr_t ker_id,
|
||||
l1f_kers[ ker_id ] = *func;
|
||||
}
|
||||
|
||||
void bli_cntx_set_l1v_ker( l1vkr_t ker_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_l1v_ker
|
||||
(
|
||||
l1vkr_t ker_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
func_t* l1v_kers = bli_cntx_l1v_kers_buf( cntx );
|
||||
|
||||
@@ -700,43 +854,154 @@ void bli_cntx_set_l1v_ker( l1vkr_t ker_id,
|
||||
l1v_kers[ ker_id ] = *func;
|
||||
}
|
||||
|
||||
void bli_cntx_set_packm_ukr( func_t* func,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
func_t* packm_ukrs = bli_cntx_packm_ukrs( cntx );
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// Copy the function object into the context's packm ukernel object.
|
||||
*packm_ukrs = *func;
|
||||
// Register n_kers packm kernels in a context.
//
// The variable arguments are n_kers tuples of
//   ( l1mkr_t id, num_t datatype, void* function pointer )
// followed by one final cntx_t* argument. Because the context comes last,
// all tuples are first read into temporary arrays and only afterwards
// written into the context.
void bli_cntx_set_packm_kers( dim_t n_kers, ... )
{
	/* Example prototypes:

	   void bli_cntx_set_packm_kers
	   (
	     dim_t   n_ukrs,
	     l1mkr_t ker0_id, num_t ker0_dt, void* ker0_fp,
	     l1mkr_t ker1_id, num_t ker1_dt, void* ker1_fp,
	     l1mkr_t ker2_id, num_t ker2_dt, void* ker2_fp,
	     ...
	     cntx_t* cntx
	   );
	*/
	va_list args;
	dim_t   t;

	// Temporary per-tuple storage, released at the end of the function.
	l1mkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ) );
	num_t*   ker_dts = bli_malloc_intl( n_kers * sizeof( num_t   ) );
	void**   ker_fps = bli_malloc_intl( n_kers * sizeof( void*   ) );

	// -- Begin variable argument section --

	va_start( args, n_kers );

	// Read each tuple in argument order: id, datatype, function pointer.
	// NOTE(review): va_arg requires each requested type to be compatible
	// with the (promoted) type the caller actually passed -- confirm at
	// the call sites.
	for ( t = 0; t < n_kers; ++t )
	{
		ker_ids[ t ] = va_arg( args, l1mkr_t );
		ker_dts[ t ] = va_arg( args, num_t );
		ker_fps[ t ] = va_arg( args, void* );
	}

	// The argument following the last tuple is the context pointer.
	cntx_t* cntx = va_arg( args, cntx_t* );

	va_end( args );

	// -- End variable argument section --

	// Destination array inside the context: the packm kernel func_t array.
	func_t* cntx_packm_kers = bli_cntx_packm_kers_buf( cntx );

	// Write every recorded tuple into the context.
	for ( t = 0; t < n_kers; ++t )
	{
		const l1mkr_t ker_id = ker_ids[ t ];
		const num_t   ker_dt = ker_dts[ t ];
		void*         ker_fp = ker_fps[ t ];

		// Entry for this kernel id.
		func_t* kers = &cntx_packm_kers[ ker_id ];

		// Store the function pointer for datatype ker_dt.
		bli_func_set_dt( ker_fp, ker_dt, kers );
	}

	// Release the temporary arrays.
	bli_free_intl( ker_ids );
	bli_free_intl( ker_dts );
	bli_free_intl( ker_fps );
}
|
||||
|
||||
void bli_cntx_set_ind_method( ind_t method,
|
||||
cntx_t* cntx )
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_cntx_set_packm_ker
|
||||
(
|
||||
l1mkr_t ker_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
func_t* packm_kers = bli_cntx_packm_kers_buf( cntx );
|
||||
|
||||
// Copy the function object into the specified location within
|
||||
// the context's packm kernel array.
|
||||
packm_kers[ ker_id ] = *func;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_cntx_set_ind_method
|
||||
(
|
||||
ind_t method,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
bli_cntx_set_method( method, cntx );
|
||||
}
|
||||
|
||||
void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_pack_schema_ab_blockpanel
|
||||
(
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
bli_cntx_set_schema_a_block( schema_a, cntx );
|
||||
bli_cntx_set_schema_b_panel( schema_b, cntx );
|
||||
}
|
||||
|
||||
void bli_cntx_set_pack_schema_a_block( pack_t schema_a,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_pack_schema_a_block
|
||||
(
|
||||
pack_t schema_a,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
bli_cntx_set_schema_a_block( schema_a, cntx );
|
||||
}
|
||||
|
||||
void bli_cntx_set_pack_schema_b_panel( pack_t schema_b,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_pack_schema_b_panel
|
||||
(
|
||||
pack_t schema_b,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
bli_cntx_set_schema_b_panel( schema_b, cntx );
|
||||
}
|
||||
|
||||
void bli_cntx_set_pack_schema_c_panel( pack_t schema_c,
|
||||
cntx_t* cntx )
|
||||
void bli_cntx_set_pack_schema_c_panel
|
||||
(
|
||||
pack_t schema_c,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
bli_cntx_set_schema_c_panel( schema_c, cntx );
|
||||
}
|
||||
@@ -749,17 +1014,24 @@ void bli_cntx_set_ukr_anti_pref( bool_t anti_pref,
|
||||
}
|
||||
#endif
|
||||
|
||||
void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
|
||||
dim_t m, dim_t n, dim_t k )
|
||||
void bli_cntx_set_thrloop_from_env
|
||||
(
|
||||
opid_t l3_op,
|
||||
side_t side,
|
||||
cntx_t* cntx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k
|
||||
)
|
||||
{
|
||||
dim_t jc, pc, ic, jr, ir;
|
||||
|
||||
#ifdef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
int nthread = bli_env_read_nway( "BLIS_NUM_THREADS", -1 );
|
||||
int nthread = bli_thread_get_env( "BLIS_NUM_THREADS", -1 );
|
||||
|
||||
if ( nthread == -1 )
|
||||
nthread = bli_env_read_nway( "OMP_NUM_THREADS", -1 );
|
||||
nthread = bli_thread_get_env( "OMP_NUM_THREADS", -1 );
|
||||
|
||||
if ( nthread < 1 ) nthread = 1;
|
||||
|
||||
@@ -786,10 +1058,10 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
|
||||
|
||||
pc = 1;
|
||||
|
||||
dim_t jc_env = bli_env_read_nway( "BLIS_JC_NT", -1 );
|
||||
dim_t ic_env = bli_env_read_nway( "BLIS_IC_NT", -1 );
|
||||
dim_t jr_env = bli_env_read_nway( "BLIS_JR_NT", -1 );
|
||||
dim_t ir_env = bli_env_read_nway( "BLIS_IR_NT", -1 );
|
||||
dim_t jc_env = bli_thread_get_env( "BLIS_JC_NT", -1 );
|
||||
dim_t ic_env = bli_thread_get_env( "BLIS_IC_NT", -1 );
|
||||
dim_t jr_env = bli_thread_get_env( "BLIS_JR_NT", -1 );
|
||||
dim_t ir_env = bli_thread_get_env( "BLIS_IR_NT", -1 );
|
||||
|
||||
if (jc_env != -1 || ic_env != -1 || jr_env != -1 || ir_env != -1)
|
||||
{
|
||||
@@ -882,9 +1154,12 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt
|
||||
(
|
||||
num_t dt,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
mbool_t* ukrs_prefs = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx );
|
||||
bool_t ukr_prefs = bli_mbool_get_dt( dt, ukrs_prefs );
|
||||
@@ -894,9 +1169,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt,
|
||||
return ukr_prefs == TRUE;
|
||||
}
|
||||
|
||||
bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt
|
||||
(
|
||||
num_t dt,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
mbool_t* ukrs_prefs = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx );
|
||||
bool_t ukr_prefs = bli_mbool_get_dt( dt, ukrs_prefs );
|
||||
@@ -906,16 +1184,22 @@ bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt,
|
||||
return ukr_prefs == FALSE;
|
||||
}
|
||||
|
||||
// Returns the logical negation of
// bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx ); i.e. this
// query is defined purely as "does not dislike" for the same object,
// micro-kernel id, and context.
// NOTE(review): the signature appears twice below (compact form, then
// one-parameter-per-line form), as in a diff listing that shows both sides
// of a change; confirm which form is live.
bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
bool_t bli_cntx_l3_nat_ukr_prefers_storage_of
|
||||
(
|
||||
obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
return !bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx );
|
||||
}
|
||||
|
||||
bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of
|
||||
(
|
||||
obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const num_t dt = bli_obj_datatype( *obj );
|
||||
const bool_t ukr_prefers_rows
|
||||
@@ -930,9 +1214,12 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
|
||||
return r_val;
|
||||
}
|
||||
|
||||
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of
|
||||
(
|
||||
obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
@@ -942,9 +1229,12 @@ bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
|
||||
return r_val;
|
||||
}
|
||||
|
||||
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of
|
||||
(
|
||||
obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
@@ -956,9 +1246,12 @@ bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
bool_t bli_cntx_l3_ukr_prefers_rows_dt
|
||||
(
|
||||
num_t dt,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
// Reference the ukr storage preferences of the corresponding real
|
||||
// micro-kernel for induced methods.
|
||||
@@ -968,9 +1261,12 @@ bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
|
||||
return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx );
|
||||
}
|
||||
|
||||
bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
bool_t bli_cntx_l3_ukr_prefers_cols_dt
|
||||
(
|
||||
num_t dt,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
// Reference the ukr storage preferences of the corresponding real
|
||||
// micro-kernel for induced methods.
|
||||
@@ -980,16 +1276,22 @@ bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt,
|
||||
return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx );
|
||||
}
|
||||
|
||||
// Returns the logical negation of
// bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ), so "prefers"
// is defined here as "does not dislike" for the same arguments.
// NOTE(review): the signature appears twice below (compact form, then
// one-parameter-per-line form), as in a diff listing that shows both sides
// of a change; confirm which form is live.
bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
bool_t bli_cntx_l3_ukr_prefers_storage_of
|
||||
(
|
||||
obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
return !bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx );
|
||||
}
|
||||
|
||||
bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
bool_t bli_cntx_l3_ukr_dislikes_storage_of
|
||||
(
|
||||
obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
num_t dt = bli_obj_datatype( *obj );
|
||||
|
||||
@@ -1005,9 +1307,12 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
|
||||
return r_val;
|
||||
}
|
||||
|
||||
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of
|
||||
(
|
||||
obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
@@ -1017,9 +1322,12 @@ bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
|
||||
return r_val;
|
||||
}
|
||||
|
||||
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx )
|
||||
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of
|
||||
(
|
||||
obj_t* obj,
|
||||
l3ukr_t ukr_id,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx );
|
||||
|
||||
@@ -1108,23 +1416,6 @@ void bli_cntx_print( cntx_t* cntx )
|
||||
);
|
||||
}
|
||||
|
||||
{
|
||||
func_t* ukr = bli_cntx_get_packm_ukr( cntx );
|
||||
|
||||
printf( "packm ker : %16p %16p %16p %16p\n",
|
||||
bli_func_get_dt( BLIS_FLOAT, ukr ),
|
||||
bli_func_get_dt( BLIS_DOUBLE, ukr ),
|
||||
bli_func_get_dt( BLIS_SCOMPLEX, ukr ),
|
||||
bli_func_get_dt( BLIS_DCOMPLEX, ukr )
|
||||
);
|
||||
}
|
||||
|
||||
{
|
||||
ind_t family = bli_cntx_get_family( cntx );
|
||||
|
||||
printf( "oper family : %lu\n", ( guint_t )family );
|
||||
}
|
||||
|
||||
{
|
||||
ind_t method = bli_cntx_get_ind_method( cntx );
|
||||
|
||||
|
||||
@@ -36,6 +36,9 @@
|
||||
#ifndef BLIS_CNTX_H
|
||||
#define BLIS_CNTX_H
|
||||
|
||||
//#include "bli_cntx_init.h"
|
||||
|
||||
|
||||
// Context object type (defined in bli_type_defs.h)
|
||||
|
||||
/*
|
||||
@@ -51,9 +54,9 @@ typedef struct cntx_s
|
||||
func_t* l1f_kers;
|
||||
func_t* l1v_kers;
|
||||
|
||||
func_t packm_ukrs;
|
||||
func_t* packm_kers;
|
||||
func_t* unpackm_kers;
|
||||
|
||||
opid_t family;
|
||||
ind_t method;
|
||||
pack_t schema_a;
|
||||
pack_t schema_b;
|
||||
@@ -99,17 +102,13 @@ typedef struct cntx_s
|
||||
\
|
||||
( (cntx)->l1v_kers )
|
||||
|
||||
#define bli_cntx_packm_ukrs_buf( cntx ) \
|
||||
#define bli_cntx_packm_kers_buf( cntx ) \
|
||||
\
|
||||
(&((cntx)->packm_ukrs) )
|
||||
( (cntx)->packm_kers )
|
||||
|
||||
#define bli_cntx_packm_ukrs( cntx ) \
|
||||
#define bli_cntx_unpackm_kers_buf( cntx ) \
|
||||
\
|
||||
(&((cntx)->packm_ukrs) )
|
||||
|
||||
#define bli_cntx_family( cntx ) \
|
||||
\
|
||||
( (cntx)->family )
|
||||
( (cntx)->unpackm_kers )
|
||||
|
||||
#define bli_cntx_method( cntx ) \
|
||||
\
|
||||
@@ -202,16 +201,6 @@ typedef struct cntx_s
|
||||
(cntx_p)->l1v_kers = _l1v_kers; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_packm_ukrs( _packm_ukrs, cntx_p ) \
|
||||
{ \
|
||||
(cntx_p)->packm_ukrs = _packm_ukrs; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_family( _family, cntx_p ) \
|
||||
{ \
|
||||
(cntx_p)->family = _family; \
|
||||
}
|
||||
|
||||
#define bli_cntx_set_method( _method, cntx_p ) \
|
||||
{ \
|
||||
(cntx_p)->method = _method; \
|
||||
@@ -285,7 +274,8 @@ typedef struct cntx_s
|
||||
( \
|
||||
(dt), \
|
||||
&(( \
|
||||
bli_cntx_method( (cntx) ) != BLIS_NAT \
|
||||
bli_cntx_method( (cntx) ) != BLIS_NAT && \
|
||||
bli_is_complex( dt ) \
|
||||
? bli_cntx_l3_vir_ukrs_buf( (cntx) ) \
|
||||
: bli_cntx_l3_nat_ukrs_buf( (cntx) ) \
|
||||
)[ ukr_id ]) \
|
||||
@@ -326,10 +316,6 @@ typedef struct cntx_s
|
||||
(dt), (&(bli_cntx_l3_nat_ukrs_prefs_buf( (cntx) ))[ ukr_id ]) \
|
||||
)
|
||||
|
||||
#define bli_cntx_get_family( cntx ) \
|
||||
\
|
||||
bli_cntx_family( cntx )
|
||||
|
||||
#define bli_cntx_get_ind_method( cntx ) \
|
||||
\
|
||||
bli_cntx_method( cntx )
|
||||
@@ -357,9 +343,9 @@ typedef struct cntx_s
|
||||
|
||||
// create/free
|
||||
|
||||
//void bli_cntx_obj_create( cntx_t* cntx );
|
||||
//void bli_cntx_obj_free( cntx_t* cntx );
|
||||
void bli_cntx_obj_clear( cntx_t* cntx );
|
||||
//void bli_cntx_create( cntx_t* cntx );
|
||||
//void bli_cntx_free( cntx_t* cntx );
|
||||
void bli_cntx_clear( cntx_t* cntx );
|
||||
void bli_cntx_init( cntx_t* cntx );
|
||||
|
||||
// get functions
|
||||
@@ -380,7 +366,7 @@ func_t* bli_cntx_get_l1f_ker( l1fkr_t ker_id,
|
||||
cntx_t* cntx );
|
||||
func_t* bli_cntx_get_l1v_ker( l1vkr_t ker_id,
|
||||
cntx_t* cntx );
|
||||
func_t* bli_cntx_get_packm_ukr( cntx_t* cntx );
|
||||
//func_t* bli_cntx_get_packm_ukr( cntx_t* cntx );
|
||||
|
||||
//dim_t bli_cntx_get_blksz_def_dt( num_t dt,
|
||||
// bszid_t bs_id,
|
||||
@@ -409,6 +395,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx );
|
||||
//void* bli_cntx_get_l1v_ker_dt( num_t dt,
|
||||
// l1vkr_t ker_id,
|
||||
// cntx_t* cntx );
|
||||
func_t* bli_cntx_get_packm_ker( l1mkr_t ker_id,
|
||||
cntx_t* cntx );
|
||||
func_t* bli_cntx_get_unpackm_ker( l1mkr_t ker_id,
|
||||
cntx_t* cntx );
|
||||
//ind_t bli_cntx_get_ind_method( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx );
|
||||
//pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx );
|
||||
@@ -425,18 +415,34 @@ void bli_cntx_set_blksz( bszid_t bs_id,
|
||||
blksz_t* blksz,
|
||||
bszid_t mult_id,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx );
|
||||
|
||||
void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... );
|
||||
|
||||
void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id,
|
||||
mbool_t* prefs,
|
||||
cntx_t* cntx );
|
||||
|
||||
void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx );
|
||||
|
||||
void bli_cntx_set_l1f_ker( l1fkr_t ker_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx );
|
||||
|
||||
void bli_cntx_set_l1v_ker( l1vkr_t ker_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx );
|
||||
|
||||
void bli_cntx_set_packm_kers( dim_t n_kers, ... );
|
||||
|
||||
void bli_cntx_set_packm_ker( l1mkr_t ker_id,
|
||||
func_t* func,
|
||||
cntx_t* cntx );
|
||||
|
||||
void bli_cntx_set_packm_ukr( func_t* func,
|
||||
cntx_t* cntx );
|
||||
void bli_cntx_set_ind_method( ind_t method,
|
||||
@@ -507,11 +513,11 @@ void bli_cntx_print( cntx_t* cntx );
|
||||
// Preprocess out these calls entirely, since they are currently just empty
|
||||
// functions that do nothing.
|
||||
#if 0
|
||||
#define bli_cntx_obj_create( cntx ) { bli_cntx_obj_clear( cntx ); }
|
||||
#define bli_cntx_obj_free( cntx ) { bli_cntx_obj_clear( cntx ); }
|
||||
#define bli_cntx_create( cntx ) { bli_cntx_clear( cntx ); }
|
||||
#define bli_cntx_free( cntx ) { bli_cntx_clear( cntx ); }
|
||||
#else
|
||||
#define bli_cntx_obj_create( cntx ) { ; }
|
||||
#define bli_cntx_obj_free( cntx ) { ; }
|
||||
#define bli_cntx_create( cntx ) { ; }
|
||||
#define bli_cntx_free( cntx ) { ; }
|
||||
#endif
|
||||
|
||||
// These macros initialize/finalize a local context if the given context
|
||||
|
||||
@@ -35,37 +35,57 @@
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
// Allocates a func_t with bli_malloc_intl(), initializes it from the four
// pointer arguments (ptr_s, ptr_d, ptr_c, ptr_z), and returns the new object.
// The caller receives ownership of the returned pointer.
// NOTE(review): this listing carries two names for the routine
// (bli_func_obj_create, then bli_func_create) and two initializer calls
// (bli_func_obj_init, then bli_func_init), as in a diff that shows both
// sides of a rename; confirm which pair is live.
// NOTE(review): the result of bli_malloc_intl() is handed to the
// initializer without a NULL check in this block; whether the allocator can
// return NULL is not visible here.
func_t* bli_func_obj_create( void* ptr_s,
|
||||
void* ptr_d,
|
||||
void* ptr_c,
|
||||
void* ptr_z )
|
||||
func_t* bli_func_create
|
||||
(
|
||||
void* ptr_s,
|
||||
void* ptr_d,
|
||||
void* ptr_c,
|
||||
void* ptr_z
|
||||
)
|
||||
{
|
||||
func_t* f;
|
||||
|
||||
// Storage for exactly one func_t.
f = ( func_t* ) bli_malloc_intl( sizeof(func_t) );
|
||||
|
||||
// Populate the freshly allocated object with the four pointers.
bli_func_obj_init( f,
|
||||
ptr_s,
|
||||
ptr_d,
|
||||
ptr_c,
|
||||
ptr_z );
|
||||
bli_func_init
|
||||
(
|
||||
f,
|
||||
ptr_s,
|
||||
ptr_d,
|
||||
ptr_c,
|
||||
ptr_z
|
||||
);
|
||||
|
||||
return f;
|
||||
}
|
||||
|
||||
// Initializes an existing func_t with one pointer per datatype:
//   ptr_s -> float, ptr_d -> double, ptr_c -> scomplex, ptr_z -> dcomplex.
// Nothing is allocated and nothing is returned; f is written in place.
// NOTE(review): the body below contains two ways of storing each pointer:
// direct writes to f->ptr[BLIS_BITVAL_*_TYPE] and calls to
// bli_func_set_dt( ptr, BLIS_<TYPE>, f ). Together with the doubled
// signature this looks like both sides of a diff; confirm which set is live.
void bli_func_obj_init( func_t* f,
|
||||
void* ptr_s,
|
||||
void* ptr_d,
|
||||
void* ptr_c,
|
||||
void* ptr_z )
|
||||
void bli_func_init
|
||||
(
|
||||
func_t* f,
|
||||
void* ptr_s,
|
||||
void* ptr_d,
|
||||
void* ptr_c,
|
||||
void* ptr_z
|
||||
)
|
||||
{
|
||||
// Direct-indexing form.
f->ptr[BLIS_BITVAL_FLOAT_TYPE] = ptr_s;
|
||||
f->ptr[BLIS_BITVAL_DOUBLE_TYPE] = ptr_d;
|
||||
f->ptr[BLIS_BITVAL_SCOMPLEX_TYPE] = ptr_c;
|
||||
f->ptr[BLIS_BITVAL_DCOMPLEX_TYPE] = ptr_z;
|
||||
// Accessor form: same four assignments routed through bli_func_set_dt().
bli_func_set_dt( ptr_s, BLIS_FLOAT, f );
|
||||
bli_func_set_dt( ptr_d, BLIS_DOUBLE, f );
|
||||
bli_func_set_dt( ptr_c, BLIS_SCOMPLEX, f );
|
||||
bli_func_set_dt( ptr_z, BLIS_DCOMPLEX, f );
|
||||
}
|
||||
|
||||
// bli_func_init_null: sets the pointer stored in f for each of the four
// datatypes (BLIS_FLOAT, BLIS_DOUBLE, BLIS_SCOMPLEX, BLIS_DCOMPLEX) to NULL
// via bli_func_set_dt(). f itself is neither allocated nor freed here.
// NOTE(review): the bli_func_obj_free signature line directly below has no
// body of its own in this listing; it looks like the removed side of a
// rename to bli_func_free — confirm before building.
void bli_func_obj_free( func_t* f )
|
||||
void bli_func_init_null
|
||||
(
|
||||
func_t* f
|
||||
)
|
||||
{
|
||||
bli_func_set_dt( NULL, BLIS_FLOAT, f );
|
||||
bli_func_set_dt( NULL, BLIS_DOUBLE, f );
|
||||
bli_func_set_dt( NULL, BLIS_SCOMPLEX, f );
|
||||
bli_func_set_dt( NULL, BLIS_DCOMPLEX, f );
|
||||
}
|
||||
|
||||
// Releases a func_t by passing f to bli_free_intl(). The pointers stored
// inside f are not touched by this routine.
void bli_func_free( func_t* f )
|
||||
{
|
||||
bli_free_intl( f );
|
||||
}
|
||||
@@ -75,7 +95,7 @@ void bli_func_obj_free( func_t* f )
|
||||
// Reports whether the pointer stored in f for datatype dt is NULL.
// The result is the value of the "== NULL" comparison.
// NOTE(review): two return statements are listed below (direct f->ptr[ dt ]
// access, then the bli_func_get_dt() accessor); this looks like both sides
// of a diff. As written, only the first would execute — confirm which one
// is live.
bool_t bli_func_is_null_dt( num_t dt,
|
||||
func_t* f )
|
||||
{
|
||||
return ( f->ptr[ dt ] == NULL );
|
||||
return ( bli_func_get_dt( dt, f ) == NULL );
|
||||
}
|
||||
|
||||
bool_t bli_func_is_null( func_t* f )
|
||||
@@ -87,7 +107,7 @@ bool_t bli_func_is_null( func_t* f )
|
||||
// return FALSE. Otherwise, if they are all null, return TRUE.
|
||||
for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
|
||||
{
|
||||
if ( f->ptr[ dt ] != NULL )
|
||||
if ( bli_func_get_dt( dt, f ) != NULL )
|
||||
{
|
||||
r_val = FALSE;
|
||||
break;
|
||||
|
||||
@@ -49,18 +49,29 @@
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
func_t* bli_func_obj_create( void* ptr_s,
|
||||
void* ptr_d,
|
||||
void* ptr_c,
|
||||
void* ptr_z );
|
||||
func_t* bli_func_create
|
||||
(
|
||||
void* ptr_s,
|
||||
void* ptr_d,
|
||||
void* ptr_c,
|
||||
void* ptr_z
|
||||
);
|
||||
|
||||
void bli_func_obj_init( func_t* f,
|
||||
void* ptr_s,
|
||||
void* ptr_d,
|
||||
void* ptr_c,
|
||||
void* ptr_z );
|
||||
void bli_func_init
|
||||
(
|
||||
func_t* f,
|
||||
void* ptr_s,
|
||||
void* ptr_d,
|
||||
void* ptr_c,
|
||||
void* ptr_z
|
||||
);
|
||||
|
||||
void bli_func_obj_free( func_t* f );
|
||||
void bli_func_init_null
|
||||
(
|
||||
func_t* f
|
||||
);
|
||||
|
||||
void bli_func_free( func_t* f );
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@@ -74,12 +74,6 @@ static blksz_t bli_gks_blkszs[BLIS_NUM_BLKSZS] =
|
||||
/* df */ { { BLIS_DEFAULT_DF_S, BLIS_DEFAULT_DF_C, BLIS_DEFAULT_DF_D, BLIS_DEFAULT_DF_Z, },
|
||||
{ BLIS_DEFAULT_DF_S, BLIS_DEFAULT_DF_C, BLIS_DEFAULT_DF_D, BLIS_DEFAULT_DF_Z, }
|
||||
},
|
||||
/* xf */ { { BLIS_DEFAULT_XF_S, BLIS_DEFAULT_XF_C, BLIS_DEFAULT_XF_D, BLIS_DEFAULT_XF_Z, },
|
||||
{ BLIS_DEFAULT_XF_S, BLIS_DEFAULT_XF_C, BLIS_DEFAULT_XF_D, BLIS_DEFAULT_XF_Z, }
|
||||
},
|
||||
/* vf */ { { BLIS_DEFAULT_VF_S, BLIS_DEFAULT_VF_C, BLIS_DEFAULT_VF_D, BLIS_DEFAULT_VF_Z, },
|
||||
{ BLIS_DEFAULT_VF_S, BLIS_DEFAULT_VF_C, BLIS_DEFAULT_VF_D, BLIS_DEFAULT_VF_Z, }
|
||||
},
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -35,29 +35,38 @@
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
// Allocates an mbool_t with bli_malloc_intl(), initializes it from the four
// boolean arguments (b_s, b_d, b_c, b_z), and returns the new object.
// The caller receives ownership of the returned pointer.
// NOTE(review): this listing carries two names for the routine
// (bli_mbool_obj_create, then bli_mbool_create) and two initializer calls
// (bli_mbool_obj_init, then bli_mbool_init), as in a diff that shows both
// sides of a rename; confirm which pair is live.
// NOTE(review): the result of bli_malloc_intl() is handed to the
// initializer without a NULL check in this block; whether the allocator can
// return NULL is not visible here.
mbool_t* bli_mbool_obj_create( bool_t b_s,
|
||||
bool_t b_d,
|
||||
bool_t b_c,
|
||||
bool_t b_z )
|
||||
mbool_t* bli_mbool_create
|
||||
(
|
||||
bool_t b_s,
|
||||
bool_t b_d,
|
||||
bool_t b_c,
|
||||
bool_t b_z
|
||||
)
|
||||
{
|
||||
mbool_t* b;
|
||||
|
||||
// Storage for exactly one mbool_t.
b = ( mbool_t* ) bli_malloc_intl( sizeof(mbool_t) );
|
||||
|
||||
// Populate the freshly allocated object with the four booleans.
bli_mbool_obj_init( b,
|
||||
b_s,
|
||||
b_d,
|
||||
b_c,
|
||||
b_z );
|
||||
bli_mbool_init
|
||||
(
|
||||
b,
|
||||
b_s,
|
||||
b_d,
|
||||
b_c,
|
||||
b_z
|
||||
);
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
void bli_mbool_obj_init( mbool_t* b,
|
||||
bool_t b_s,
|
||||
bool_t b_d,
|
||||
bool_t b_c,
|
||||
bool_t b_z )
|
||||
void bli_mbool_init
|
||||
(
|
||||
mbool_t* b,
|
||||
bool_t b_s,
|
||||
bool_t b_d,
|
||||
bool_t b_c,
|
||||
bool_t b_z
|
||||
)
|
||||
{
|
||||
bli_mbool_set_dt( b_s, BLIS_FLOAT, b );
|
||||
bli_mbool_set_dt( b_d, BLIS_DOUBLE, b );
|
||||
@@ -65,7 +74,7 @@ void bli_mbool_obj_init( mbool_t* b,
|
||||
bli_mbool_set_dt( b_z, BLIS_DCOMPLEX, b );
|
||||
}
|
||||
|
||||
// Releases an mbool_t by passing b to bli_free_intl().
// NOTE(review): two signature lines are listed (bli_mbool_obj_free, then
// bli_mbool_free) over a single body, as in a diff that shows both sides of
// a rename; confirm which name is live.
void bli_mbool_obj_free( mbool_t* b )
|
||||
void bli_mbool_free( mbool_t* b )
|
||||
{
|
||||
bli_free_intl( b );
|
||||
}
|
||||
|
||||
@@ -49,16 +49,22 @@
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
mbool_t* bli_mbool_obj_create( bool_t b_s,
|
||||
bool_t b_d,
|
||||
bool_t b_c,
|
||||
bool_t b_z );
|
||||
mbool_t* bli_mbool_create
|
||||
(
|
||||
bool_t b_s,
|
||||
bool_t b_d,
|
||||
bool_t b_c,
|
||||
bool_t b_z
|
||||
);
|
||||
|
||||
void bli_mbool_obj_init( mbool_t* b,
|
||||
bool_t b_s,
|
||||
bool_t b_d,
|
||||
bool_t b_c,
|
||||
bool_t b_z );
|
||||
void bli_mbool_init
|
||||
(
|
||||
mbool_t* b,
|
||||
bool_t b_s,
|
||||
bool_t b_d,
|
||||
bool_t b_c,
|
||||
bool_t b_z
|
||||
);
|
||||
|
||||
void bli_mbool_obj_free( mbool_t* b );
|
||||
void bli_mbool_free( mbool_t* b );
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@ void bli_membrk_init
|
||||
bli_mutex_init( bli_membrk_mutex( membrk ) );
|
||||
bli_membrk_init_pools( cntx, membrk );
|
||||
bli_membrk_set_malloc_fp( bli_malloc_pool, membrk );
|
||||
bli_membrk_set_free_fp( bli_free_pool, membrk );
|
||||
}
|
||||
|
||||
void bli_membrk_finalize
|
||||
|
||||
@@ -41,7 +41,12 @@
|
||||
-lf2c -lm (in that order)
|
||||
*/
|
||||
|
||||
bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, ftnlen ca_len, ftnlen cb_len)
|
||||
|
||||
#ifdef LAPACK_ILP64
|
||||
long PASTEF770(lsame)(char *ca, char *cb, long ca_len, long cb_len)
|
||||
#else
|
||||
int PASTEF770(lsame)(char *ca, char *cb, int ca_len, int cb_len)
|
||||
#endif
|
||||
{
|
||||
/* System generated locals */
|
||||
bla_logical ret_val;
|
||||
@@ -115,11 +120,11 @@ bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, f
|
||||
/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
|
||||
/* upper case 'Z'. */
|
||||
|
||||
if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta
|
||||
if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta
|
||||
>= 162 && inta <= 169)) {
|
||||
inta += 64;
|
||||
}
|
||||
if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb
|
||||
if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb
|
||||
>= 162 && intb <= 169)) {
|
||||
intb += 64;
|
||||
}
|
||||
|
||||
@@ -34,6 +34,10 @@
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS2BLIS
|
||||
|
||||
bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, ftnlen ca_len, ftnlen cb_len);
|
||||
#ifdef LAPACK_ILP64
|
||||
long PASTEF770(lsame)(char *ca, char *cb, long ca_len, long cb_len);
|
||||
#else
|
||||
int PASTEF770(lsame)(char *ca, char *cb, int ca_len, int cb_len);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <float.h>
|
||||
#include <errno.h>
|
||||
|
||||
// Determine if we are on a 64-bit or 32-bit architecture
|
||||
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
|
||||
@@ -66,6 +67,8 @@
|
||||
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
|
||||
defined(__bsdi__) || defined(__DragonFly__)
|
||||
#define BLIS_OS_BSD 1
|
||||
#elif defined(EMSCRIPTEN)
|
||||
#define BLIS_OS_EMSCRIPTEN
|
||||
#else
|
||||
#error "Cannot determine operating system"
|
||||
#endif
|
||||
|
||||
@@ -438,7 +438,7 @@ typedef enum
|
||||
BLIS_INT = BLIS_BITVAL_INT_TYPE,
|
||||
BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE,
|
||||
BLIS_DT_LO = BLIS_FLOAT,
|
||||
BLIS_DT_HI = BLIS_DCOMPLEX,
|
||||
BLIS_DT_HI = BLIS_DCOMPLEX
|
||||
} num_t;
|
||||
|
||||
typedef enum
|
||||
@@ -482,7 +482,7 @@ typedef enum
|
||||
BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
|
||||
BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E,
|
||||
BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
|
||||
BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R,
|
||||
BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R
|
||||
} pack_t;
|
||||
|
||||
// We combine row and column packing into one "type", and we start
|
||||
@@ -511,7 +511,7 @@ typedef enum
|
||||
BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
|
||||
BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
|
||||
BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
|
||||
BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE,
|
||||
BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
|
||||
} packbuf_t;
|
||||
|
||||
|
||||
@@ -590,7 +590,7 @@ typedef enum
|
||||
BLIS_4M1B,
|
||||
BLIS_4M1A,
|
||||
BLIS_1M,
|
||||
BLIS_NAT,
|
||||
BLIS_NAT
|
||||
} ind_t;
|
||||
|
||||
#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)
|
||||
@@ -613,7 +613,7 @@ typedef enum
|
||||
BLIS_SETV_KER,
|
||||
BLIS_SUBV_KER,
|
||||
BLIS_SWAPV_KER,
|
||||
BLIS_XPBYV_KER,
|
||||
BLIS_XPBYV_KER
|
||||
} l1vkr_t;
|
||||
|
||||
#define BLIS_NUM_LEVEL1V_KERS 14
|
||||
@@ -625,19 +625,93 @@ typedef enum
|
||||
BLIS_DOTAXPYV_KER,
|
||||
BLIS_AXPYF_KER,
|
||||
BLIS_DOTXF_KER,
|
||||
BLIS_DOTXAXPYF_KER,
|
||||
BLIS_DOTXAXPYF_KER
|
||||
} l1fkr_t;
|
||||
|
||||
#define BLIS_NUM_LEVEL1F_KERS 5
|
||||
|
||||
|
||||
// Kernel identifiers for the packm and unpackm kernel families.
// Each family has 32 ids named <FAMILY>_<n>XK_KER with explicit value n,
// for n = 0..31. The UNPACKM ids deliberately reuse the same values 0..31
// as the PACKM ids, so an id is only meaningful together with the family
// it is used with.
// NOTE(review): presumably <n> is the panel dimension handled by the kernel
// and the value doubles as an array index — confirm against the users of
// this type.
typedef enum
|
||||
{
|
||||
// packm kernel ids (0..31).
BLIS_PACKM_0XK_KER = 0,
|
||||
BLIS_PACKM_1XK_KER = 1,
|
||||
BLIS_PACKM_2XK_KER = 2,
|
||||
BLIS_PACKM_3XK_KER = 3,
|
||||
BLIS_PACKM_4XK_KER = 4,
|
||||
BLIS_PACKM_5XK_KER = 5,
|
||||
BLIS_PACKM_6XK_KER = 6,
|
||||
BLIS_PACKM_7XK_KER = 7,
|
||||
BLIS_PACKM_8XK_KER = 8,
|
||||
BLIS_PACKM_9XK_KER = 9,
|
||||
BLIS_PACKM_10XK_KER = 10,
|
||||
BLIS_PACKM_11XK_KER = 11,
|
||||
BLIS_PACKM_12XK_KER = 12,
|
||||
BLIS_PACKM_13XK_KER = 13,
|
||||
BLIS_PACKM_14XK_KER = 14,
|
||||
BLIS_PACKM_15XK_KER = 15,
|
||||
BLIS_PACKM_16XK_KER = 16,
|
||||
BLIS_PACKM_17XK_KER = 17,
|
||||
BLIS_PACKM_18XK_KER = 18,
|
||||
BLIS_PACKM_19XK_KER = 19,
|
||||
BLIS_PACKM_20XK_KER = 20,
|
||||
BLIS_PACKM_21XK_KER = 21,
|
||||
BLIS_PACKM_22XK_KER = 22,
|
||||
BLIS_PACKM_23XK_KER = 23,
|
||||
BLIS_PACKM_24XK_KER = 24,
|
||||
BLIS_PACKM_25XK_KER = 25,
|
||||
BLIS_PACKM_26XK_KER = 26,
|
||||
BLIS_PACKM_27XK_KER = 27,
|
||||
BLIS_PACKM_28XK_KER = 28,
|
||||
BLIS_PACKM_29XK_KER = 29,
|
||||
BLIS_PACKM_30XK_KER = 30,
|
||||
BLIS_PACKM_31XK_KER = 31,
|
||||
|
||||
// unpackm kernel ids; same numeric range as the packm ids above.
BLIS_UNPACKM_0XK_KER = 0,
|
||||
BLIS_UNPACKM_1XK_KER = 1,
|
||||
BLIS_UNPACKM_2XK_KER = 2,
|
||||
BLIS_UNPACKM_3XK_KER = 3,
|
||||
BLIS_UNPACKM_4XK_KER = 4,
|
||||
BLIS_UNPACKM_5XK_KER = 5,
|
||||
BLIS_UNPACKM_6XK_KER = 6,
|
||||
BLIS_UNPACKM_7XK_KER = 7,
|
||||
BLIS_UNPACKM_8XK_KER = 8,
|
||||
BLIS_UNPACKM_9XK_KER = 9,
|
||||
BLIS_UNPACKM_10XK_KER = 10,
|
||||
BLIS_UNPACKM_11XK_KER = 11,
|
||||
BLIS_UNPACKM_12XK_KER = 12,
|
||||
BLIS_UNPACKM_13XK_KER = 13,
|
||||
BLIS_UNPACKM_14XK_KER = 14,
|
||||
BLIS_UNPACKM_15XK_KER = 15,
|
||||
BLIS_UNPACKM_16XK_KER = 16,
|
||||
BLIS_UNPACKM_17XK_KER = 17,
|
||||
BLIS_UNPACKM_18XK_KER = 18,
|
||||
BLIS_UNPACKM_19XK_KER = 19,
|
||||
BLIS_UNPACKM_20XK_KER = 20,
|
||||
BLIS_UNPACKM_21XK_KER = 21,
|
||||
BLIS_UNPACKM_22XK_KER = 22,
|
||||
BLIS_UNPACKM_23XK_KER = 23,
|
||||
BLIS_UNPACKM_24XK_KER = 24,
|
||||
BLIS_UNPACKM_25XK_KER = 25,
|
||||
BLIS_UNPACKM_26XK_KER = 26,
|
||||
BLIS_UNPACKM_27XK_KER = 27,
|
||||
BLIS_UNPACKM_28XK_KER = 28,
|
||||
BLIS_UNPACKM_29XK_KER = 29,
|
||||
BLIS_UNPACKM_30XK_KER = 30,
|
||||
BLIS_UNPACKM_31XK_KER = 31
|
||||
|
||||
} l1mkr_t;
|
||||
|
||||
#define BLIS_NUM_PACKM_KERS 32
|
||||
#define BLIS_NUM_UNPACKM_KERS 32
|
||||
|
||||
|
||||
typedef enum
|
||||
{
|
||||
BLIS_GEMM_UKR = 0,
|
||||
BLIS_GEMMTRSM_L_UKR,
|
||||
BLIS_GEMMTRSM_U_UKR,
|
||||
BLIS_TRSM_L_UKR,
|
||||
BLIS_TRSM_U_UKR,
|
||||
BLIS_TRSM_U_UKR
|
||||
} l3ukr_t;
|
||||
|
||||
#define BLIS_NUM_LEVEL3_UKRS 5
|
||||
@@ -648,7 +722,7 @@ typedef enum
|
||||
BLIS_REFERENCE_UKERNEL = 0,
|
||||
BLIS_VIRTUAL_UKERNEL,
|
||||
BLIS_OPTIMIZED_UKERNEL,
|
||||
BLIS_NOTAPPLIC_UKERNEL,
|
||||
BLIS_NOTAPPLIC_UKERNEL
|
||||
} kimpl_t;
|
||||
|
||||
#define BLIS_NUM_UKR_IMPL_TYPES 4
|
||||
@@ -662,7 +736,7 @@ typedef enum
|
||||
BLIS_IC_IDX,
|
||||
BLIS_JR_IDX,
|
||||
BLIS_IR_IDX,
|
||||
BLIS_PR_IDX,
|
||||
BLIS_PR_IDX
|
||||
} thridx_t;
|
||||
#endif
|
||||
|
||||
@@ -683,7 +757,7 @@ typedef enum
|
||||
// value that can be subtracted from the opid_t value to map it
|
||||
// to a zero-based range.
|
||||
// This is needed because these level-3 opid_t values are used in
|
||||
// bli_ind_query.c to index into arrays.
|
||||
// bli_l3_ind.c to index into arrays.
|
||||
//
|
||||
BLIS_GEMM = 0,
|
||||
BLIS_HEMM,
|
||||
@@ -696,7 +770,7 @@ typedef enum
|
||||
BLIS_TRMM,
|
||||
BLIS_TRSM,
|
||||
|
||||
BLIS_NOID,
|
||||
BLIS_NOID
|
||||
} opid_t;
|
||||
|
||||
#define BLIS_NUM_LEVEL3_OPS 10
|
||||
@@ -714,16 +788,14 @@ typedef enum
|
||||
BLIS_NC,
|
||||
BLIS_M2, // level-2 blocksize in m dimension
|
||||
BLIS_N2, // level-2 blocksize in n dimension
|
||||
BLIS_1F, // level-1f global fusing factor
|
||||
BLIS_AF, // level-1f axpyf fusing factor
|
||||
BLIS_DF, // level-1f dotxf fusing factor
|
||||
BLIS_XF, // level-1f dotxaxpyf fusing factor
|
||||
BLIS_VF, // level-1v vector fusing factor
|
||||
|
||||
BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable.
|
||||
BLIS_NO_PART // used as a placeholder when blocksizes are not applicable.
|
||||
} bszid_t;
|
||||
|
||||
#define BLIS_NUM_BLKSZS 13
|
||||
#define BLIS_NUM_BLKSZS 11
|
||||
|
||||
|
||||
//
|
||||
@@ -784,6 +856,7 @@ typedef struct mem_s
|
||||
struct cntl_s
|
||||
{
|
||||
// Basic fields (usually required).
|
||||
opid_t family;
|
||||
bszid_t bszid;
|
||||
void* var_func;
|
||||
struct cntl_s* sub_node;
|
||||
@@ -971,9 +1044,9 @@ typedef struct cntx_s
|
||||
func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ];
|
||||
func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ];
|
||||
|
||||
func_t packm_ukrs;
|
||||
func_t packm_kers[ BLIS_NUM_PACKM_KERS ];
|
||||
func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ];
|
||||
|
||||
opid_t family;
|
||||
ind_t method;
|
||||
pack_t schema_a_block;
|
||||
pack_t schema_b_panel;
|
||||
@@ -992,7 +1065,7 @@ typedef struct cntx_s
|
||||
typedef enum
|
||||
{
|
||||
BLIS_NO_ERROR_CHECKING = 0,
|
||||
BLIS_FULL_ERROR_CHECKING,
|
||||
BLIS_FULL_ERROR_CHECKING
|
||||
} errlev_t;
|
||||
|
||||
typedef enum
|
||||
|
||||
@@ -122,7 +122,7 @@ void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx )
|
||||
const ind_t method = BLIS_3M1;
|
||||
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
@@ -170,7 +170,7 @@ void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx )
|
||||
const ind_t method = BLIS_3M2;
|
||||
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
@@ -218,7 +218,7 @@ void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx )
|
||||
const ind_t method = BLIS_3M3;
|
||||
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
@@ -279,7 +279,7 @@ void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx )
|
||||
const ind_t method = BLIS_3MH;
|
||||
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
@@ -343,7 +343,7 @@ void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx )
|
||||
const ind_t method = BLIS_4M1A;
|
||||
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
@@ -391,7 +391,7 @@ void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx )
|
||||
const ind_t method = BLIS_4M1B;
|
||||
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
@@ -439,7 +439,7 @@ void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx )
|
||||
const ind_t method = BLIS_4MH;
|
||||
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
@@ -524,7 +524,7 @@ void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx )
|
||||
const ind_t method = BLIS_1M;
|
||||
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
|
||||
@@ -41,7 +41,7 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx )
|
||||
const ind_t method = BLIS_3M1;
|
||||
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
@@ -89,7 +89,7 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx )
|
||||
const ind_t method = BLIS_4M1A;
|
||||
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
@@ -137,7 +137,7 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx )
|
||||
const ind_t method = BLIS_1M;
|
||||
|
||||
// Clear the context fields.
|
||||
bli_cntx_obj_clear( cntx );
|
||||
bli_cntx_clear( cntx );
|
||||
|
||||
// Initialize the context with the current architecture's native
|
||||
// level-3 gemm micro-kernel, and its output preferences.
|
||||
|
||||
@@ -36,19 +36,82 @@
|
||||
|
||||
// Broadcasts a pointer from the thread with id 0 to every thread that calls
// this routine on the same communicator.
//   - If the communicator is NULL or reports n_threads == 1, to_send is
//     returned immediately and no synchronization happens.
//   - Otherwise thread 0 stores to_send in the communicator's sent_object
//     field, all threads meet at a barrier, each reads sent_object, all meet
//     at a second barrier, and the value read is returned.
// NOTE(review): the second barrier presumably keeps sent_object from being
// overwritten by a later broadcast before every thread has read it; this is
// not stated in the code — confirm.
// NOTE(review): the parameter and each statement appear twice below, once
// using the name "communicator" and once using "comm", as in a diff that
// shows both sides of a rename; confirm which set is live.
void* bli_thrcomm_bcast
|
||||
(
|
||||
thrcomm_t* communicator,
|
||||
thrcomm_t* comm,
|
||||
dim_t id,
|
||||
void* to_send
|
||||
)
|
||||
{
|
||||
// Nothing to broadcast without a communicator or with a single thread.
if ( communicator == NULL || communicator->n_threads == 1 ) return to_send;
|
||||
if ( comm == NULL || comm->n_threads == 1 ) return to_send;
|
||||
|
||||
// Only the thread with id 0 publishes its pointer.
if ( id == 0 ) communicator->sent_object = to_send;
|
||||
if ( id == 0 ) comm->sent_object = to_send;
|
||||
|
||||
// Barrier, read the published pointer, barrier again.
bli_thrcomm_barrier( communicator, id );
|
||||
void* object = communicator->sent_object;
|
||||
bli_thrcomm_barrier( communicator, id );
|
||||
bli_thrcomm_barrier( comm, id );
|
||||
void* object = comm->sent_object;
|
||||
bli_thrcomm_barrier( comm, id );
|
||||
|
||||
return object;
|
||||
}
|
||||
|
||||
// Use __sync_* builtins (assumed available) if __atomic_* ones are not present.
|
||||
#ifndef __ATOMIC_RELAXED
|
||||
|
||||
#define __ATOMIC_RELAXED
|
||||
#define __ATOMIC_ACQUIRE
|
||||
#define __ATOMIC_RELEASE
|
||||
#define __ATOMIC_ACQ_REL
|
||||
|
||||
#define __atomic_load_n(ptr, constraint) \
|
||||
__sync_fetch_and_add(ptr, 0)
|
||||
#define __atomic_add_fetch(ptr, value, constraint) \
|
||||
__sync_add_and_fetch(ptr, value)
|
||||
#define __atomic_fetch_add(ptr, value, constraint) \
|
||||
__sync_fetch_and_add(ptr, value)
|
||||
#define __atomic_fetch_xor(ptr, value, constraint) \
|
||||
__sync_fetch_and_xor(ptr, value)
|
||||
|
||||
#endif
|
||||
|
||||
void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id )
|
||||
{
|
||||
// Return early if the comm is NULL or if there is only one
|
||||
// thread participating.
|
||||
if ( comm == NULL || comm->n_threads == 1 ) return;
|
||||
|
||||
// Read the "sense" variable. This variable is akin to a unique ID for
|
||||
// the current barrier. The first n-1 threads will spin on this variable
|
||||
// until it changes. The sense variable gets incremented by the last
|
||||
// thread to enter the barrier, just before it exits. But it turns out
|
||||
// that you don't need many unique IDs before you can wrap around. In
|
||||
// fact, if everything else is working, a binary variable is sufficient,
|
||||
// which is what we do here (i.e., 0 is toggled to 1, which is then
|
||||
// toggled back to 0, and so forth).
|
||||
bool_t orig_sense = __atomic_load_n( &comm->barrier_sense, __ATOMIC_RELAXED );
|
||||
|
||||
// Register ourselves (the current thread) as having arrived by
|
||||
// incrementing the barrier_threads_arrived variable. We must perform
|
||||
// this increment (and a subsequent read) atomically.
|
||||
dim_t my_threads_arrived =
|
||||
__atomic_add_fetch( &comm->barrier_threads_arrived, 1, __ATOMIC_ACQ_REL );
|
||||
|
||||
// If the current thread was the last thread to have arrived, then
|
||||
// it will take actions that effectively ends and resets the barrier.
|
||||
if ( my_threads_arrived == comm->n_threads )
|
||||
{
|
||||
// Reset the variable tracking the number of threads that have arrived
|
||||
// to zero (which returns the barrier to the "empty" state). Then
|
||||
// atomically toggle the barrier sense variable. This will signal to
|
||||
// the other threads (which are spinning in the branch below) that it
|
||||
// is now safe to exit the barrier.
|
||||
comm->barrier_threads_arrived = 0;
|
||||
__atomic_fetch_xor( &comm->barrier_sense, 1, __ATOMIC_RELEASE );
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the current thread is NOT the last thread to have arrived, then
|
||||
// it spins on the sense variable until that sense variable changes at
|
||||
// which time these threads will exit the barrier.
|
||||
while ( __atomic_load_n( &comm->barrier_sense, __ATOMIC_ACQUIRE ) == orig_sense )
|
||||
; // Empty loop body.
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -49,11 +49,13 @@
|
||||
|
||||
// Thread communicator prototypes.
|
||||
thrcomm_t* bli_thrcomm_create( dim_t n_threads );
|
||||
void bli_thrcomm_free( thrcomm_t* communicator );
|
||||
void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads );
|
||||
void bli_thrcomm_cleanup( thrcomm_t* communicator );
|
||||
void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t thread_id );
|
||||
void* bli_thrcomm_bcast( thrcomm_t* communicator, dim_t inside_id, void* to_send );
|
||||
void bli_thrcomm_free( thrcomm_t* comm );
|
||||
void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads );
|
||||
void bli_thrcomm_cleanup( thrcomm_t* comm );
|
||||
void bli_thrcomm_barrier( thrcomm_t* comm, dim_t thread_id );
|
||||
void* bli_thrcomm_bcast( thrcomm_t* comm, dim_t inside_id, void* to_send );
|
||||
|
||||
void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -44,63 +44,66 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads )
|
||||
return comm;
|
||||
}
|
||||
|
||||
void bli_thrcomm_free( thrcomm_t* communicator )
|
||||
void bli_thrcomm_free( thrcomm_t* comm )
|
||||
{
|
||||
if ( communicator == NULL ) return;
|
||||
bli_thrcomm_cleanup( communicator );
|
||||
bli_free_intl( communicator );
|
||||
if ( comm == NULL ) return;
|
||||
bli_thrcomm_cleanup( comm );
|
||||
bli_free_intl( comm );
|
||||
}
|
||||
|
||||
#ifndef BLIS_TREE_BARRIER
|
||||
|
||||
void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads)
|
||||
void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads)
|
||||
{
|
||||
if ( communicator == NULL ) return;
|
||||
communicator->sent_object = NULL;
|
||||
communicator->n_threads = n_threads;
|
||||
communicator->barrier_sense = 0;
|
||||
communicator->barrier_threads_arrived = 0;
|
||||
if ( comm == NULL ) return;
|
||||
comm->sent_object = NULL;
|
||||
comm->n_threads = n_threads;
|
||||
comm->barrier_sense = 0;
|
||||
comm->barrier_threads_arrived = 0;
|
||||
}
|
||||
|
||||
|
||||
void bli_thrcomm_cleanup( thrcomm_t* communicator )
|
||||
void bli_thrcomm_cleanup( thrcomm_t* comm )
|
||||
{
|
||||
if ( communicator == NULL ) return;
|
||||
if ( comm == NULL ) return;
|
||||
}
|
||||
|
||||
//'Normal' barrier for openmp
|
||||
//barrier routine taken from art of multicore programming
|
||||
void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id )
|
||||
void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id )
|
||||
{
|
||||
if( communicator == NULL || communicator->n_threads == 1 )
|
||||
#if 0
|
||||
if ( comm == NULL || comm->n_threads == 1 )
|
||||
return;
|
||||
bool_t my_sense = communicator->barrier_sense;
|
||||
bool_t my_sense = comm->barrier_sense;
|
||||
dim_t my_threads_arrived;
|
||||
|
||||
_Pragma( "omp atomic capture" )
|
||||
my_threads_arrived = ++(communicator->barrier_threads_arrived);
|
||||
my_threads_arrived = ++(comm->barrier_threads_arrived);
|
||||
|
||||
if ( my_threads_arrived == communicator->n_threads )
|
||||
if ( my_threads_arrived == comm->n_threads )
|
||||
{
|
||||
communicator->barrier_threads_arrived = 0;
|
||||
communicator->barrier_sense = !communicator->barrier_sense;
|
||||
comm->barrier_threads_arrived = 0;
|
||||
comm->barrier_sense = !comm->barrier_sense;
|
||||
}
|
||||
else
|
||||
{
|
||||
volatile bool_t* listener = &communicator->barrier_sense;
|
||||
volatile bool_t* listener = &comm->barrier_sense;
|
||||
while ( *listener == my_sense ) {}
|
||||
}
|
||||
#endif
|
||||
bli_thrcomm_barrier_atomic( comm, t_id );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads)
|
||||
void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads)
|
||||
{
|
||||
if ( communicator == NULL ) return;
|
||||
communicator->sent_object = NULL;
|
||||
communicator->n_threads = n_threads;
|
||||
communicator->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads );
|
||||
bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, communicator->barriers, 0 );
|
||||
if ( comm == NULL ) return;
|
||||
comm->sent_object = NULL;
|
||||
comm->n_threads = n_threads;
|
||||
comm->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads );
|
||||
bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, comm->barriers, 0 );
|
||||
}
|
||||
|
||||
//Tree barrier used for Intel Xeon Phi
|
||||
@@ -145,14 +148,14 @@ barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_
|
||||
return me;
|
||||
}
|
||||
|
||||
void bli_thrcomm_cleanup( thrcomm_t* communicator )
|
||||
void bli_thrcomm_cleanup( thrcomm_t* comm )
|
||||
{
|
||||
if ( communicator == NULL ) return;
|
||||
for ( dim_t i = 0; i < communicator->n_threads; i++ )
|
||||
if ( comm == NULL ) return;
|
||||
for ( dim_t i = 0; i < comm->n_threads; i++ )
|
||||
{
|
||||
bli_thrcomm_tree_barrier_free( communicator->barriers[i] );
|
||||
bli_thrcomm_tree_barrier_free( comm->barriers[i] );
|
||||
}
|
||||
bli_free_intl( communicator->barriers );
|
||||
bli_free_intl( comm->barriers );
|
||||
}
|
||||
|
||||
void bli_thrcomm_tree_barrier_free( barrier_t* barrier )
|
||||
@@ -204,6 +207,7 @@ void bli_thrcomm_tree_barrier( barrier_t* barack )
|
||||
void bli_l3_thread_decorator
|
||||
(
|
||||
l3int_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
@@ -231,7 +235,7 @@ void bli_l3_thread_decorator
|
||||
thrinfo_t* thread;
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use );
|
||||
bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use );
|
||||
|
||||
// Create the root node of the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread );
|
||||
@@ -249,7 +253,7 @@ void bli_l3_thread_decorator
|
||||
);
|
||||
|
||||
// Free the control tree, if one was created locally.
|
||||
bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread );
|
||||
bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread );
|
||||
|
||||
#ifdef PRINT_THRINFO
|
||||
threads[id] = thread;
|
||||
|
||||
@@ -60,11 +60,12 @@ struct thrcomm_s
|
||||
#else
|
||||
struct thrcomm_s
|
||||
{
|
||||
void* sent_object;
|
||||
dim_t n_threads;
|
||||
void* sent_object;
|
||||
dim_t n_threads;
|
||||
|
||||
volatile bool_t barrier_sense;
|
||||
dim_t barrier_threads_arrived;
|
||||
//volatile bool_t barrier_sense;
|
||||
bool_t barrier_sense;
|
||||
dim_t barrier_threads_arrived;
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
@@ -43,81 +43,84 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads )
|
||||
return comm;
|
||||
}
|
||||
|
||||
void bli_thrcomm_free( thrcomm_t* communicator )
|
||||
void bli_thrcomm_free( thrcomm_t* comm )
|
||||
{
|
||||
if ( communicator == NULL ) return;
|
||||
bli_thrcomm_cleanup( communicator );
|
||||
bli_free_intl( communicator );
|
||||
if ( comm == NULL ) return;
|
||||
bli_thrcomm_cleanup( comm );
|
||||
bli_free_intl( comm );
|
||||
}
|
||||
|
||||
#ifdef BLIS_USE_PTHREAD_BARRIER
|
||||
|
||||
void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads)
|
||||
void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads)
|
||||
{
|
||||
if ( communicator == NULL ) return;
|
||||
communicator->sent_object = NULL;
|
||||
communicator->n_threads = n_threads;
|
||||
pthread_barrier_init( &communicator->barrier, NULL, n_threads );
|
||||
if ( comm == NULL ) return;
|
||||
comm->sent_object = NULL;
|
||||
comm->n_threads = n_threads;
|
||||
pthread_barrier_init( &comm->barrier, NULL, n_threads );
|
||||
}
|
||||
|
||||
void bli_thrcomm_cleanup( thrcomm_t* communicator )
|
||||
void bli_thrcomm_cleanup( thrcomm_t* comm )
|
||||
{
|
||||
if ( communicator == NULL ) return;
|
||||
pthread_barrier_destroy( &communicator->barrier );
|
||||
if ( comm == NULL ) return;
|
||||
pthread_barrier_destroy( &comm->barrier );
|
||||
}
|
||||
|
||||
void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id )
|
||||
void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id )
|
||||
{
|
||||
pthread_barrier_wait( &communicator->barrier );
|
||||
pthread_barrier_wait( &comm->barrier );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads)
|
||||
void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads)
|
||||
{
|
||||
if ( communicator == NULL ) return;
|
||||
communicator->sent_object = NULL;
|
||||
communicator->n_threads = n_threads;
|
||||
communicator->sense = 0;
|
||||
communicator->threads_arrived = 0;
|
||||
if ( comm == NULL ) return;
|
||||
comm->sent_object = NULL;
|
||||
comm->n_threads = n_threads;
|
||||
comm->barrier_sense = 0;
|
||||
comm->barrier_threads_arrived = 0;
|
||||
|
||||
#ifdef BLIS_USE_PTHREAD_MUTEX
|
||||
pthread_mutex_init( &communicator->mutex, NULL );
|
||||
#endif
|
||||
//#ifdef BLIS_USE_PTHREAD_MUTEX
|
||||
// pthread_mutex_init( &comm->mutex, NULL );
|
||||
//#endif
|
||||
}
|
||||
|
||||
void bli_thrcomm_cleanup( thrcomm_t* communicator )
|
||||
void bli_thrcomm_cleanup( thrcomm_t* comm )
|
||||
{
|
||||
#ifdef BLIS_USE_PTHREAD_MUTEX
|
||||
if ( communicator == NULL ) return;
|
||||
pthread_mutex_destroy( &communicator->mutex );
|
||||
#endif
|
||||
//#ifdef BLIS_USE_PTHREAD_MUTEX
|
||||
// if ( comm == NULL ) return;
|
||||
// pthread_mutex_destroy( &comm->mutex );
|
||||
//#endif
|
||||
}
|
||||
|
||||
void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id )
|
||||
void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id )
|
||||
{
|
||||
if ( communicator == NULL || communicator->n_threads == 1 ) return;
|
||||
bool_t my_sense = communicator->sense;
|
||||
#if 0
|
||||
if ( comm == NULL || comm->n_threads == 1 ) return;
|
||||
bool_t my_sense = comm->sense;
|
||||
dim_t my_threads_arrived;
|
||||
|
||||
#ifdef BLIS_USE_PTHREAD_MUTEX
|
||||
pthread_mutex_lock( &communicator->mutex );
|
||||
my_threads_arrived = ++(communicator->threads_arrived);
|
||||
pthread_mutex_unlock( &communicator->mutex );
|
||||
pthread_mutex_lock( &comm->mutex );
|
||||
my_threads_arrived = ++(comm->threads_arrived);
|
||||
pthread_mutex_unlock( &comm->mutex );
|
||||
#else
|
||||
my_threads_arrived = __sync_add_and_fetch(&(communicator->threads_arrived), 1);
|
||||
my_threads_arrived = __sync_add_and_fetch(&(comm->threads_arrived), 1);
|
||||
#endif
|
||||
|
||||
if ( my_threads_arrived == communicator->n_threads )
|
||||
if ( my_threads_arrived == comm->n_threads )
|
||||
{
|
||||
communicator->threads_arrived = 0;
|
||||
communicator->sense = !communicator->sense;
|
||||
comm->threads_arrived = 0;
|
||||
comm->sense = !comm->sense;
|
||||
}
|
||||
else
|
||||
{
|
||||
volatile bool_t* listener = &communicator->sense;
|
||||
volatile bool_t* listener = &comm->sense;
|
||||
while( *listener == my_sense ) {}
|
||||
}
|
||||
#endif
|
||||
bli_thrcomm_barrier_atomic( comm, t_id );
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -129,6 +132,7 @@ void* bli_l3_thread_entry( void* data_void );
|
||||
typedef struct thread_data
|
||||
{
|
||||
l3int_t func;
|
||||
opid_t family;
|
||||
obj_t* alpha;
|
||||
obj_t* a;
|
||||
obj_t* b;
|
||||
@@ -145,6 +149,7 @@ void* bli_l3_thread_entry( void* data_void )
|
||||
{
|
||||
thread_data_t* data = data_void;
|
||||
|
||||
opid_t family = data->family;
|
||||
obj_t* alpha = data->alpha;
|
||||
obj_t* a = data->a;
|
||||
obj_t* b = data->b;
|
||||
@@ -159,7 +164,7 @@ void* bli_l3_thread_entry( void* data_void )
|
||||
thrinfo_t* thread;
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use );
|
||||
bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use );
|
||||
|
||||
// Create the root node of the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread );
|
||||
@@ -177,7 +182,7 @@ void* bli_l3_thread_entry( void* data_void )
|
||||
);
|
||||
|
||||
// Free the control tree, if one was created locally.
|
||||
bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread );
|
||||
bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread );
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( thread );
|
||||
@@ -188,6 +193,7 @@ void* bli_l3_thread_entry( void* data_void )
|
||||
void bli_l3_thread_decorator
|
||||
(
|
||||
l3int_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
@@ -214,6 +220,7 @@ void bli_l3_thread_decorator
|
||||
{
|
||||
// Set up thread data for additional threads (beyond thread 0).
|
||||
datas[id].func = func;
|
||||
datas[id].family = family;
|
||||
datas[id].alpha = alpha;
|
||||
datas[id].a = a;
|
||||
datas[id].b = b;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user