Resolving conflicts in bli_gemm_front.c and LICENSE

Change-Id: Id24ce53896d4c1c7ceccc3e004014a0ecceb5474
This commit is contained in:
praveeng
2017-08-28 12:21:16 +05:30
110 changed files with 1544 additions and 723 deletions

View File

@@ -1,8 +1,6 @@
Copyright (C) 2017, Advanced Micro Devices, Inc.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

View File

@@ -85,9 +85,6 @@ TESTSUITE_CONF_GEN := input.general
TESTSUITE_CONF_OPS := input.operations
TESTSUITE_OUT_FILE := output.testsuite
# The name of the file where the version string is stored.
VERSION_FILE := version
# The name of the "special" directories, which contain source code that
# use non-standard compiler flags.
NOOPT_DIR := noopt
@@ -141,7 +138,6 @@ BASE_LIB_PATH := ./$(LIB_DIR)/$(CONFIG_NAME)
# Construct the architecture-version string, which will be used to name the
# library upon installation.
VERSION := $(shell cat $(DIST_PATH)/$(VERSION_FILE))
VERS_CONF := $(VERSION)-$(CONFIG_NAME)
# --- Library names ---

View File

@@ -313,7 +313,7 @@ This project and its associated research was partially sponsored by grants from
[Microsoft](http://www.microsoft.com/), [Intel](http://www.intel.com/), [Texas
Instruments](http://www.ti.com/), and [AMD](http://www.amd.com/), as well as
grants from the [National Science Foundation](http://www.nsf.gov/) (Awards
CCF-0917167 ACI-1148125/1340293, and CCF-1320112).
CCF-0917167, ACI-1148125/1340293, CCF-1320112, and ACI-1550493).
_Any opinions, findings and conclusions or recommendations expressed in this
material are those of the author(s) and do not necessarily reflect the views of

View File

@@ -36,6 +36,10 @@
ifndef CONFIG_MK_INCLUDED
CONFIG_MK_INCLUDED := yes
# The version string. This could be the official string or a custom
# string forced at configure-time.
VERSION := @version@
# The name of the configuration sub-directory.
CONFIG_NAME := @config_name@

View File

@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -59,7 +59,7 @@ CVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunr
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -83,7 +83,7 @@ endif
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -58,7 +58,8 @@ CVECFLAGS :=
# --- Determine the archiver and related flags ---
AR := emar
ARFLAGS := cru
RANLIB := emranlib
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -83,7 +83,7 @@ endif
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -94,7 +94,7 @@ endif
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -81,7 +81,7 @@ endif
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -83,7 +83,7 @@ endif
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

View File

@@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)

36
configure vendored
View File

@@ -123,6 +123,12 @@ print_usage()
echo " compatibility layer. This automatically enables the"
echo " BLAS compatibility layer as well."
echo " "
echo " --force-version=STRING"
echo " "
echo " Force configure to use an arbitrary version string"
echo " STRING. This option may be useful when repackaging"
echo " custom versions of BLIS by outside organizations."
echo " "
echo " -h, --help Output this information and quit."
echo " "
echo " Environment Variables:"
@@ -232,6 +238,7 @@ main()
blas2blis_int_type_size=32
enable_blas2blis='yes'
enable_cblas='no'
force_version='no'
# The path to the auto-detection script.
auto_detect_sh="${build_dirpath}/auto-detect/auto-detect.sh"
@@ -247,14 +254,6 @@ main()
dummy_file='_blis_dir_detect.tmp'
# Check whether we need to update the version file.
${update_version_file_sh} -o "${script_name}" "${version_filepath}"
# Query which version of BLIS this is.
version=$(cat ${version_filepath})
# Process our command line options.
while getopts ":hp:d:t:qi:b:-:" opt; do
case $opt in
@@ -323,6 +322,9 @@ main()
disable-cblas)
enable_cblas='no'
;;
force-version=*)
force_version=${OPTARG#*=}
;;
*)
print_usage
;;
@@ -375,10 +377,27 @@ main()
done
# Check whether we need to update the version file.
${update_version_file_sh} -o "${script_name}" "${version_filepath}"
# Query which version of BLIS this is.
version=$(cat ${version_filepath})
# Initial message.
echo "${script_name}: starting configuration of BLIS ${version}."
# Check if the user requested a custom version string.
if [ "x${force_version}" = "xno" ]; then
echo "${script_name}: configuring with official version string."
else
echo "${script_name}: configuring with custom version string '${force_version}'."
version="${force_version}"
fi
# Set config_name based on the number of arguments leftover (after command
# line option processing).
if [ $# = "0" ]; then
@@ -574,6 +593,7 @@ main()
# to config_mk_out.
echo "${script_name}: creating ${config_mk_out_path} from ${config_mk_in_path}"
cat "${config_mk_in_path}" \
| sed "s/@version@/${version}/g" \
| sed "s/@config_name@/${config_name}/g" \
| sed "s/@dist_path@/${dist_path_esc}/g" \
| sed "s/@CC@/${cc_esc}/g" \

View File

@@ -43,7 +43,7 @@
\
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with the kernel associated with the current
operation. */ \
@@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
} \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( addv, BLIS_ADDV_KER )
@@ -70,7 +70,7 @@ GENFRONT( swapv, BLIS_SWAPV_KER )
\
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(dep1,_cntx_init)( dt, cntx ); \
@@ -84,7 +84,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
} \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv )
@@ -95,7 +95,7 @@ GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv )
\
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
@@ -106,7 +106,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
} \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( axpyv, BLIS_AXPYV_KER, addv )
@@ -118,7 +118,7 @@ GENFRONT( scalv, BLIS_SCALV_KER, setv )
\
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(dep1,_cntx_init)( dt, cntx ); \
@@ -130,7 +130,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
} \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( scal2v, BLIS_SCAL2V_KER, setv, copyv )

View File

@@ -47,7 +47,7 @@ void bli_packv_cntl_init( void )
void bli_packv_cntl_finalize( void )
{
bli_cntl_obj_free( packv_cntl );
bli_cntl_free_node( packv_cntl );
}
packv_t* bli_packv_cntl_obj_create( impl_t impl_type,
@@ -105,7 +105,7 @@ cntl_t* bli_packv_cntl_obj_create
// that no blocksize partitioning is performed. bli_cntl_free() will rely
// on this information to know how to step through the thrinfo_t tree in
// sync with the cntl_t tree.
cntl = bli_cntl_obj_create
cntl = bli_cntl_create_node
(
BLIS_NO_PART,
var_func,

View File

@@ -44,7 +44,7 @@ void bli_scalv_cntl_init()
void bli_scalv_cntl_finalize()
{
bli_cntl_obj_free( scalv_cntl );
bli_cntl_free_node( scalv_cntl );
}

View File

@@ -44,7 +44,7 @@ void bli_unpackv_cntl_init()
void bli_unpackv_cntl_finalize()
{
bli_cntl_obj_free( unpackv_cntl );
bli_cntl_free_node( unpackv_cntl );
}
unpackv_t* bli_unpackv_cntl_obj_create( impl_t impl_type,

View File

@@ -43,7 +43,7 @@
\
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
@@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
\
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( addd, addv )

View File

@@ -43,7 +43,7 @@
\
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
@@ -54,7 +54,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
} \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv )
@@ -65,7 +65,7 @@ GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv )
\
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname1,_cntx_init)( dt, cntx ); \
@@ -77,7 +77,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
} \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv )
@@ -88,7 +88,7 @@ GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv )
\
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
@@ -105,7 +105,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
} \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv )
@@ -116,7 +116,7 @@ GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv )
\
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname1,_cntx_init)( dt, cntx ); \
@@ -135,7 +135,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
} \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( dotxf, BLIS_DOTXF_KER, dotv, dotxv )

View File

@@ -43,7 +43,7 @@
\
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname,_cntx_init)( dt, cntx ); \
@@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
\
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( addm, addv )
@@ -66,7 +66,7 @@ GENFRONT( subm, subv )
\
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernel dependencies. */ \
PASTEMAC(depname1,_cntx_init)( dt, cntx ); \
@@ -75,7 +75,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
\
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( copym, copyv, setv )

View File

@@ -34,7 +34,7 @@
#include "blis.h"
cntl_t* bli_packm_cntl_obj_create
cntl_t* bli_packm_cntl_create_node
(
void* var_func,
void* packm_var_func,
@@ -69,8 +69,9 @@ cntl_t* bli_packm_cntl_obj_create
// that no blocksize partitioning is performed. bli_cntl_free() will rely
// on this information to know how to step through the thrinfo_t tree in
// sync with the cntl_t tree.
cntl = bli_cntl_obj_create
cntl = bli_cntl_create_node
(
BLIS_NOID,
BLIS_NO_PART,
var_func,
params,

View File

@@ -80,7 +80,7 @@ typedef struct packm_params_s packm_params_t;
// -----------------------------------------------------------------------------
cntl_t* bli_packm_cntl_obj_create
cntl_t* bli_packm_cntl_create_node
(
void* var_func,
void* packm_var_func,

View File

@@ -41,7 +41,7 @@
void bli_packm_cntx_init( num_t dt, cntx_t* cntx )
{
bli_cntx_obj_create( cntx );
bli_cntx_create( cntx );
// Initialize the context with kernels that may be needed for the
// current operation.
@@ -57,5 +57,5 @@ void bli_packm_cntx_init( num_t dt, cntx_t* cntx )
void bli_packm_cntx_finalize( cntx_t* cntx )
{
bli_cntx_obj_free( cntx );
bli_cntx_free( cntx );
}

View File

@@ -34,7 +34,7 @@
#include "blis.h"
cntl_t* bli_scalm_cntl_obj_create
cntl_t* bli_scalm_cntl_create_node
(
void* var_func,
cntl_t* sub_node
@@ -46,8 +46,9 @@ cntl_t* bli_scalm_cntl_obj_create
// that no blocksize partitioning is performed. bli_cntl_free() will rely
// on this information to know how to step through the thrinfo_t tree in
// sync with the cntl_t tree.
cntl = bli_cntl_obj_create
cntl = bli_cntl_create_node
(
BLIS_NOID,
BLIS_NO_PART,
var_func,
NULL,

View File

@@ -33,7 +33,7 @@
*/
cntl_t* bli_scalm_cntl_obj_create
cntl_t* bli_scalm_cntl_create_node
(
void* var_func,
cntl_t* sub_node

View File

@@ -34,7 +34,7 @@
#include "blis.h"
cntl_t* bli_unpackm_cntl_obj_create
cntl_t* bli_unpackm_cntl_create_node
(
void* var_func,
void* unpackm_var_func,
@@ -55,8 +55,9 @@ cntl_t* bli_unpackm_cntl_obj_create
// that no blocksize partitioning is performed. bli_cntl_free() will rely
// on this information to know how to step through the thrinfo_t tree in
// sync with the cntl_t tree.
cntl = bli_cntl_obj_create
cntl = bli_cntl_create_node
(
BLIS_NOID,
BLIS_NO_PART,
var_func,
params,

View File

@@ -45,7 +45,7 @@ typedef struct unpackm_params_s unpackm_params_t;
// -----------------------------------------------------------------------------
cntl_t* bli_unpackm_cntl_obj_create
cntl_t* bli_unpackm_cntl_create_node
(
void* var_func,
void* unpackm_var_func,

View File

@@ -45,7 +45,7 @@
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
/* Perform basic setup on the context. */ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernels employed by the current
operation. */ \
@@ -127,7 +127,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
/* Free the context and all memory allocated to it. */ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( trmv )
GENFRONT( trsv )
@@ -139,7 +139,7 @@ GENFRONT( trsv )
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
/* Perform basic setup on the context. */ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernels employed by the current
operation. */ \
@@ -159,7 +159,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
/* Free the context and all memory allocated to it. */ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( ger )
@@ -173,7 +173,7 @@ GENFRONT( syr )
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
/* Perform basic setup on the context. */ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernels employed by the current
operation. */ \
@@ -211,7 +211,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
/* Free the context and all memory allocated to it. */ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( hemv )
@@ -224,7 +224,7 @@ GENFRONT( symv )
void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
{ \
/* Perform basic setup on the context. */ \
bli_cntx_obj_create( cntx ); \
bli_cntx_create( cntx ); \
\
/* Initialize the context with kernels employed by the current
operation. */ \
@@ -246,7 +246,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \
void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \
{ \
/* Free the context and all memory allocated to it. */ \
bli_cntx_obj_free( cntx ); \
bli_cntx_free( cntx ); \
}
GENFRONT( her2 )

View File

@@ -152,17 +152,17 @@ void bli_gemv_cntl_init()
void bli_gemv_cntl_finalize()
{
bli_cntl_obj_free( gemv_cntl_bs_ke_dot );
bli_cntl_obj_free( gemv_cntl_bs_ke_axpy );
bli_cntl_free_node( gemv_cntl_bs_ke_dot );
bli_cntl_free_node( gemv_cntl_bs_ke_axpy );
bli_cntl_obj_free( gemv_cntl_rp_bs_dot );
bli_cntl_obj_free( gemv_cntl_rp_bs_axpy );
bli_cntl_free_node( gemv_cntl_rp_bs_dot );
bli_cntl_free_node( gemv_cntl_rp_bs_axpy );
bli_cntl_obj_free( gemv_cntl_cp_bs_dot );
bli_cntl_obj_free( gemv_cntl_cp_bs_axpy );
bli_cntl_free_node( gemv_cntl_cp_bs_dot );
bli_cntl_free_node( gemv_cntl_cp_bs_axpy );
bli_cntl_obj_free( gemv_cntl_ge_dot );
bli_cntl_obj_free( gemv_cntl_ge_axpy );
bli_cntl_free_node( gemv_cntl_ge_dot );
bli_cntl_free_node( gemv_cntl_ge_axpy );
}

View File

@@ -145,17 +145,17 @@ void bli_ger_cntl_init()
void bli_ger_cntl_finalize()
{
bli_cntl_obj_free( ger_cntl_bs_ke_row );
bli_cntl_obj_free( ger_cntl_bs_ke_col );
bli_cntl_free_node( ger_cntl_bs_ke_row );
bli_cntl_free_node( ger_cntl_bs_ke_col );
bli_cntl_obj_free( ger_cntl_rp_bs_row );
bli_cntl_obj_free( ger_cntl_rp_bs_col );
bli_cntl_free_node( ger_cntl_rp_bs_row );
bli_cntl_free_node( ger_cntl_rp_bs_col );
bli_cntl_obj_free( ger_cntl_cp_bs_row );
bli_cntl_obj_free( ger_cntl_cp_bs_col );
bli_cntl_free_node( ger_cntl_cp_bs_row );
bli_cntl_free_node( ger_cntl_cp_bs_col );
bli_cntl_obj_free( ger_cntl_ge_row );
bli_cntl_obj_free( ger_cntl_ge_col );
bli_cntl_free_node( ger_cntl_ge_row );
bli_cntl_free_node( ger_cntl_ge_col );
}

View File

@@ -108,10 +108,10 @@ void bli_hemv_cntl_init()
void bli_hemv_cntl_finalize()
{
bli_cntl_obj_free( hemv_cntl_bs_ke_lrow_ucol );
bli_cntl_obj_free( hemv_cntl_bs_ke_lcol_urow );
bli_cntl_obj_free( hemv_cntl_ge_lrow_ucol );
bli_cntl_obj_free( hemv_cntl_ge_lcol_urow );
bli_cntl_free_node( hemv_cntl_bs_ke_lrow_ucol );
bli_cntl_free_node( hemv_cntl_bs_ke_lcol_urow );
bli_cntl_free_node( hemv_cntl_ge_lrow_ucol );
bli_cntl_free_node( hemv_cntl_ge_lcol_urow );
}

View File

@@ -97,10 +97,10 @@ void bli_her_cntl_init()
void bli_her_cntl_finalize()
{
bli_cntl_obj_free( her_cntl_bs_ke_lrow_ucol );
bli_cntl_obj_free( her_cntl_bs_ke_lcol_urow );
bli_cntl_obj_free( her_cntl_ge_lrow_ucol );
bli_cntl_obj_free( her_cntl_ge_lcol_urow );
bli_cntl_free_node( her_cntl_bs_ke_lrow_ucol );
bli_cntl_free_node( her_cntl_bs_ke_lcol_urow );
bli_cntl_free_node( her_cntl_ge_lrow_ucol );
bli_cntl_free_node( her_cntl_ge_lcol_urow );
}

View File

@@ -101,10 +101,10 @@ void bli_her2_cntl_init()
void bli_her2_cntl_finalize()
{
bli_cntl_obj_free( her2_cntl_bs_ke_lrow_ucol );
bli_cntl_obj_free( her2_cntl_bs_ke_lcol_urow );
bli_cntl_obj_free( her2_cntl_ge_lrow_ucol );
bli_cntl_obj_free( her2_cntl_ge_lcol_urow );
bli_cntl_free_node( her2_cntl_bs_ke_lrow_ucol );
bli_cntl_free_node( her2_cntl_bs_ke_lcol_urow );
bli_cntl_free_node( her2_cntl_ge_lrow_ucol );
bli_cntl_free_node( her2_cntl_ge_lcol_urow );
}

View File

@@ -98,10 +98,10 @@ void bli_trmv_cntl_init()
void bli_trmv_cntl_finalize()
{
bli_cntl_obj_free( trmv_cntl_bs_ke_nrow_tcol );
bli_cntl_obj_free( trmv_cntl_bs_ke_ncol_trow );
bli_cntl_obj_free( trmv_cntl_ge_nrow_tcol );
bli_cntl_obj_free( trmv_cntl_ge_ncol_trow );
bli_cntl_free_node( trmv_cntl_bs_ke_nrow_tcol );
bli_cntl_free_node( trmv_cntl_bs_ke_ncol_trow );
bli_cntl_free_node( trmv_cntl_ge_nrow_tcol );
bli_cntl_free_node( trmv_cntl_ge_ncol_trow );
}

View File

@@ -101,10 +101,10 @@ void bli_trsv_cntl_init()
void bli_trsv_cntl_finalize()
{
bli_cntl_obj_free( trsv_cntl_bs_ke_nrow_tcol );
bli_cntl_obj_free( trsv_cntl_bs_ke_ncol_trow );
bli_cntl_obj_free( trsv_cntl_ge_nrow_tcol );
bli_cntl_obj_free( trsv_cntl_ge_ncol_trow );
bli_cntl_free_node( trsv_cntl_bs_ke_nrow_tcol );
bli_cntl_free_node( trsv_cntl_bs_ke_ncol_trow );
bli_cntl_free_node( trsv_cntl_ge_nrow_tcol );
bli_cntl_free_node( trsv_cntl_ge_ncol_trow );
}

View File

@@ -43,10 +43,11 @@ dim_t bli_l3_determine_kc
obj_t* a,
obj_t* b,
bszid_t bszid,
cntx_t* cntx
cntx_t* cntx,
cntl_t* cntl
)
{
opid_t family = bli_cntx_family( cntx );
opid_t family = bli_cntl_family( cntl );
if ( family == BLIS_GEMM )
return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx );

View File

@@ -32,6 +32,18 @@
*/
dim_t bli_l3_determine_kc
(
dir_t direct,
dim_t i,
dim_t dim,
obj_t* a,
obj_t* b,
bszid_t bszid,
cntx_t* cntx,
cntl_t* cntl
);
#undef GENPROT
#define GENPROT( opname ) \
@@ -47,8 +59,6 @@ dim_t PASTEMAC0(opname) \
cntx_t* cntx \
);
GENPROT( l3_determine_kc )
GENPROT( gemm_determine_kc )
GENPROT( herk_determine_kc )
GENPROT( trmm_determine_kc )

View File

@@ -37,10 +37,10 @@
void bli_l3_cntl_create_if
(
opid_t family,
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl_orig,
cntl_t** cntl_use
)
@@ -49,8 +49,6 @@ void bli_l3_cntl_create_if
// tree as a function of the operation family.
if ( cntl_orig == NULL )
{
opid_t family = bli_cntx_get_family( cntx );
if ( family == BLIS_GEMM ||
family == BLIS_HERK ||
family == BLIS_TRMM )
@@ -73,6 +71,10 @@ void bli_l3_cntl_create_if
// instead (so that threads can use its local tree as a place to
// cache things like pack mem_t entries).
*cntl_use = bli_cntl_copy( cntl_orig );
// Recursively set the family fields of the newly copied control tree
// nodes.
bli_cntl_mark_family( family, *cntl_use );
}
}
@@ -81,7 +83,6 @@ void bli_l3_cntl_free_if
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl_orig,
cntl_t* cntl_use,
thrinfo_t* thread
@@ -91,7 +92,7 @@ void bli_l3_cntl_free_if
// been created, so we now must free it.
if ( cntl_orig == NULL )
{
opid_t family = bli_cntx_get_family( cntx );
opid_t family = bli_cntl_family( cntl_use );
if ( family == BLIS_GEMM ||
family == BLIS_HERK ||

View File

@@ -39,10 +39,10 @@
void bli_l3_cntl_create_if
(
opid_t family,
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl_orig,
cntl_t** cntl_use
);
@@ -52,7 +52,6 @@ void bli_l3_cntl_free_if
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl_orig,
cntl_t* cntl_use,
thrinfo_t* thread

View File

@@ -41,7 +41,7 @@
void bli_gemm_cntx_init( num_t dt, cntx_t* cntx )
{
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.
@@ -76,7 +76,7 @@ void bli_gemm_cntx_finalize( cntx_t* cntx )
void bli_trsm_cntx_init( num_t dt, cntx_t* cntx )
{
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.

View File

@@ -39,11 +39,11 @@ dir_t bli_l3_direct
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx
cntl_t* cntl
)
{
// Query the operation family.
opid_t family = bli_cntx_family( cntx );
opid_t family = bli_cntl_family( cntl );
if ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c );
else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c );

View File

@@ -37,7 +37,7 @@ dir_t bli_l3_direct
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx
cntl_t* cntl
);
// -----------------------------------------------------------------------------

View File

@@ -115,12 +115,13 @@ void bli_l3_packm
// buffer, then a block has already been acquired from the memory
// broker and cached in the control tree.
// BUT, we need to make sure that the mem_t object is not associated
// with a block that is too small given the size of the packed matrix
// that we need, according to the return value from packm_init().
// As a sanity check, we should make sure that the mem_t object isn't
// associated with a block that is too small compared to the size of
// the packed matrix buffer that is needed, according to the return
// value from packm_init().
siz_t cntl_mem_size = bli_mem_size( cntl_mem_p );
if ( size_needed < cntl_mem_size )
if ( cntl_mem_size < size_needed )
{
if ( bli_thread_am_ochief( thread ) )
{

View File

@@ -40,11 +40,11 @@ void bli_l3_prune_unref_mparts_m
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx
cntl_t* cntl
)
{
// Query the operation family.
opid_t family = bli_cntx_family( cntx );
opid_t family = bli_cntl_family( cntl );
if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm.
else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c );
@@ -61,11 +61,11 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \
obj_t* a, \
obj_t* b, \
obj_t* c, \
cntx_t* cntx \
cntl_t* cntl \
) \
{ \
/* Query the operation family. */ \
opid_t family = bli_cntx_family( cntx ); \
opid_t family = bli_cntl_family( cntl ); \
\
if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \
else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \

View File

@@ -41,7 +41,7 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \
obj_t* a, \
obj_t* b, \
obj_t* c, \
cntx_t* cntx \
cntl_t* cntl \
);
GENPROT( m )

View File

@@ -53,10 +53,10 @@ void bli_gemm_blk_var1
dim_t my_start, my_end;
// Determine the direction in which to partition (forwards or backwards).
direct = bli_l3_direct( a, b, c, cntx );
direct = bli_l3_direct( a, b, c, cntl );
// Prune any zero region that exists along the partitioning dimension.
bli_l3_prune_unref_mparts_m( a, b, c, cntx );
bli_l3_prune_unref_mparts_m( a, b, c, cntl );
// Determine the current thread's subpartition range.
bli_thread_get_range_mdim

View File

@@ -53,10 +53,10 @@ void bli_gemm_blk_var2
dim_t my_start, my_end;
// Determine the direction in which to partition (forwards or backwards).
direct = bli_l3_direct( a, b, c, cntx );
direct = bli_l3_direct( a, b, c, cntl );
// Prune any zero region that exists along the partitioning dimension.
bli_l3_prune_unref_mparts_n( a, b, c, cntx );
bli_l3_prune_unref_mparts_n( a, b, c, cntl );
// Determine the current thread's subpartition range.
bli_thread_get_range_ndim

View File

@@ -53,10 +53,10 @@ void bli_gemm_blk_var3
dim_t k_trans;
// Determine the direction in which to partition (forwards or backwards).
direct = bli_l3_direct( a, b, c, cntx );
direct = bli_l3_direct( a, b, c, cntl );
// Prune any zero region that exists along the partitioning dimension.
bli_l3_prune_unref_mparts_k( a, b, c, cntx );
bli_l3_prune_unref_mparts_k( a, b, c, cntl );
// Query dimension in partitioning direction.
k_trans = bli_obj_width_after_trans( *a );
@@ -66,7 +66,7 @@ void bli_gemm_blk_var3
{
// Determine the current algorithmic blocksize.
b_alg = bli_l3_determine_kc( direct, i, k_trans, a, b,
bli_cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx, cntl );
// Acquire partitions for A1 and B1.
bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
@@ -109,7 +109,7 @@ void bli_gemm_blk_var3
// row-panel of C, and thus beta is applied to all of C exactly once.
// Thus, for neither trmm nor trmm3 should we reset the scalar on C
// after the first iteration.
if ( bli_cntx_get_family( cntx ) != BLIS_TRMM )
if ( bli_cntl_family( cntl ) != BLIS_TRMM )
if ( i == 0 ) bli_obj_scalar_reset( c );
}
}

View File

@@ -56,22 +56,24 @@ cntl_t* bli_gemmbp_cntl_create
else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_obj_create
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node
(
family, // the operation family
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
NULL // no sub-node; this is the leaf of the tree.
);
cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_obj_create
cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_create_node
(
family,
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
gemm_cntl_bu_ke
);
// Create a node for packing matrix A.
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
(
bli_gemm_packa, // pack the left-hand operand
bli_packm_blk_var1,
@@ -86,15 +88,16 @@ cntl_t* bli_gemmbp_cntl_create
);
// Create a node for partitioning the m dimension by MC.
cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_obj_create
cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_create_node
(
family,
BLIS_MC,
bli_gemm_blk_var1,
gemm_cntl_packa
);
// Create a node for packing matrix B.
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
(
bli_gemm_packb, // pack the right-hand operand
bli_packm_blk_var1,
@@ -109,16 +112,18 @@ cntl_t* bli_gemmbp_cntl_create
);
// Create a node for partitioning the k dimension by KC.
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node
(
family,
BLIS_KC,
bli_gemm_blk_var3,
gemm_cntl_packb
);
// Create a node for partitioning the n dimension by NC.
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node
(
family,
BLIS_NC,
bli_gemm_blk_var2,
gemm_cntl_mm_op
@@ -141,15 +146,17 @@ cntl_t* bli_gemmpb_cntl_create
//else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create
cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_create_node
(
family, // the operation family
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
NULL // no sub-node; this is the leaf of the tree.
);
cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create
cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_create_node
(
family,
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
gemm_cntl_ub_ke
@@ -157,7 +164,7 @@ cntl_t* bli_gemmpb_cntl_create
// Create a node for packing matrix A (which is really the right-hand
// operand "B").
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create
cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
(
bli_gemm_packb, // pack the right-hand operand
bli_packm_blk_var1,
@@ -172,8 +179,9 @@ cntl_t* bli_gemmpb_cntl_create
);
// Create a node for partitioning the n dimension by MC.
cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create
cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_create_node
(
family,
BLIS_MC,
bli_gemm_blk_var2,
gemm_cntl_packb
@@ -181,7 +189,7 @@ cntl_t* bli_gemmpb_cntl_create
// Create a node for packing matrix B (which is really the left-hand
// operand "A").
cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create
cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
(
bli_gemm_packa, // pack the left-hand operand
bli_packm_blk_var1,
@@ -196,16 +204,18 @@ cntl_t* bli_gemmpb_cntl_create
);
// Create a node for partitioning the k dimension by KC.
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node
(
family,
BLIS_KC,
bli_gemm_blk_var3,
gemm_cntl_packa
);
// Create a node for partitioning the m dimension by NC.
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node
(
family,
BLIS_NC,
bli_gemm_blk_var1,
gemm_cntl_mm_op
@@ -227,13 +237,14 @@ void bli_gemm_cntl_free
// -----------------------------------------------------------------------------
cntl_t* bli_gemm_cntl_obj_create
cntl_t* bli_gemm_cntl_create_node
(
opid_t family,
bszid_t bszid,
void* var_func,
cntl_t* sub_node
)
{
return bli_cntl_obj_create( bszid, var_func, NULL, sub_node );
return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node );
}

View File

@@ -59,8 +59,9 @@ void bli_gemm_cntl_free
// -----------------------------------------------------------------------------
cntl_t* bli_gemm_cntl_obj_create
cntl_t* bli_gemm_cntl_create_node
(
opid_t family,
bszid_t bszid,
void* var_func,
cntl_t* sub_node

View File

@@ -46,70 +46,68 @@ void bli_gemm_front
cntl_t* cntl
)
{
#ifdef BLIS_SMALL_MATRIX_ENABLE
gint_t status = bli_gemm_small_matrix(alpha, a, b, beta, c, cntx, cntl);
if(BLIS_SUCCESS != status)
#endif
{
obj_t a_local;
obj_t b_local;
obj_t c_local;
obj_t a_local;
obj_t b_local;
obj_t c_local;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c, cntx );
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c, cntx );
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
{
bli_scalm( beta, c );
return;
}
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
{
bli_scalm( beta, c );
return;
}
// Reinitialize the memory allocator to accommodate the blocksizes
// in the current context.
bli_memsys_reinit( cntx );
// Reinitialize the memory allocator to accommodate the blocksizes
// in the current context.
bli_memsys_reinit( cntx );
// Alias A, B, and C in case we need to apply transformations.
bli_obj_alias_to( *a, a_local );
bli_obj_alias_to( *b, b_local );
bli_obj_alias_to( *c, c_local );
// Alias A, B, and C in case we need to apply transformations.
bli_obj_alias_to( *a, a_local );
bli_obj_alias_to( *b, b_local );
bli_obj_alias_to( *c, c_local );
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_obj_swap( a_local, b_local );
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( bli_cntx_l3_ukr_eff_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_obj_swap( a_local, b_local );
bli_obj_induce_trans( a_local );
bli_obj_induce_trans( b_local );
bli_obj_induce_trans( c_local );
}
bli_obj_induce_trans( a_local );
bli_obj_induce_trans( b_local );
bli_obj_induce_trans( c_local );
}
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_GEMM, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
bli_obj_length( c_local ),
bli_obj_width( c_local ),
bli_obj_width( a_local ) );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
bli_obj_length( c_local ),
bli_obj_width( c_local ),
bli_obj_width( a_local ) );
// Invoke the internal back-end via the thread handler.
bli_l3_thread_decorator
(
bli_gemm_int,
alpha,
&a_local,
&b_local,
beta,
&c_local,
cntx,
cntl
);
// Invoke the internal back-end via the thread handler.
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_GEMM, // operation family id
alpha,
&a_local,
&b_local,
beta,
&c_local,
cntx,
cntl
);
}
}

View File

@@ -89,9 +89,6 @@ void bli_hemm_front
bli_obj_swap( a_local, b_local );
}
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_GEMM, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx,
bli_obj_length( c_local ),
@@ -102,6 +99,7 @@ void bli_hemm_front
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_GEMM, // operation family id
alpha,
&a_local,
&b_local,

View File

@@ -107,9 +107,6 @@ void bli_her2k_front
bli_obj_induce_trans( c_local );
}
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_HERK, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx,
bli_obj_length( c_local ),
@@ -122,6 +119,7 @@ void bli_her2k_front
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_HERK, // operation family id
alpha,
&a_local,
&bh_local,
@@ -134,6 +132,7 @@ void bli_her2k_front
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_HERK, // operation family id
&alpha_conj,
&b_local,
&ah_local,

View File

@@ -87,9 +87,6 @@ void bli_herk_front
bli_obj_induce_trans( c_local );
}
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_HERK, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx,
bli_obj_length( c_local ),
@@ -100,6 +97,7 @@ void bli_herk_front
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_HERK, // operation family id
alpha,
&a_local,
&ah_local,

View File

@@ -88,9 +88,6 @@ void bli_symm_front
bli_obj_swap( a_local, b_local );
}
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_GEMM, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx,
bli_obj_length( c_local ),
@@ -101,6 +98,7 @@ void bli_symm_front
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_GEMM, // operation family id
alpha,
&a_local,
&b_local,

View File

@@ -88,9 +88,6 @@ void bli_syr2k_front
bli_obj_induce_trans( c_local );
}
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_HERK, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx,
bli_obj_length( c_local ),
@@ -103,6 +100,7 @@ void bli_syr2k_front
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_HERK, // operation family id
alpha,
&a_local,
&bt_local,
@@ -115,6 +113,7 @@ void bli_syr2k_front
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_HERK, // operation family id
alpha,
&b_local,
&at_local,

View File

@@ -81,9 +81,6 @@ void bli_syrk_front
bli_obj_induce_trans( c_local );
}
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_HERK, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx,
bli_obj_length( c_local ),
@@ -94,6 +91,7 @@ void bli_syrk_front
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_HERK, // operation family id
alpha,
&a_local,
&at_local,

View File

@@ -131,9 +131,6 @@ void bli_trmm_front
bli_obj_set_as_root( b_local );
bli_obj_set_as_root( c_local );
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_TRMM, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx,
bli_obj_length( c_local ),
@@ -144,6 +141,7 @@ void bli_trmm_front
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_TRMM, // operation family id
alpha,
&a_local,
&b_local,

View File

@@ -130,9 +130,6 @@ void bli_trmm3_front
bli_obj_set_as_root( b_local );
bli_obj_set_as_root( c_local );
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_TRMM, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx,
bli_obj_length( c_local ),
@@ -143,6 +140,7 @@ void bli_trmm3_front
bli_l3_thread_decorator
(
bli_gemm_int,
BLIS_TRMM, // operation family id
alpha,
&a_local,
&b_local,

View File

@@ -53,10 +53,10 @@ void bli_trsm_blk_var1
dim_t my_start, my_end;
// Determine the direction in which to partition (forwards or backwards).
direct = bli_l3_direct( a, b, c, cntx );
direct = bli_l3_direct( a, b, c, cntl );
// Prune any zero region that exists along the partitioning dimension.
bli_l3_prune_unref_mparts_m( a, b, c, cntx );
bli_l3_prune_unref_mparts_m( a, b, c, cntl );
// Determine the current thread's subpartition range.
bli_thread_get_range_mdim

View File

@@ -53,10 +53,10 @@ void bli_trsm_blk_var2
dim_t my_start, my_end;
// Determine the direction in which to partition (forwards or backwards).
direct = bli_l3_direct( a, b, c, cntx );
direct = bli_l3_direct( a, b, c, cntl );
// Prune any zero region that exists along the partitioning dimension.
bli_l3_prune_unref_mparts_n( a, b, c, cntx );
bli_l3_prune_unref_mparts_n( a, b, c, cntl );
// Determine the current thread's subpartition range.
bli_thread_get_range_ndim

View File

@@ -53,10 +53,10 @@ void bli_trsm_blk_var3
dim_t k_trans;
// Determine the direction in which to partition (forwards or backwards).
direct = bli_l3_direct( a, b, c, cntx );
direct = bli_l3_direct( a, b, c, cntl );
// Prune any zero region that exists along the partitioning dimension.
bli_l3_prune_unref_mparts_k( a, b, c, cntx );
bli_l3_prune_unref_mparts_k( a, b, c, cntl );
// Query dimension in partitioning direction.
k_trans = bli_obj_width_after_trans( *a );

View File

@@ -50,23 +50,27 @@ cntl_t* bli_trsm_l_cntl_create
{
void* macro_kernel_p = bli_trsm_xx_ker_var2;
const opid_t family = BLIS_TRSM;
// Create two nodes for the macro-kernel.
cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create
cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node
(
family, // the operation family
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
NULL // no sub-node; this is the leaf of the tree.
);
cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create
cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node
(
family,
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
trsm_cntl_bu_ke
);
// Create a node for packing matrix A.
cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create
cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
(
bli_trsm_packa,
bli_packm_blk_var1,
@@ -81,15 +85,16 @@ cntl_t* bli_trsm_l_cntl_create
);
// Create a node for partitioning the m dimension by MC.
cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create
cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node
(
family,
BLIS_MC,
bli_trsm_blk_var1,
trsm_cntl_packa
);
// Create a node for packing matrix B.
cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create
cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
(
bli_trsm_packb,
bli_packm_blk_var1,
@@ -104,16 +109,18 @@ cntl_t* bli_trsm_l_cntl_create
);
// Create a node for partitioning the k dimension by KC.
cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create
cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node
(
family,
BLIS_KC,
bli_trsm_blk_var3,
trsm_cntl_packb
);
// Create a node for partitioning the n dimension by NC.
cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create
cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node
(
family,
BLIS_NC,
bli_trsm_blk_var2,
trsm_cntl_mm_op
@@ -129,23 +136,27 @@ cntl_t* bli_trsm_r_cntl_create
{
void* macro_kernel_p = bli_trsm_xx_ker_var2;
const opid_t family = BLIS_TRSM;
// Create two nodes for the macro-kernel.
cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create
cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node
(
family,
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
NULL // no sub-node; this is the leaf of the tree.
);
cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create
cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node
(
family,
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
trsm_cntl_bu_ke
);
// Create a node for packing matrix A.
cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create
cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
(
bli_trsm_packa,
bli_packm_blk_var1,
@@ -160,15 +171,16 @@ cntl_t* bli_trsm_r_cntl_create
);
// Create a node for partitioning the m dimension by MC.
cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create
cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node
(
family,
BLIS_MC,
bli_trsm_blk_var1,
trsm_cntl_packa
);
// Create a node for packing matrix B.
cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create
cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
(
bli_trsm_packb,
bli_packm_blk_var1,
@@ -183,16 +195,18 @@ cntl_t* bli_trsm_r_cntl_create
);
// Create a node for partitioning the k dimension by KC.
cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create
cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node
(
family,
BLIS_KC,
bli_trsm_blk_var3,
trsm_cntl_packb
);
// Create a node for partitioning the n dimension by NC.
cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create
cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node
(
family,
BLIS_NC,
bli_trsm_blk_var2,
trsm_cntl_mm_op
@@ -212,13 +226,14 @@ void bli_trsm_cntl_free
// -----------------------------------------------------------------------------
cntl_t* bli_trsm_cntl_obj_create
cntl_t* bli_trsm_cntl_create_node
(
opid_t family,
bszid_t bszid,
void* var_func,
cntl_t* sub_node
)
{
return bli_cntl_obj_create( bszid, var_func, NULL, sub_node );
return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node );
}

View File

@@ -55,8 +55,9 @@ void bli_trsm_cntl_free
// -----------------------------------------------------------------------------
cntl_t* bli_trsm_cntl_obj_create
cntl_t* bli_trsm_cntl_create_node
(
opid_t family,
bszid_t bszid,
void* var_func,
cntl_t* sub_node

View File

@@ -122,9 +122,6 @@ void bli_trsm_front
bli_obj_set_as_root( b_local );
bli_obj_set_as_root( c_local );
// Set the operation family id in the context.
bli_cntx_set_family( BLIS_TRSM, cntx );
// Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx,
bli_obj_length( c_local ),
@@ -135,6 +132,7 @@ void bli_trsm_front
bli_l3_thread_decorator
(
bli_trsm_int,
BLIS_TRSM, // operation family id
alpha,
&a_local,
&b_local,

View File

@@ -64,7 +64,7 @@ void bli_trsm_cntl_init()
// Create control tree objects for packm operations (left side).
trsm_l_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
bli_packm_cntl_create_node( BLIS_BLOCKED,
BLIS_VARIANT1,
// IMPORTANT: n dim multiple must be mr to
// support right and bottom-right edge cases
@@ -78,7 +78,7 @@ void bli_trsm_cntl_init()
trsm_l_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
bli_packm_cntl_create_node( BLIS_BLOCKED,
BLIS_VARIANT1,
// IMPORTANT: m dim multiple must be mr since
// B_pack is updated (ie: serves as C) in trsm
@@ -93,7 +93,7 @@ void bli_trsm_cntl_init()
// Create control tree objects for packm operations (right side).
trsm_r_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
bli_packm_cntl_create_node( BLIS_BLOCKED,
BLIS_VARIANT1,
BLIS_NR,
BLIS_MR,
@@ -105,7 +105,7 @@ void bli_trsm_cntl_init()
trsm_r_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
bli_packm_cntl_create_node( BLIS_BLOCKED,
BLIS_VARIANT1, // pack panels of B compactly
BLIS_MR,
BLIS_MR,
@@ -119,7 +119,7 @@ void bli_trsm_cntl_init()
// Create control tree object for lowest-level block-panel kernel.
trsm_cntl_bp_ke
=
bli_trsm_cntl_obj_create( BLIS_UNB_OPT,
bli_trsm_cntl_create_node( BLIS_UNB_OPT,
BLIS_VARIANT2,
0, // bszid_t not used by macro-kernel
NULL, NULL, NULL, NULL,
@@ -129,7 +129,7 @@ void bli_trsm_cntl_init()
// problem (left side).
trsm_l_cntl_op_bp
=
bli_trsm_cntl_obj_create( BLIS_BLOCKED,
bli_trsm_cntl_create_node( BLIS_BLOCKED,
BLIS_VARIANT1,
BLIS_MC,
NULL,
@@ -144,7 +144,7 @@ void bli_trsm_cntl_init()
// rank-k (outer panel) updates (left side).
trsm_l_cntl_mm_op
=
bli_trsm_cntl_obj_create( BLIS_BLOCKED,
bli_trsm_cntl_create_node( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_KC,
NULL,
@@ -159,7 +159,7 @@ void bli_trsm_cntl_init()
// general problems (left side).
trsm_l_cntl_vl_mm
=
bli_trsm_cntl_obj_create( BLIS_BLOCKED,
bli_trsm_cntl_create_node( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_NC,
NULL,
@@ -174,7 +174,7 @@ void bli_trsm_cntl_init()
// problem (right side).
trsm_r_cntl_op_bp
=
bli_trsm_cntl_obj_create( BLIS_BLOCKED,
bli_trsm_cntl_create_node( BLIS_BLOCKED,
BLIS_VARIANT1,
BLIS_MC,
NULL,
@@ -189,7 +189,7 @@ void bli_trsm_cntl_init()
// rank-k (outer panel) updates (right side).
trsm_r_cntl_mm_op
=
bli_trsm_cntl_obj_create( BLIS_BLOCKED,
bli_trsm_cntl_create_node( BLIS_BLOCKED,
BLIS_VARIANT3,
BLIS_KC,
NULL,
@@ -204,7 +204,7 @@ void bli_trsm_cntl_init()
// general problems (right side).
trsm_r_cntl_vl_mm
=
bli_trsm_cntl_obj_create( BLIS_BLOCKED,
bli_trsm_cntl_create_node( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_NC,
NULL,
@@ -222,22 +222,22 @@ void bli_trsm_cntl_init()
void bli_trsm_cntl_finalize()
{
bli_cntl_obj_free( trsm_l_packa_cntl );
bli_cntl_obj_free( trsm_l_packb_cntl );
bli_cntl_obj_free( trsm_r_packa_cntl );
bli_cntl_obj_free( trsm_r_packb_cntl );
bli_cntl_free_node( trsm_l_packa_cntl );
bli_cntl_free_node( trsm_l_packb_cntl );
bli_cntl_free_node( trsm_r_packa_cntl );
bli_cntl_free_node( trsm_r_packb_cntl );
bli_cntl_obj_free( trsm_cntl_bp_ke );
bli_cntl_free_node( trsm_cntl_bp_ke );
bli_cntl_obj_free( trsm_l_cntl_op_bp );
bli_cntl_obj_free( trsm_l_cntl_mm_op );
bli_cntl_obj_free( trsm_l_cntl_vl_mm );
bli_cntl_obj_free( trsm_r_cntl_op_bp );
bli_cntl_obj_free( trsm_r_cntl_mm_op );
bli_cntl_obj_free( trsm_r_cntl_vl_mm );
bli_cntl_free_node( trsm_l_cntl_op_bp );
bli_cntl_free_node( trsm_l_cntl_mm_op );
bli_cntl_free_node( trsm_l_cntl_vl_mm );
bli_cntl_free_node( trsm_r_cntl_op_bp );
bli_cntl_free_node( trsm_r_cntl_mm_op );
bli_cntl_free_node( trsm_r_cntl_vl_mm );
}
trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type,
trsm_t* bli_trsm_cntl_create_node( impl_t impl_type,
varnum_t var_num,
bszid_t bszid,
scalm_t* sub_scalm,

View File

@@ -51,7 +51,7 @@ typedef struct trsm_s trsm_t;
void bli_trsm_cntl_init( void );
void bli_trsm_cntl_finalize( void );
trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type,
trsm_t* bli_trsm_cntl_create_node( impl_t impl_type,
varnum_t var_num,
bszid_t bszid,
scalm_t* sub_scalm,

View File

@@ -35,7 +35,7 @@
#include "blis.h"
blksz_t* bli_blksz_obj_create
blksz_t* bli_blksz_create_ed
(
dim_t b_s, dim_t be_s,
dim_t b_d, dim_t be_d,
@@ -47,16 +47,39 @@ blksz_t* bli_blksz_obj_create
b = ( blksz_t* ) bli_malloc_intl( sizeof(blksz_t) );
bli_blksz_obj_init( b,
b_s, be_s,
b_d, be_d,
b_c, be_c,
b_z, be_z );
bli_blksz_init_ed
(
b,
b_s, be_s,
b_d, be_d,
b_c, be_c,
b_z, be_z
);
return b;
}
void bli_blksz_obj_init
// Allocate a blksz_t and initialize it with the given default blocksizes
// (b_s, b_d, b_c, b_z) and maximum blocksizes (be_s, be_d, be_c, be_z).
// Returns the address of the newly allocated object.
blksz_t* bli_blksz_create
     (
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z,
       dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
     )
{
	// Obtain storage for the object from the internal allocator.
	blksz_t* blksz = ( blksz_t* ) bli_malloc_intl( sizeof( blksz_t ) );

	// Defer all field initialization to bli_blksz_init().
	bli_blksz_init( blksz, b_s, b_d, b_c, b_z, be_s, be_d, be_c, be_z );

	return blksz;
}
void bli_blksz_init_ed
(
blksz_t* b,
dim_t b_s, dim_t be_s,
@@ -75,7 +98,45 @@ void bli_blksz_obj_init
b->e[BLIS_DCOMPLEX] = be_z;
}
void bli_blksz_obj_free
// Initialize an existing blksz_t with per-datatype default blocksizes
// (b_s, b_d, b_c, b_z) and maximum blocksizes (be_s, be_d, be_c, be_z).
// A maximum given as zero is replaced by the corresponding default.
void bli_blksz_init
     (
       blksz_t* b,
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z,
       dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
     )
{
	// A zero maximum is a request to reuse the default value.
	if ( be_s == 0 ) be_s = b_s;
	if ( be_d == 0 ) be_d = b_d;
	if ( be_c == 0 ) be_c = b_c;
	if ( be_z == 0 ) be_z = b_z;

	// Store the default/maximum pair for each datatype.
	b->v[BLIS_FLOAT]    = b_s;  b->e[BLIS_FLOAT]    = be_s;
	b->v[BLIS_DOUBLE]   = b_d;  b->e[BLIS_DOUBLE]   = be_d;
	b->v[BLIS_SCOMPLEX] = b_c;  b->e[BLIS_SCOMPLEX] = be_c;
	b->v[BLIS_DCOMPLEX] = b_z;  b->e[BLIS_DCOMPLEX] = be_z;
}
// Initialize an existing blksz_t from default blocksizes only. The maximum
// blocksize of each datatype is set equal to its default blocksize.
void bli_blksz_init_easy
     (
       blksz_t* b,
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z
     )
{
	// For every datatype, the same value serves as both the default (v)
	// and the maximum (e) blocksize.
	b->v[BLIS_FLOAT]    = b->e[BLIS_FLOAT]    = b_s;
	b->v[BLIS_DOUBLE]   = b->e[BLIS_DOUBLE]   = b_d;
	b->v[BLIS_SCOMPLEX] = b->e[BLIS_SCOMPLEX] = b_c;
	b->v[BLIS_DCOMPLEX] = b->e[BLIS_DCOMPLEX] = b_z;
}
void bli_blksz_free
(
blksz_t* b
)
@@ -302,6 +363,11 @@ dim_t bli_determine_blocksize_b_sub
// chunk that will correspond to the blocksize we are computing now.
dim_left_now = dim - i;
// Sanity check: if dim_left_now is zero, then we can return zero
// without going any further.
if ( dim_left_now == 0 )
return 0;
dim_at_edge = dim_left_now % b_alg;
// If dim_left_now is a multiple of b_alg, we can safely return b_alg

View File

@@ -50,15 +50,6 @@
*(max) = bli_blksz_get_max( dt, b ); \
}
#define bli_blksz_get_def_for_obj( obj, b ) \
\
bli_blksz_get_def( bli_obj_datatype( *(obj) ), b )
#define bli_blksz_get_max_for_obj( obj, b ) \
\
bli_blksz_get_max( bli_obj_datatype( *(obj) ), b )
// blksz_t modification
#define bli_blksz_set_def( val, dt, b ) \
@@ -85,8 +76,11 @@
#define bli_blksz_copy_dt( dt_src, b_src, \
dt_dst, b_dst ) \
{ \
(b_dst)->v[ dt_dst ] = (b_src)->v[ dt_src ]; \
(b_dst)->e[ dt_dst ] = (b_src)->e[ dt_src ]; \
const dim_t v_src = bli_blksz_get_def( dt_src, b_src ); \
const dim_t e_src = bli_blksz_get_max( dt_src, b_src ); \
\
bli_blksz_set_def( v_src, dt_dst, b_dst ); \
bli_blksz_set_max( e_src, dt_dst, b_dst ); \
}
#define bli_blksz_scale_def( num, den, dt, b ) \
@@ -109,7 +103,7 @@
// -----------------------------------------------------------------------------
blksz_t* bli_blksz_obj_create
blksz_t* bli_blksz_create_ed
(
dim_t b_s, dim_t be_s,
dim_t b_d, dim_t be_d,
@@ -117,7 +111,13 @@ blksz_t* bli_blksz_obj_create
dim_t b_z, dim_t be_z
);
void bli_blksz_obj_init
// Allocate a new blksz_t and initialize it via bli_blksz_init() using the
// default blocksizes (b_s, b_d, b_c, b_z) and maximum blocksizes (be_s,
// be_d, be_c, be_z). Returns a pointer to the new object.
// NOTE(review): presumably released with bli_blksz_free() -- confirm.
blksz_t* bli_blksz_create
(
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z,
dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
);
void bli_blksz_init_ed
(
blksz_t* b,
dim_t b_s, dim_t be_s,
@@ -126,7 +126,20 @@ void bli_blksz_obj_init
dim_t b_z, dim_t be_z
);
void bli_blksz_obj_free
// Initialize an existing blksz_t with default blocksizes (b_s, b_d, b_c,
// b_z) and maximum blocksizes (be_s, be_d, be_c, be_z). A maximum passed
// as zero is interpreted as a request to use the corresponding default.
void bli_blksz_init
(
blksz_t* b,
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z,
dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
);
// Initialize an existing blksz_t from default blocksizes only; each
// maximum blocksize is set equal to the corresponding default.
void bli_blksz_init_easy
(
blksz_t* b,
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z
);
void bli_blksz_free
(
blksz_t* b
);

View File

@@ -34,8 +34,9 @@
#include "blis.h"
cntl_t* bli_cntl_obj_create
cntl_t* bli_cntl_create_node
(
opid_t family,
bszid_t bszid,
void* var_func,
void* params,
@@ -48,6 +49,7 @@ cntl_t* bli_cntl_obj_create
// Allocate the cntl_t struct.
cntl = bli_malloc_intl( sizeof( cntl_t ) );
bli_cntl_set_family( family, cntl );
bli_cntl_set_bszid( bszid, cntl );
bli_cntl_set_var_func( var_func, cntl );
bli_cntl_set_params( params, cntl );
@@ -63,7 +65,7 @@ cntl_t* bli_cntl_obj_create
return cntl;
}
void bli_cntl_obj_free
void bli_cntl_free_node
(
cntl_t* cntl
)
@@ -71,7 +73,7 @@ void bli_cntl_obj_free
bli_free_intl( cntl );
}
void bli_cntl_obj_clear
void bli_cntl_clear_node
(
cntl_t* cntl
)
@@ -141,7 +143,7 @@ void bli_cntl_free_w_thrinfo
}
// Free the current node.
bli_cntl_obj_free( cntl );
bli_cntl_free_node( cntl );
}
void bli_cntl_free_wo_thrinfo
@@ -177,7 +179,7 @@ void bli_cntl_free_wo_thrinfo
}
// Free the current node.
bli_cntl_obj_free( cntl );
bli_cntl_free_node( cntl );
}
// -----------------------------------------------------------------------------
@@ -189,10 +191,11 @@ cntl_t* bli_cntl_copy
{
// Make a copy of the current node. Notice that the source node
// should NOT have any allocated/cached mem_t entries, and that
// bli_cntl_obj_create() creates a node with a cleared mem_t
// bli_cntl_create_node() creates a node with a cleared mem_t
// field.
cntl_t* cntl_copy = bli_cntl_obj_create
cntl_t* cntl_copy = bli_cntl_create_node
(
bli_cntl_family( cntl ),
bli_cntl_bszid( cntl ),
bli_cntl_var_func( cntl ),
NULL, NULL
@@ -234,3 +237,23 @@ cntl_t* bli_cntl_copy
return cntl_copy;
}
// Stamp the given operation family id onto the node pointed to by cntl and
// onto every node reached by repeatedly following its sub-node link.
void bli_cntl_mark_family
     (
       opid_t  family,
       cntl_t* cntl
     )
{
	cntl_t* node = cntl;

	// Visit the root first, then descend one child at a time until a
	// node without a child has been marked.
	do
	{
		bli_cntl_set_family( family, node );

		node = bli_cntl_sub_node( node );
	}
	while ( node != NULL );
}

View File

@@ -39,6 +39,7 @@
struct cntl_s
{
// Basic fields (usually required).
opid_t family;
bszid_t bszid;
void* var_func;
struct cntl_s* sub_node;
@@ -57,20 +58,21 @@ typedef struct cntl_s cntl_t;
// -- Control tree prototypes --
cntl_t* bli_cntl_obj_create
cntl_t* bli_cntl_create_node
(
opid_t family,
bszid_t bszid,
void* var_func,
void* params,
cntl_t* sub_node
);
void bli_cntl_obj_free
void bli_cntl_free_node
(
cntl_t* cntl
);
void bli_cntl_obj_clear
void bli_cntl_clear_node
(
cntl_t* cntl
);
@@ -99,10 +101,20 @@ cntl_t* bli_cntl_copy
cntl_t* cntl
);
// Set the family field of the given control tree node and of every node
// reachable from it by following sub-node links.
void bli_cntl_mark_family
(
opid_t family,
cntl_t* cntl
);
// -----------------------------------------------------------------------------
// cntl_t query (fields only)
// Query the operation family id stored in a control tree node. The macro
// argument is parenthesized so that pointer expressions (e.g. "nodes + 1"
// or "&node") expand with the intended precedence.
#define bli_cntl_family( cntl ) \
\
	( (cntl)->family )
#define bli_cntl_bszid( cntl ) \
\
( cntl->bszid )
@@ -139,6 +151,11 @@ cntl_t* bli_cntl_copy
// cntl_t modification
// Store an operation family id in a control tree node. Both macro
// arguments are parenthesized so that expression arguments expand with the
// intended precedence. The brace-block form is retained so that existing
// call sites behave exactly as before.
#define bli_cntl_set_family( family0, cntl ) \
{ \
	(cntl)->family = (family0); \
}
#define bli_cntl_set_bszid( bszid0, cntl ) \
{ \
cntl->bszid = bszid0; \

View File

@@ -39,14 +39,14 @@
// NOTE: Since these functions currently do nothing, they are defined
// as empty macros in bli_cntx.
//
void bli_cntx_obj_create( cntx_t* cntx )
void bli_cntx_create( cntx_t* cntx )
{
// Since cntx_t objects contain statically-allocated arrays,
// we don't need to do anything in order to create the cntx_t
// instance.
}
void bli_cntx_obj_free( cntx_t* cntx )
void bli_cntx_free( cntx_t* cntx )
{
// Just as we don't need to do anything in order to create a
// cntx_t instance, we don't need to do anything to destory
@@ -54,7 +54,7 @@ void bli_cntx_obj_free( cntx_t* cntx )
}
#endif
void bli_cntx_obj_clear( cntx_t* cntx )
void bli_cntx_clear( cntx_t* cntx )
{
// Fill the entire cntx_t structure with zeros.
memset( ( void* )cntx, 0, sizeof( cntx ) );
@@ -108,8 +108,11 @@ void bli_cntx_init( cntx_t* cntx )
// -----------------------------------------------------------------------------
blksz_t* bli_cntx_get_blksz( bszid_t bs_id,
cntx_t* cntx )
blksz_t* bli_cntx_get_blksz
(
bszid_t bs_id,
cntx_t* cntx
)
{
blksz_t* blkszs = bli_cntx_blkszs_buf( cntx );
blksz_t* blksz = &blkszs[ bs_id ];
@@ -142,8 +145,11 @@ dim_t bli_cntx_get_blksz_max_dt( num_t dt,
}
#endif
blksz_t* bli_cntx_get_bmult( bszid_t bs_id,
cntx_t* cntx )
blksz_t* bli_cntx_get_bmult
(
bszid_t bs_id,
cntx_t* cntx
)
{
blksz_t* blkszs = bli_cntx_blkszs_buf( cntx );
bszid_t* bmults = bli_cntx_bmults_buf( cntx );
@@ -166,8 +172,11 @@ dim_t bli_cntx_get_bmult_dt( num_t dt,
}
#endif
func_t* bli_cntx_get_l3_ukr( l3ukr_t ukr_id,
cntx_t* cntx )
func_t* bli_cntx_get_l3_ukr
(
l3ukr_t ukr_id,
cntx_t* cntx
)
{
func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx );
func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx );
@@ -210,8 +219,11 @@ void* bli_cntx_get_l3_ukr_dt( num_t dt,
}
#endif
func_t* bli_cntx_get_l3_vir_ukr( l3ukr_t ukr_id,
cntx_t* cntx )
func_t* bli_cntx_get_l3_vir_ukr
(
l3ukr_t ukr_id,
cntx_t* cntx
)
{
func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx );
func_t* l3_vir_ukr = &l3_vir_ukrs[ ukr_id ];
@@ -235,8 +247,11 @@ void* bli_cntx_get_l3_vir_ukr_dt( num_t dt,
}
#endif
func_t* bli_cntx_get_l3_nat_ukr( l3ukr_t ukr_id,
cntx_t* cntx )
func_t* bli_cntx_get_l3_nat_ukr
(
l3ukr_t ukr_id,
cntx_t* cntx
)
{
func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx );
func_t* l3_nat_ukr = &l3_nat_ukrs[ ukr_id ];
@@ -260,8 +275,11 @@ void* bli_cntx_get_l3_nat_ukr_dt( num_t dt,
}
#endif
func_t* bli_cntx_get_l1f_ker( l1fkr_t ker_id,
cntx_t* cntx )
func_t* bli_cntx_get_l1f_ker
(
l1fkr_t ker_id,
cntx_t* cntx
)
{
func_t* l1f_kers = bli_cntx_l1f_kers_buf( cntx );
func_t* l1f_ker = &l1f_kers[ ker_id ];
@@ -283,8 +301,11 @@ void* bli_cntx_get_l1f_ker_dt( num_t dt,
}
#endif
func_t* bli_cntx_get_l1v_ker( l1vkr_t ker_id,
cntx_t* cntx )
func_t* bli_cntx_get_l1v_ker
(
l1vkr_t ker_id,
cntx_t* cntx
)
{
func_t* l1v_kers = bli_cntx_l1v_kers_buf( cntx );
func_t* l1v_ker = &l1v_kers[ ker_id ];
@@ -306,8 +327,11 @@ void* bli_cntx_get_l1v_ker_dt( num_t dt,
}
#endif
mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id,
cntx_t* cntx )
mbool_t* bli_cntx_get_l3_nat_ukr_prefs
(
l3ukr_t ukr_id,
cntx_t* cntx
)
{
mbool_t* l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );
mbool_t* l3_nat_ukrs_pref = &l3_nat_ukrs_prefs[ ukr_id ];
@@ -316,12 +340,30 @@ mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id,
return l3_nat_ukrs_pref;
}
func_t* bli_cntx_get_packm_ukr( cntx_t* cntx )
func_t* bli_cntx_get_packm_ker
(
l1mkr_t ker_id,
cntx_t* cntx
)
{
func_t* packm_ukrs = bli_cntx_packm_ukrs( cntx );
func_t* packm_kers = bli_cntx_packm_kers_buf( cntx );
func_t* packm_ker = &packm_kers[ ker_id ];
// Return the address of the func_t that contains the packm ukernels.
return packm_ukrs;
return packm_ker;
}
// Return the address of the func_t, within the context's unpackm kernel
// buffer, that corresponds to the kernel id ker_id.
func_t* bli_cntx_get_unpackm_ker
     (
       l1mkr_t ker_id,
       cntx_t* cntx
     )
{
	// Base address of the context's unpackm kernel func_t buffer.
	func_t* kers = bli_cntx_unpackm_kers_buf( cntx );

	// Offset to the requested entry and hand back its address.
	return kers + ker_id;
}
#if 0
@@ -360,7 +402,11 @@ dim_t bli_cntx_get_num_threads( cntx_t* cntx )
bli_cntx_ir_way( cntx );
}
dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl )
dim_t bli_cntx_get_num_threads_in
(
cntx_t* cntx,
cntl_t* cntl
)
{
dim_t n_threads_in = 1;
@@ -384,14 +430,6 @@ dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl )
// -----------------------------------------------------------------------------
#if 1
//
// NOTE: This function is disabled because:
// - we currently do not have any need to set a context direclty with
// blksz_t objects
// - it may be broken; it needs to be synced up with the corresponding
// function in bli_gks.c.
//
void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
{
/* Example prototypes:
@@ -454,8 +492,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
// Here, we query the variable argument list for:
// - the bszid_t of the blocksize we're about to process,
// - the address of the blksz_t object, and
// - the bszid_t of the multiple we need to associate with
// the blksz_t object.
// - the bszid_t of the multiple
// that we need to associate with the blksz_t object.
bszid_t bs_id = va_arg( args, bszid_t );
blksz_t* blksz = va_arg( args, blksz_t* );
bszid_t bm_id = va_arg( args, bszid_t );
@@ -473,9 +511,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
{
// Here, we query the variable argument list for:
// - the bszid_t of the blocksize we're about to process,
// - the address of the blksz_t object, and
// - the bszid_t of the multiple we need to associate with
// the blksz_t object.
// - the address of the blksz_t object,
// - the bszid_t of the multiple, and
// - the scalars we wish to apply to the real blocksizes to
// come up with the induced complex blocksizes (for default
// and maximum blocksizes).
@@ -536,6 +573,7 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
// location within the context's blksz_t array. Do the same
// for the blocksize multiple id.
//cntx_blkszs[ bs_id ] = *blksz;
//bli_blksz_copy_smart( blksz, cntx_blksz );
bli_blksz_copy( blksz, cntx_blksz );
// Copy the blocksize multiple id into the context.
@@ -624,14 +662,16 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
bli_free_intl( dsclrs );
bli_free_intl( msclrs );
}
#endif
// -----------------------------------------------------------------------------
void bli_cntx_set_blksz( bszid_t bs_id,
blksz_t* blksz,
bszid_t mult_id,
cntx_t* cntx )
void bli_cntx_set_blksz
(
bszid_t bs_id,
blksz_t* blksz,
bszid_t mult_id,
cntx_t* cntx
)
{
blksz_t* blkszs = bli_cntx_blkszs_buf( cntx );
bszid_t* bmults = bli_cntx_bmults_buf( cntx );
@@ -645,20 +685,111 @@ void bli_cntx_set_blksz( bszid_t bs_id,
bmults[ bs_id ] = mult_id;
}
void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id,
func_t* func,
cntx_t* cntx )
{
func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx );
// -----------------------------------------------------------------------------
// Copy the function object into the specified location within
// the context's virtual level-3 ukernel array.
l3_vir_ukrs[ ukr_id ] = *func;
// Stores n_ukrs native level-3 micro-kernel entries into a context.
//
// The variable argument list must consist of exactly n_ukrs tuples of
//   ( l3ukr_t ukr_id, num_t dt, void* ukr_fp, bool_t pref )
// followed by one trailing cntx_t* naming the context to update. For each
// tuple, the function pointer is written (for datatype dt) into the
// context's native level-3 ukernel func_t selected by ukr_id, and the
// storage preference is written (for datatype dt) into the matching
// mbool_t.
//
// Because the context pointer is the LAST argument, the tuples are first
// buffered in temporary arrays and only copied into the context once the
// pointer has been read.
//
// NOTE(review): the bli_malloc_intl() results are used without a NULL
// check here; confirm that bli_malloc_intl() handles allocation failure
// itself.
void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... )
{
/* Example prototypes:
void bli_cntx_set_l3_nat_ukrs
(
dim_t n_ukrs,
l3ukr_t ukr0_id, num_t dt0, void* ukr0_fp, bool_t pref0,
l3ukr_t ukr1_id, num_t dt1, void* ukr1_fp, bool_t pref1,
l3ukr_t ukr2_id, num_t dt2, void* ukr2_fp, bool_t pref2,
...
cntx_t* cntx
);
*/
va_list args;
dim_t i;
// Allocate temporary local arrays, one element per tuple, to hold the
// tuple fields until the trailing context pointer has been read.
l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ) );
num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ) );
void** ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void* ) );
bool_t* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool_t ) );
// -- Begin variable argument section --
// Initialize variable argument environment.
va_start( args, n_ukrs );
// Process n_ukrs tuples.
for ( i = 0; i < n_ukrs; ++i )
{
// Here, we query the variable argument list for:
// - the l3ukr_t of the kernel we're about to process,
// - the datatype of the kernel,
// - the kernel function pointer, and
// - the kernel function storage preference
// that we need to store to the context.
// The order of these va_arg() calls must match the order in which the
// caller passes the fields of each tuple.
const l3ukr_t ukr_id = va_arg( args, l3ukr_t );
const num_t ukr_dt = va_arg( args, num_t );
void* ukr_fp = va_arg( args, void* );
const bool_t ukr_pref = va_arg( args, bool_t );
// Store the values in our temporary arrays.
ukr_ids[ i ] = ukr_id;
ukr_dts[ i ] = ukr_dt;
ukr_fps[ i ] = ukr_fp;
ukr_prefs[ i ] = ukr_pref;
}
// The last argument should be the context pointer.
cntx_t* cntx = va_arg( args, cntx_t* );
// Shutdown variable argument environment and clean up stack.
va_end( args );
// -- End variable argument section --
// Query the context for the addresses of:
// - the l3 native ukernel func_t array
// - the l3 native ukernel preferences array
func_t* cntx_l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx );
mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );
// Now that we have the context address, we want to copy the values
// from the temporary buffers into the corresponding buffers in the
// context.
// Process each micro-kernel tuple provided.
for ( i = 0; i < n_ukrs; ++i )
{
// Read the current kernel id, datatype, function pointer, and
// storage preference back out of the temporary arrays.
const l3ukr_t ukr_id = ukr_ids[ i ];
const num_t ukr_dt = ukr_dts[ i ];
void* ukr_fp = ukr_fps[ i ];
const bool_t ukr_pref = ukr_prefs[ i ];
// Index into the func_t and mbool_t for the current kernel id
// being processed.
func_t* ukrs = &cntx_l3_nat_ukrs[ ukr_id ];
mbool_t* prefs = &cntx_l3_nat_ukrs_prefs[ ukr_id ];
// Store the ukernel function pointer and preference values into
// the context, touching only the entry for datatype ukr_dt.
bli_func_set_dt( ukr_fp, ukr_dt, ukrs );
bli_mbool_set_dt( ukr_pref, ukr_dt, prefs );
}
// Free the temporary local arrays.
bli_free_intl( ukr_ids );
bli_free_intl( ukr_dts );
bli_free_intl( ukr_fps );
bli_free_intl( ukr_prefs );
}
void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id,
func_t* func,
cntx_t* cntx )
// -----------------------------------------------------------------------------
void bli_cntx_set_l3_nat_ukr
(
l3ukr_t ukr_id,
func_t* func,
cntx_t* cntx
)
{
func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx );
@@ -667,9 +798,12 @@ void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id,
l3_nat_ukrs[ ukr_id ] = *func;
}
void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id,
mbool_t* prefs,
cntx_t* cntx )
void bli_cntx_set_l3_nat_ukr_prefs
(
l3ukr_t ukr_id,
mbool_t* prefs,
cntx_t* cntx
)
{
mbool_t* l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );
@@ -678,9 +812,26 @@ void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id,
l3_nat_ukrs_prefs[ ukr_id ] = *prefs;
}
void bli_cntx_set_l1f_ker( l1fkr_t ker_id,
func_t* func,
cntx_t* cntx )
void bli_cntx_set_l3_vir_ukr
     (
       l3ukr_t ukr_id,
       func_t* func,
       cntx_t* cntx
     )
{
	// Start of the context's virtual level-3 ukernel array.
	func_t* vir_ukrs_base = bli_cntx_l3_vir_ukrs_buf( cntx );

	// Slot within that array that corresponds to ukr_id.
	func_t* slot = vir_ukrs_base + ukr_id;

	// Overwrite the slot with a copy of the caller's function object.
	*slot = *func;
}
void bli_cntx_set_l1f_ker
(
l1fkr_t ker_id,
func_t* func,
cntx_t* cntx
)
{
func_t* l1f_kers = bli_cntx_l1f_kers_buf( cntx );
@@ -689,9 +840,12 @@ void bli_cntx_set_l1f_ker( l1fkr_t ker_id,
l1f_kers[ ker_id ] = *func;
}
void bli_cntx_set_l1v_ker( l1vkr_t ker_id,
func_t* func,
cntx_t* cntx )
void bli_cntx_set_l1v_ker
(
l1vkr_t ker_id,
func_t* func,
cntx_t* cntx
)
{
func_t* l1v_kers = bli_cntx_l1v_kers_buf( cntx );
@@ -700,43 +854,154 @@ void bli_cntx_set_l1v_ker( l1vkr_t ker_id,
l1v_kers[ ker_id ] = *func;
}
void bli_cntx_set_packm_ukr( func_t* func,
cntx_t* cntx )
{
func_t* packm_ukrs = bli_cntx_packm_ukrs( cntx );
// -----------------------------------------------------------------------------
// Copy the function object into the context's packm ukernel object.
*packm_ukrs = *func;
// Stores n_kers packm kernel entries into a context.
//
// The variable argument list must consist of exactly n_kers tuples of
//   ( l1mkr_t ker_id, num_t ker_dt, void* ker_fp )
// followed by one trailing cntx_t* naming the context to update. For each
// tuple, the function pointer is written (for datatype ker_dt) into the
// context's packm kernel func_t selected by ker_id.
//
// Because the context pointer is the LAST argument, the tuples are first
// buffered in temporary arrays and only copied into the context once the
// pointer has been read.
//
// NOTE(review): the bli_malloc_intl() results are used without a NULL
// check here; confirm that bli_malloc_intl() handles allocation failure
// itself.
void bli_cntx_set_packm_kers( dim_t n_kers, ... )
{
/* Example prototypes:
void bli_cntx_set_packm_kers
(
dim_t n_kers,
l1mkr_t ker0_id, num_t ker0_dt, void* ker0_fp,
l1mkr_t ker1_id, num_t ker1_dt, void* ker1_fp,
l1mkr_t ker2_id, num_t ker2_dt, void* ker2_fp,
...
cntx_t* cntx
);
*/
va_list args;
dim_t i;
// Allocate temporary local arrays, one element per tuple, to hold the
// tuple fields until the trailing context pointer has been read.
l1mkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ) );
num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ) );
void** ker_fps = bli_malloc_intl( n_kers * sizeof( void* ) );
// -- Begin variable argument section --
// Initialize variable argument environment.
va_start( args, n_kers );
// Process n_kers tuples.
for ( i = 0; i < n_kers; ++i )
{
// Here, we query the variable argument list for:
// - the l1mkr_t of the kernel we're about to process,
// - the datatype of the kernel, and
// - the kernel function pointer
// that we need to store to the context.
// The order of these va_arg() calls must match the order in which the
// caller passes the fields of each tuple.
const l1mkr_t ker_id = va_arg( args, l1mkr_t );
const num_t ker_dt = va_arg( args, num_t );
void* ker_fp = va_arg( args, void* );
// Store the values in our temporary arrays.
ker_ids[ i ] = ker_id;
ker_dts[ i ] = ker_dt;
ker_fps[ i ] = ker_fp;
}
// The last argument should be the context pointer.
cntx_t* cntx = va_arg( args, cntx_t* );
// Shutdown variable argument environment and clean up stack.
va_end( args );
// -- End variable argument section --
// Query the context for the address of:
// - the packm kernels func_t array
func_t* cntx_packm_kers = bli_cntx_packm_kers_buf( cntx );
// Now that we have the context address, we want to copy the values
// from the temporary buffers into the corresponding buffers in the
// context.
// Process each packm kernel tuple provided.
for ( i = 0; i < n_kers; ++i )
{
// Read the current kernel id, datatype, and function pointer back
// out of the temporary arrays.
const l1mkr_t ker_id = ker_ids[ i ];
const num_t ker_dt = ker_dts[ i ];
void* ker_fp = ker_fps[ i ];
// Index into the func_t for the current kernel id being processed.
func_t* kers = &cntx_packm_kers[ ker_id ];
// Store the kernel function pointer into the context, touching only
// the entry for datatype ker_dt.
bli_func_set_dt( ker_fp, ker_dt, kers );
}
// Free the temporary local arrays.
bli_free_intl( ker_ids );
bli_free_intl( ker_dts );
bli_free_intl( ker_fps );
}
void bli_cntx_set_ind_method( ind_t method,
cntx_t* cntx )
// -----------------------------------------------------------------------------
void bli_cntx_set_packm_ker
     (
       l1mkr_t ker_id,
       func_t* func,
       cntx_t* cntx
     )
{
	// Start of the context's packm kernel array.
	func_t* packm_base = bli_cntx_packm_kers_buf( cntx );

	// Slot within that array that corresponds to ker_id.
	func_t* slot = packm_base + ker_id;

	// Overwrite the slot with a copy of the caller's function object.
	*slot = *func;
}
// -----------------------------------------------------------------------------
// Records the given ind_t method in the context by forwarding to
// bli_cntx_set_method().
//   method - the ind_t value to store.
//   cntx   - the context to update.
void bli_cntx_set_ind_method
(
ind_t method,
cntx_t* cntx
)
{
bli_cntx_set_method( method, cntx );
}
void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a,
pack_t schema_b,
cntx_t* cntx )
// Sets two pack schemas of the context in a single call: schema_a is
// forwarded to bli_cntx_set_schema_a_block() and schema_b is forwarded to
// bli_cntx_set_schema_b_panel().
//   schema_a - pack_t value to store as the A-block schema.
//   schema_b - pack_t value to store as the B-panel schema.
//   cntx     - the context to update.
void bli_cntx_set_pack_schema_ab_blockpanel
(
pack_t schema_a,
pack_t schema_b,
cntx_t* cntx
)
{
bli_cntx_set_schema_a_block( schema_a, cntx );
bli_cntx_set_schema_b_panel( schema_b, cntx );
}
void bli_cntx_set_pack_schema_a_block( pack_t schema_a,
cntx_t* cntx )
// Sets the context's A-block pack schema by forwarding schema_a to
// bli_cntx_set_schema_a_block().
//   schema_a - pack_t value to store.
//   cntx     - the context to update.
void bli_cntx_set_pack_schema_a_block
(
pack_t schema_a,
cntx_t* cntx
)
{
bli_cntx_set_schema_a_block( schema_a, cntx );
}
void bli_cntx_set_pack_schema_b_panel( pack_t schema_b,
cntx_t* cntx )
// Sets the context's B-panel pack schema by forwarding schema_b to
// bli_cntx_set_schema_b_panel().
//   schema_b - pack_t value to store.
//   cntx     - the context to update.
void bli_cntx_set_pack_schema_b_panel
(
pack_t schema_b,
cntx_t* cntx
)
{
bli_cntx_set_schema_b_panel( schema_b, cntx );
}
void bli_cntx_set_pack_schema_c_panel( pack_t schema_c,
cntx_t* cntx )
// Sets the context's C-panel pack schema by forwarding schema_c to
// bli_cntx_set_schema_c_panel().
//   schema_c - pack_t value to store.
//   cntx     - the context to update.
void bli_cntx_set_pack_schema_c_panel
(
pack_t schema_c,
cntx_t* cntx
)
{
bli_cntx_set_schema_c_panel( schema_c, cntx );
}
@@ -749,17 +1014,24 @@ void bli_cntx_set_ukr_anti_pref( bool_t anti_pref,
}
#endif
void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
dim_t m, dim_t n, dim_t k )
void bli_cntx_set_thrloop_from_env
(
opid_t l3_op,
side_t side,
cntx_t* cntx,
dim_t m,
dim_t n,
dim_t k
)
{
dim_t jc, pc, ic, jr, ir;
#ifdef BLIS_ENABLE_MULTITHREADING
int nthread = bli_env_read_nway( "BLIS_NUM_THREADS", -1 );
int nthread = bli_thread_get_env( "BLIS_NUM_THREADS", -1 );
if ( nthread == -1 )
nthread = bli_env_read_nway( "OMP_NUM_THREADS", -1 );
nthread = bli_thread_get_env( "OMP_NUM_THREADS", -1 );
if ( nthread < 1 ) nthread = 1;
@@ -786,10 +1058,10 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
pc = 1;
dim_t jc_env = bli_env_read_nway( "BLIS_JC_NT", -1 );
dim_t ic_env = bli_env_read_nway( "BLIS_IC_NT", -1 );
dim_t jr_env = bli_env_read_nway( "BLIS_JR_NT", -1 );
dim_t ir_env = bli_env_read_nway( "BLIS_IR_NT", -1 );
dim_t jc_env = bli_thread_get_env( "BLIS_JC_NT", -1 );
dim_t ic_env = bli_thread_get_env( "BLIS_IC_NT", -1 );
dim_t jr_env = bli_thread_get_env( "BLIS_JR_NT", -1 );
dim_t ir_env = bli_thread_get_env( "BLIS_IR_NT", -1 );
if (jc_env != -1 || ic_env != -1 || jr_env != -1 || ir_env != -1)
{
@@ -882,9 +1154,12 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx,
// -----------------------------------------------------------------------------
bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx )
bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt
(
num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx
)
{
mbool_t* ukrs_prefs = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx );
bool_t ukr_prefs = bli_mbool_get_dt( dt, ukrs_prefs );
@@ -894,9 +1169,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt,
return ukr_prefs == TRUE;
}
bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx )
bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt
(
num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx
)
{
mbool_t* ukrs_prefs = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx );
bool_t ukr_prefs = bli_mbool_get_dt( dt, ukrs_prefs );
@@ -906,16 +1184,22 @@ bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt,
return ukr_prefs == FALSE;
}
bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx )
bool_t bli_cntx_l3_nat_ukr_prefers_storage_of
     (
       obj_t*  obj,
       l3ukr_t ukr_id,
       cntx_t* cntx
     )
{
	// "Prefers" is defined as the logical negation of "dislikes", so ask
	// the dislikes query first and then invert its answer.
	const bool_t dislikes
	  = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx );

	return !dislikes;
}
bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx )
bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of
(
obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx
)
{
const num_t dt = bli_obj_datatype( *obj );
const bool_t ukr_prefers_rows
@@ -930,9 +1214,12 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj,
return r_val;
}
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx )
bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of
(
obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx
)
{
bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );
@@ -942,9 +1229,12 @@ bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj,
return r_val;
}
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx )
bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of
(
obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx
)
{
bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx );
@@ -956,9 +1246,12 @@ bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj,
// -----------------------------------------------------------------------------
bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx )
bool_t bli_cntx_l3_ukr_prefers_rows_dt
(
num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx
)
{
// Reference the ukr storage preferences of the corresponding real
// micro-kernel for induced methods.
@@ -968,9 +1261,12 @@ bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt,
return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx );
}
bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx )
bool_t bli_cntx_l3_ukr_prefers_cols_dt
(
num_t dt,
l3ukr_t ukr_id,
cntx_t* cntx
)
{
// Reference the ukr storage preferences of the corresponding real
// micro-kernel for induced methods.
@@ -980,16 +1276,22 @@ bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt,
return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx );
}
bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx )
bool_t bli_cntx_l3_ukr_prefers_storage_of
     (
       obj_t*  obj,
       l3ukr_t ukr_id,
       cntx_t* cntx
     )
{
	// "Prefers" is defined as the logical negation of "dislikes", so ask
	// the dislikes query first and then invert its answer.
	const bool_t dislikes
	  = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx );

	return !dislikes;
}
bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx )
bool_t bli_cntx_l3_ukr_dislikes_storage_of
(
obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx
)
{
num_t dt = bli_obj_datatype( *obj );
@@ -1005,9 +1307,12 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj,
return r_val;
}
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx )
bool_t bli_cntx_l3_ukr_eff_prefers_storage_of
(
obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx
)
{
bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx );
@@ -1017,9 +1322,12 @@ bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj,
return r_val;
}
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx )
bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of
(
obj_t* obj,
l3ukr_t ukr_id,
cntx_t* cntx
)
{
bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx );
@@ -1108,23 +1416,6 @@ void bli_cntx_print( cntx_t* cntx )
);
}
{
func_t* ukr = bli_cntx_get_packm_ukr( cntx );
printf( "packm ker : %16p %16p %16p %16p\n",
bli_func_get_dt( BLIS_FLOAT, ukr ),
bli_func_get_dt( BLIS_DOUBLE, ukr ),
bli_func_get_dt( BLIS_SCOMPLEX, ukr ),
bli_func_get_dt( BLIS_DCOMPLEX, ukr )
);
}
{
ind_t family = bli_cntx_get_family( cntx );
printf( "oper family : %lu\n", ( guint_t )family );
}
{
ind_t method = bli_cntx_get_ind_method( cntx );

View File

@@ -36,6 +36,9 @@
#ifndef BLIS_CNTX_H
#define BLIS_CNTX_H
//#include "bli_cntx_init.h"
// Context object type (defined in bli_type_defs.h)
/*
@@ -51,9 +54,9 @@ typedef struct cntx_s
func_t* l1f_kers;
func_t* l1v_kers;
func_t packm_ukrs;
func_t* packm_kers;
func_t* unpackm_kers;
opid_t family;
ind_t method;
pack_t schema_a;
pack_t schema_b;
@@ -99,17 +102,13 @@ typedef struct cntx_s
\
( (cntx)->l1v_kers )
#define bli_cntx_packm_ukrs_buf( cntx ) \
#define bli_cntx_packm_kers_buf( cntx ) \
\
(&((cntx)->packm_ukrs) )
( (cntx)->packm_kers )
#define bli_cntx_packm_ukrs( cntx ) \
#define bli_cntx_unpackm_kers_buf( cntx ) \
\
(&((cntx)->packm_ukrs) )
#define bli_cntx_family( cntx ) \
\
( (cntx)->family )
( (cntx)->unpackm_kers )
#define bli_cntx_method( cntx ) \
\
@@ -202,16 +201,6 @@ typedef struct cntx_s
(cntx_p)->l1v_kers = _l1v_kers; \
}
#define bli_cntx_set_packm_ukrs( _packm_ukrs, cntx_p ) \
{ \
(cntx_p)->packm_ukrs = _packm_ukrs; \
}
#define bli_cntx_set_family( _family, cntx_p ) \
{ \
(cntx_p)->family = _family; \
}
#define bli_cntx_set_method( _method, cntx_p ) \
{ \
(cntx_p)->method = _method; \
@@ -285,7 +274,8 @@ typedef struct cntx_s
( \
(dt), \
&(( \
bli_cntx_method( (cntx) ) != BLIS_NAT \
bli_cntx_method( (cntx) ) != BLIS_NAT && \
bli_is_complex( dt ) \
? bli_cntx_l3_vir_ukrs_buf( (cntx) ) \
: bli_cntx_l3_nat_ukrs_buf( (cntx) ) \
)[ ukr_id ]) \
@@ -326,10 +316,6 @@ typedef struct cntx_s
(dt), (&(bli_cntx_l3_nat_ukrs_prefs_buf( (cntx) ))[ ukr_id ]) \
)
#define bli_cntx_get_family( cntx ) \
\
bli_cntx_family( cntx )
#define bli_cntx_get_ind_method( cntx ) \
\
bli_cntx_method( cntx )
@@ -357,9 +343,9 @@ typedef struct cntx_s
// create/free
//void bli_cntx_obj_create( cntx_t* cntx );
//void bli_cntx_obj_free( cntx_t* cntx );
void bli_cntx_obj_clear( cntx_t* cntx );
//void bli_cntx_create( cntx_t* cntx );
//void bli_cntx_free( cntx_t* cntx );
void bli_cntx_clear( cntx_t* cntx );
void bli_cntx_init( cntx_t* cntx );
// get functions
@@ -380,7 +366,7 @@ func_t* bli_cntx_get_l1f_ker( l1fkr_t ker_id,
cntx_t* cntx );
func_t* bli_cntx_get_l1v_ker( l1vkr_t ker_id,
cntx_t* cntx );
func_t* bli_cntx_get_packm_ukr( cntx_t* cntx );
//func_t* bli_cntx_get_packm_ukr( cntx_t* cntx );
//dim_t bli_cntx_get_blksz_def_dt( num_t dt,
// bszid_t bs_id,
@@ -409,6 +395,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx );
//void* bli_cntx_get_l1v_ker_dt( num_t dt,
// l1vkr_t ker_id,
// cntx_t* cntx );
func_t* bli_cntx_get_packm_ker( l1mkr_t ker_id,
cntx_t* cntx );
func_t* bli_cntx_get_unpackm_ker( l1mkr_t ker_id,
cntx_t* cntx );
//ind_t bli_cntx_get_ind_method( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx );
//pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx );
@@ -425,18 +415,34 @@ void bli_cntx_set_blksz( bszid_t bs_id,
blksz_t* blksz,
bszid_t mult_id,
cntx_t* cntx );
void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id,
func_t* func,
cntx_t* cntx );
void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... );
void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id,
func_t* func,
cntx_t* cntx );
void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id,
mbool_t* prefs,
cntx_t* cntx );
void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id,
func_t* func,
cntx_t* cntx );
void bli_cntx_set_l1f_ker( l1fkr_t ker_id,
func_t* func,
cntx_t* cntx );
void bli_cntx_set_l1v_ker( l1vkr_t ker_id,
func_t* func,
cntx_t* cntx );
void bli_cntx_set_packm_kers( dim_t n_kers, ... );
void bli_cntx_set_packm_ker( l1mkr_t ker_id,
func_t* func,
cntx_t* cntx );
void bli_cntx_set_packm_ukr( func_t* func,
cntx_t* cntx );
void bli_cntx_set_ind_method( ind_t method,
@@ -507,11 +513,11 @@ void bli_cntx_print( cntx_t* cntx );
// Preprocess out these calls entirely, since they are currently just empty
// functions that do nothing.
#if 0
#define bli_cntx_obj_create( cntx ) { bli_cntx_obj_clear( cntx ); }
#define bli_cntx_obj_free( cntx ) { bli_cntx_obj_clear( cntx ); }
#define bli_cntx_create( cntx ) { bli_cntx_clear( cntx ); }
#define bli_cntx_free( cntx ) { bli_cntx_clear( cntx ); }
#else
#define bli_cntx_obj_create( cntx ) { ; }
#define bli_cntx_obj_free( cntx ) { ; }
#define bli_cntx_create( cntx ) { ; }
#define bli_cntx_free( cntx ) { ; }
#endif
// These macros initialize/finalize a local context if the given context

View File

@@ -35,37 +35,57 @@
#include "blis.h"
func_t* bli_func_obj_create( void* ptr_s,
void* ptr_d,
void* ptr_c,
void* ptr_z )
func_t* bli_func_create
(
void* ptr_s,
void* ptr_d,
void* ptr_c,
void* ptr_z
)
{
func_t* f;
f = ( func_t* ) bli_malloc_intl( sizeof(func_t) );
bli_func_obj_init( f,
ptr_s,
ptr_d,
ptr_c,
ptr_z );
bli_func_init
(
f,
ptr_s,
ptr_d,
ptr_c,
ptr_z
);
return f;
}
void bli_func_obj_init( func_t* f,
void* ptr_s,
void* ptr_d,
void* ptr_c,
void* ptr_z )
void bli_func_init
(
func_t* f,
void* ptr_s,
void* ptr_d,
void* ptr_c,
void* ptr_z
)
{
f->ptr[BLIS_BITVAL_FLOAT_TYPE] = ptr_s;
f->ptr[BLIS_BITVAL_DOUBLE_TYPE] = ptr_d;
f->ptr[BLIS_BITVAL_SCOMPLEX_TYPE] = ptr_c;
f->ptr[BLIS_BITVAL_DCOMPLEX_TYPE] = ptr_z;
bli_func_set_dt( ptr_s, BLIS_FLOAT, f );
bli_func_set_dt( ptr_d, BLIS_DOUBLE, f );
bli_func_set_dt( ptr_c, BLIS_SCOMPLEX, f );
bli_func_set_dt( ptr_z, BLIS_DCOMPLEX, f );
}
void bli_func_obj_free( func_t* f )
void bli_func_init_null
     (
       func_t* f
     )
{
	// The datatypes whose entries are cleared, in the order they are
	// written.
	const num_t dts[] =
	{
		BLIS_FLOAT,
		BLIS_DOUBLE,
		BLIS_SCOMPLEX,
		BLIS_DCOMPLEX
	};
	const int n_dts = ( int )( sizeof( dts ) / sizeof( dts[0] ) );
	int       k;

	// Set the function pointer for each listed datatype to NULL.
	for ( k = 0; k < n_dts; ++k )
	{
		bli_func_set_dt( NULL, dts[ k ], f );
	}
}
// Releases the func_t pointed to by f by passing it to bli_free_intl().
// Only the func_t object itself is freed; nothing it refers to is touched.
void bli_func_free( func_t* f )
{
bli_free_intl( f );
}
@@ -75,7 +95,7 @@ void bli_func_obj_free( func_t* f )
bool_t bli_func_is_null_dt( num_t dt,
func_t* f )
{
return ( f->ptr[ dt ] == NULL );
return ( bli_func_get_dt( dt, f ) == NULL );
}
bool_t bli_func_is_null( func_t* f )
@@ -87,7 +107,7 @@ bool_t bli_func_is_null( func_t* f )
// return FALSE. Otherwise, if they are all null, return TRUE.
for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
{
if ( f->ptr[ dt ] != NULL )
if ( bli_func_get_dt( dt, f ) != NULL )
{
r_val = FALSE;
break;

View File

@@ -49,18 +49,29 @@
// -----------------------------------------------------------------------------
func_t* bli_func_obj_create( void* ptr_s,
void* ptr_d,
void* ptr_c,
void* ptr_z );
func_t* bli_func_create
(
void* ptr_s,
void* ptr_d,
void* ptr_c,
void* ptr_z
);
void bli_func_obj_init( func_t* f,
void* ptr_s,
void* ptr_d,
void* ptr_c,
void* ptr_z );
void bli_func_init
(
func_t* f,
void* ptr_s,
void* ptr_d,
void* ptr_c,
void* ptr_z
);
void bli_func_obj_free( func_t* f );
void bli_func_init_null
(
func_t* f
);
void bli_func_free( func_t* f );
// -----------------------------------------------------------------------------

View File

@@ -74,12 +74,6 @@ static blksz_t bli_gks_blkszs[BLIS_NUM_BLKSZS] =
/* df */ { { BLIS_DEFAULT_DF_S, BLIS_DEFAULT_DF_C, BLIS_DEFAULT_DF_D, BLIS_DEFAULT_DF_Z, },
{ BLIS_DEFAULT_DF_S, BLIS_DEFAULT_DF_C, BLIS_DEFAULT_DF_D, BLIS_DEFAULT_DF_Z, }
},
/* xf */ { { BLIS_DEFAULT_XF_S, BLIS_DEFAULT_XF_C, BLIS_DEFAULT_XF_D, BLIS_DEFAULT_XF_Z, },
{ BLIS_DEFAULT_XF_S, BLIS_DEFAULT_XF_C, BLIS_DEFAULT_XF_D, BLIS_DEFAULT_XF_Z, }
},
/* vf */ { { BLIS_DEFAULT_VF_S, BLIS_DEFAULT_VF_C, BLIS_DEFAULT_VF_D, BLIS_DEFAULT_VF_Z, },
{ BLIS_DEFAULT_VF_S, BLIS_DEFAULT_VF_C, BLIS_DEFAULT_VF_D, BLIS_DEFAULT_VF_Z, }
},
};
// -----------------------------------------------------------------------------

View File

@@ -35,29 +35,38 @@
#include "blis.h"
mbool_t* bli_mbool_obj_create( bool_t b_s,
bool_t b_d,
bool_t b_c,
bool_t b_z )
mbool_t* bli_mbool_create
(
bool_t b_s,
bool_t b_d,
bool_t b_c,
bool_t b_z
)
{
mbool_t* b;
b = ( mbool_t* ) bli_malloc_intl( sizeof(mbool_t) );
bli_mbool_obj_init( b,
b_s,
b_d,
b_c,
b_z );
bli_mbool_init
(
b,
b_s,
b_d,
b_c,
b_z
);
return b;
}
void bli_mbool_obj_init( mbool_t* b,
bool_t b_s,
bool_t b_d,
bool_t b_c,
bool_t b_z )
void bli_mbool_init
(
mbool_t* b,
bool_t b_s,
bool_t b_d,
bool_t b_c,
bool_t b_z
)
{
bli_mbool_set_dt( b_s, BLIS_FLOAT, b );
bli_mbool_set_dt( b_d, BLIS_DOUBLE, b );
@@ -65,7 +74,7 @@ void bli_mbool_obj_init( mbool_t* b,
bli_mbool_set_dt( b_z, BLIS_DCOMPLEX, b );
}
void bli_mbool_obj_free( mbool_t* b )
// Releases the mbool_t pointed to by b by passing it to bli_free_intl().
void bli_mbool_free( mbool_t* b )
{
bli_free_intl( b );
}

View File

@@ -49,16 +49,22 @@
// -----------------------------------------------------------------------------
mbool_t* bli_mbool_obj_create( bool_t b_s,
bool_t b_d,
bool_t b_c,
bool_t b_z );
mbool_t* bli_mbool_create
(
bool_t b_s,
bool_t b_d,
bool_t b_c,
bool_t b_z
);
void bli_mbool_obj_init( mbool_t* b,
bool_t b_s,
bool_t b_d,
bool_t b_c,
bool_t b_z );
void bli_mbool_init
(
mbool_t* b,
bool_t b_s,
bool_t b_d,
bool_t b_c,
bool_t b_z
);
void bli_mbool_obj_free( mbool_t* b );
void bli_mbool_free( mbool_t* b );

View File

@@ -44,6 +44,7 @@ void bli_membrk_init
bli_mutex_init( bli_membrk_mutex( membrk ) );
bli_membrk_init_pools( cntx, membrk );
bli_membrk_set_malloc_fp( bli_malloc_pool, membrk );
bli_membrk_set_free_fp( bli_free_pool, membrk );
}
void bli_membrk_finalize

View File

@@ -41,7 +41,12 @@
-lf2c -lm (in that order)
*/
bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, ftnlen ca_len, ftnlen cb_len)
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(char *ca, char *cb, long ca_len, long cb_len)
#else
int PASTEF770(lsame)(char *ca, char *cb, int ca_len, int cb_len)
#endif
{
/* System generated locals */
bla_logical ret_val;
@@ -115,11 +120,11 @@ bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, f
/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
/* upper case 'Z'. */
if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta
if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta
>= 162 && inta <= 169)) {
inta += 64;
}
if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb
if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb
>= 162 && intb <= 169)) {
intb += 64;
}

View File

@@ -34,6 +34,10 @@
#ifdef BLIS_ENABLE_BLAS2BLIS
bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, ftnlen ca_len, ftnlen cb_len);
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(char *ca, char *cb, long ca_len, long cb_len);
#else
int PASTEF770(lsame)(char *ca, char *cb, int ca_len, int cb_len);
#endif
#endif

View File

@@ -41,6 +41,7 @@
#include <string.h>
#include <stdarg.h>
#include <float.h>
#include <errno.h>
// Determine if we are on a 64-bit or 32-bit architecture
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
@@ -66,6 +67,8 @@
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
defined(__bsdi__) || defined(__DragonFly__)
#define BLIS_OS_BSD 1
#elif defined(EMSCRIPTEN)
#define BLIS_OS_EMSCRIPTEN
#else
#error "Cannot determine operating system"
#endif

View File

@@ -438,7 +438,7 @@ typedef enum
BLIS_INT = BLIS_BITVAL_INT_TYPE,
BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE,
BLIS_DT_LO = BLIS_FLOAT,
BLIS_DT_HI = BLIS_DCOMPLEX,
BLIS_DT_HI = BLIS_DCOMPLEX
} num_t;
typedef enum
@@ -482,7 +482,7 @@ typedef enum
BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E,
BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R,
BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;
// We combine row and column packing into one "type", and we start
@@ -511,7 +511,7 @@ typedef enum
BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE,
BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;
@@ -590,7 +590,7 @@ typedef enum
BLIS_4M1B,
BLIS_4M1A,
BLIS_1M,
BLIS_NAT,
BLIS_NAT
} ind_t;
#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)
@@ -613,7 +613,7 @@ typedef enum
BLIS_SETV_KER,
BLIS_SUBV_KER,
BLIS_SWAPV_KER,
BLIS_XPBYV_KER,
BLIS_XPBYV_KER
} l1vkr_t;
#define BLIS_NUM_LEVEL1V_KERS 14
@@ -625,19 +625,93 @@ typedef enum
BLIS_DOTAXPYV_KER,
BLIS_AXPYF_KER,
BLIS_DOTXF_KER,
BLIS_DOTXAXPYF_KER,
BLIS_DOTXAXPYF_KER
} l1fkr_t;
#define BLIS_NUM_LEVEL1F_KERS 5
// Kernel ids for the packm and unpackm kernels. A value of this type is
// used as an index into a context's packm kernel array or its unpackm
// kernel array.
//
// NOTE: The BLIS_PACKM_* and BLIS_UNPACKM_* constants deliberately cover
// the same numeric range (0 through 31): the two families index two
// separate arrays, so BLIS_PACKM_nXK_KER and BLIS_UNPACKM_nXK_KER share a
// value. As a consequence the two families cannot be told apart by value
// alone.
// NOTE(review): presumably the number in each name is the panel dimension
// handled by that kernel -- confirm against the kernel definitions.
typedef enum
{
// Ids for packm kernels.
BLIS_PACKM_0XK_KER = 0,
BLIS_PACKM_1XK_KER = 1,
BLIS_PACKM_2XK_KER = 2,
BLIS_PACKM_3XK_KER = 3,
BLIS_PACKM_4XK_KER = 4,
BLIS_PACKM_5XK_KER = 5,
BLIS_PACKM_6XK_KER = 6,
BLIS_PACKM_7XK_KER = 7,
BLIS_PACKM_8XK_KER = 8,
BLIS_PACKM_9XK_KER = 9,
BLIS_PACKM_10XK_KER = 10,
BLIS_PACKM_11XK_KER = 11,
BLIS_PACKM_12XK_KER = 12,
BLIS_PACKM_13XK_KER = 13,
BLIS_PACKM_14XK_KER = 14,
BLIS_PACKM_15XK_KER = 15,
BLIS_PACKM_16XK_KER = 16,
BLIS_PACKM_17XK_KER = 17,
BLIS_PACKM_18XK_KER = 18,
BLIS_PACKM_19XK_KER = 19,
BLIS_PACKM_20XK_KER = 20,
BLIS_PACKM_21XK_KER = 21,
BLIS_PACKM_22XK_KER = 22,
BLIS_PACKM_23XK_KER = 23,
BLIS_PACKM_24XK_KER = 24,
BLIS_PACKM_25XK_KER = 25,
BLIS_PACKM_26XK_KER = 26,
BLIS_PACKM_27XK_KER = 27,
BLIS_PACKM_28XK_KER = 28,
BLIS_PACKM_29XK_KER = 29,
BLIS_PACKM_30XK_KER = 30,
BLIS_PACKM_31XK_KER = 31,
// Ids for unpackm kernels; numbering intentionally restarts at 0.
BLIS_UNPACKM_0XK_KER = 0,
BLIS_UNPACKM_1XK_KER = 1,
BLIS_UNPACKM_2XK_KER = 2,
BLIS_UNPACKM_3XK_KER = 3,
BLIS_UNPACKM_4XK_KER = 4,
BLIS_UNPACKM_5XK_KER = 5,
BLIS_UNPACKM_6XK_KER = 6,
BLIS_UNPACKM_7XK_KER = 7,
BLIS_UNPACKM_8XK_KER = 8,
BLIS_UNPACKM_9XK_KER = 9,
BLIS_UNPACKM_10XK_KER = 10,
BLIS_UNPACKM_11XK_KER = 11,
BLIS_UNPACKM_12XK_KER = 12,
BLIS_UNPACKM_13XK_KER = 13,
BLIS_UNPACKM_14XK_KER = 14,
BLIS_UNPACKM_15XK_KER = 15,
BLIS_UNPACKM_16XK_KER = 16,
BLIS_UNPACKM_17XK_KER = 17,
BLIS_UNPACKM_18XK_KER = 18,
BLIS_UNPACKM_19XK_KER = 19,
BLIS_UNPACKM_20XK_KER = 20,
BLIS_UNPACKM_21XK_KER = 21,
BLIS_UNPACKM_22XK_KER = 22,
BLIS_UNPACKM_23XK_KER = 23,
BLIS_UNPACKM_24XK_KER = 24,
BLIS_UNPACKM_25XK_KER = 25,
BLIS_UNPACKM_26XK_KER = 26,
BLIS_UNPACKM_27XK_KER = 27,
BLIS_UNPACKM_28XK_KER = 28,
BLIS_UNPACKM_29XK_KER = 29,
BLIS_UNPACKM_30XK_KER = 30,
BLIS_UNPACKM_31XK_KER = 31
} l1mkr_t;
// Number of ids in each family above; these must be kept in sync with the
// enum, since they are used to size the context's packm_kers and
// unpackm_kers arrays.
#define BLIS_NUM_PACKM_KERS 32
#define BLIS_NUM_UNPACKM_KERS 32
typedef enum
{
BLIS_GEMM_UKR = 0,
BLIS_GEMMTRSM_L_UKR,
BLIS_GEMMTRSM_U_UKR,
BLIS_TRSM_L_UKR,
BLIS_TRSM_U_UKR,
BLIS_TRSM_U_UKR
} l3ukr_t;
#define BLIS_NUM_LEVEL3_UKRS 5
@@ -648,7 +722,7 @@ typedef enum
BLIS_REFERENCE_UKERNEL = 0,
BLIS_VIRTUAL_UKERNEL,
BLIS_OPTIMIZED_UKERNEL,
BLIS_NOTAPPLIC_UKERNEL,
BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;
#define BLIS_NUM_UKR_IMPL_TYPES 4
@@ -662,7 +736,7 @@ typedef enum
BLIS_IC_IDX,
BLIS_JR_IDX,
BLIS_IR_IDX,
BLIS_PR_IDX,
BLIS_PR_IDX
} thridx_t;
#endif
@@ -683,7 +757,7 @@ typedef enum
// value that can be subtracted from the opid_t value to map it
// to a zero-based range.
// This is needed because these level-3 opid_t values are used in
// bli_ind_query.c to index into arrays.
// bli_l3_ind.c to index into arrays.
//
BLIS_GEMM = 0,
BLIS_HEMM,
@@ -696,7 +770,7 @@ typedef enum
BLIS_TRMM,
BLIS_TRSM,
BLIS_NOID,
BLIS_NOID
} opid_t;
#define BLIS_NUM_LEVEL3_OPS 10
@@ -714,16 +788,14 @@ typedef enum
BLIS_NC,
BLIS_M2, // level-2 blocksize in m dimension
BLIS_N2, // level-2 blocksize in n dimension
BLIS_1F, // level-1f global fusing factor
BLIS_AF, // level-1f axpyf fusing factor
BLIS_DF, // level-1f dotxf fusing factor
BLIS_XF, // level-1f dotxaxpyf fusing factor
BLIS_VF, // level-1v vector fusing factor
BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable.
BLIS_NO_PART // used as a placeholder when blocksizes are not applicable.
} bszid_t;
#define BLIS_NUM_BLKSZS 13
#define BLIS_NUM_BLKSZS 11
//
@@ -784,6 +856,7 @@ typedef struct mem_s
struct cntl_s
{
// Basic fields (usually required).
opid_t family;
bszid_t bszid;
void* var_func;
struct cntl_s* sub_node;
@@ -971,9 +1044,9 @@ typedef struct cntx_s
func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ];
func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ];
func_t packm_ukrs;
func_t packm_kers[ BLIS_NUM_PACKM_KERS ];
func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ];
opid_t family;
ind_t method;
pack_t schema_a_block;
pack_t schema_b_panel;
@@ -992,7 +1065,7 @@ typedef struct cntx_s
typedef enum
{
BLIS_NO_ERROR_CHECKING = 0,
BLIS_FULL_ERROR_CHECKING,
BLIS_FULL_ERROR_CHECKING
} errlev_t;
typedef enum

View File

@@ -122,7 +122,7 @@ void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx )
const ind_t method = BLIS_3M1;
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.
@@ -170,7 +170,7 @@ void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx )
const ind_t method = BLIS_3M2;
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.
@@ -218,7 +218,7 @@ void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx )
const ind_t method = BLIS_3M3;
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.
@@ -279,7 +279,7 @@ void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx )
const ind_t method = BLIS_3MH;
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.
@@ -343,7 +343,7 @@ void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx )
const ind_t method = BLIS_4M1A;
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.
@@ -391,7 +391,7 @@ void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx )
const ind_t method = BLIS_4M1B;
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.
@@ -439,7 +439,7 @@ void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx )
const ind_t method = BLIS_4MH;
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.
@@ -524,7 +524,7 @@ void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx )
const ind_t method = BLIS_1M;
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.

View File

@@ -41,7 +41,7 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx )
const ind_t method = BLIS_3M1;
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.
@@ -89,7 +89,7 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx )
const ind_t method = BLIS_4M1A;
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.
@@ -137,7 +137,7 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx )
const ind_t method = BLIS_1M;
// Clear the context fields.
bli_cntx_obj_clear( cntx );
bli_cntx_clear( cntx );
// Initialize the context with the current architecture's native
// level-3 gemm micro-kernel, and its output preferences.

View File

@@ -36,19 +36,82 @@
void* bli_thrcomm_bcast
(
thrcomm_t* communicator,
thrcomm_t* comm,
dim_t id,
void* to_send
)
{
if ( communicator == NULL || communicator->n_threads == 1 ) return to_send;
if ( comm == NULL || comm->n_threads == 1 ) return to_send;
if ( id == 0 ) communicator->sent_object = to_send;
if ( id == 0 ) comm->sent_object = to_send;
bli_thrcomm_barrier( communicator, id );
void* object = communicator->sent_object;
bli_thrcomm_barrier( communicator, id );
bli_thrcomm_barrier( comm, id );
void* object = comm->sent_object;
bli_thrcomm_barrier( comm, id );
return object;
}
// Use __sync_* builtins (assumed available) if __atomic_* ones are not present.
//
// The absence of __ATOMIC_RELAXED is used as the signal that the __atomic_*
// builtins are unavailable. In that case the shim below maps each __atomic_*
// call used in this file onto a __sync_* builtin.
#ifndef __ATOMIC_RELAXED
// The memory-order names are defined as empty tokens so that call sites which
// pass them still preprocess; the replacement macros below accept the
// "constraint" argument but never use it, so the requested ordering is dropped.
// NOTE(review): the ordering actually obtained is whatever the __sync_*
// builtins provide (GCC documents them as full barriers) -- confirm for other
// compilers.
#define __ATOMIC_RELAXED
#define __ATOMIC_ACQUIRE
#define __ATOMIC_RELEASE
#define __ATOMIC_ACQ_REL
// Atomic load, emulated as a fetch-and-add of zero.
#define __atomic_load_n(ptr, constraint) \
__sync_fetch_and_add(ptr, 0)
// Atomic add, yielding the result of __sync_add_and_fetch.
#define __atomic_add_fetch(ptr, value, constraint) \
__sync_add_and_fetch(ptr, value)
// Atomic add, yielding the result of __sync_fetch_and_add.
#define __atomic_fetch_add(ptr, value, constraint) \
__sync_fetch_and_add(ptr, value)
// Atomic exclusive-or, yielding the result of __sync_fetch_and_xor.
#define __atomic_fetch_xor(ptr, value, constraint) \
__sync_fetch_and_xor(ptr, value)
#endif
// Sense-toggling spin barrier built on the __atomic_* builtins (or their
// __sync_* fallbacks defined above).
//
// comm: communicator whose n_threads, barrier_sense and
//       barrier_threads_arrived fields drive the barrier. A NULL comm or a
//       comm with n_threads == 1 makes the call a no-op.
// t_id: id of the calling thread; it is not referenced in the body.
//
// Returns nothing; the call returns once comm->n_threads threads have
// entered the barrier.
void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id )
{
// Return early if the comm is NULL or if there is only one
// thread participating.
if ( comm == NULL || comm->n_threads == 1 ) return;
// Read the "sense" variable. This variable is akin to a unique ID for
// the current barrier. The first n-1 threads will spin on this variable
// until it changes. The sense variable gets changed by the last
// thread to enter the barrier, just before it exits. But it turns out
// that you don't need many unique IDs before you can wrap around. In
// fact, if everything else is working, a binary variable is sufficient,
// which is what we do here (i.e., the variable is toggled with an XOR:
// 0 becomes 1, which then becomes 0 again, and so forth).
bool_t orig_sense = __atomic_load_n( &comm->barrier_sense, __ATOMIC_RELAXED );
// Register ourselves (the current thread) as having arrived by
// incrementing the barrier_threads_arrived variable. We must perform
// this increment (and a subsequent read) atomically, so that exactly
// one thread observes the count reaching n_threads.
dim_t my_threads_arrived =
__atomic_add_fetch( &comm->barrier_threads_arrived, 1, __ATOMIC_ACQ_REL );
// If the current thread was the last thread to have arrived, then
// it will take actions that effectively end and reset the barrier.
if ( my_threads_arrived == comm->n_threads )
{
// Reset the variable tracking the number of threads that have arrived
// to zero (which returns the barrier to the "empty" state). Then
// atomically toggle the barrier sense variable. This will signal to
// the other threads (which are spinning in the branch below) that it
// is now safe to exit the barrier.
// NOTE(review): the reset is a plain (non-atomic) store placed before
// the release-ordered toggle; the order of these two statements looks
// deliberate -- confirm before rearranging.
comm->barrier_threads_arrived = 0;
__atomic_fetch_xor( &comm->barrier_sense, 1, __ATOMIC_RELEASE );
}
else
{
// If the current thread is NOT the last thread to have arrived, then
// it spins on the sense variable until that sense variable changes, at
// which time these threads will exit the barrier.
while ( __atomic_load_n( &comm->barrier_sense, __ATOMIC_ACQUIRE ) == orig_sense )
; // Empty loop body.
}
}

View File

@@ -49,11 +49,13 @@
// Thread communicator prototypes.
thrcomm_t* bli_thrcomm_create( dim_t n_threads );
void bli_thrcomm_free( thrcomm_t* communicator );
void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads );
void bli_thrcomm_cleanup( thrcomm_t* communicator );
void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t thread_id );
void* bli_thrcomm_bcast( thrcomm_t* communicator, dim_t inside_id, void* to_send );
void bli_thrcomm_free( thrcomm_t* comm );
void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads );
void bli_thrcomm_cleanup( thrcomm_t* comm );
void bli_thrcomm_barrier( thrcomm_t* comm, dim_t thread_id );
void* bli_thrcomm_bcast( thrcomm_t* comm, dim_t inside_id, void* to_send );
void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id );
#endif

View File

@@ -44,63 +44,66 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads )
return comm;
}
void bli_thrcomm_free( thrcomm_t* communicator )
void bli_thrcomm_free( thrcomm_t* comm )
{
if ( communicator == NULL ) return;
bli_thrcomm_cleanup( communicator );
bli_free_intl( communicator );
if ( comm == NULL ) return;
bli_thrcomm_cleanup( comm );
bli_free_intl( comm );
}
#ifndef BLIS_TREE_BARRIER
void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads)
void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads)
{
if ( communicator == NULL ) return;
communicator->sent_object = NULL;
communicator->n_threads = n_threads;
communicator->barrier_sense = 0;
communicator->barrier_threads_arrived = 0;
if ( comm == NULL ) return;
comm->sent_object = NULL;
comm->n_threads = n_threads;
comm->barrier_sense = 0;
comm->barrier_threads_arrived = 0;
}
void bli_thrcomm_cleanup( thrcomm_t* communicator )
void bli_thrcomm_cleanup( thrcomm_t* comm )
{
if ( communicator == NULL ) return;
if ( comm == NULL ) return;
}
//'Normal' barrier for openmp
//barrier routine taken from art of multicore programming
void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id )
void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id )
{
if( communicator == NULL || communicator->n_threads == 1 )
#if 0
if ( comm == NULL || comm->n_threads == 1 )
return;
bool_t my_sense = communicator->barrier_sense;
bool_t my_sense = comm->barrier_sense;
dim_t my_threads_arrived;
_Pragma( "omp atomic capture" )
my_threads_arrived = ++(communicator->barrier_threads_arrived);
my_threads_arrived = ++(comm->barrier_threads_arrived);
if ( my_threads_arrived == communicator->n_threads )
if ( my_threads_arrived == comm->n_threads )
{
communicator->barrier_threads_arrived = 0;
communicator->barrier_sense = !communicator->barrier_sense;
comm->barrier_threads_arrived = 0;
comm->barrier_sense = !comm->barrier_sense;
}
else
{
volatile bool_t* listener = &communicator->barrier_sense;
volatile bool_t* listener = &comm->barrier_sense;
while ( *listener == my_sense ) {}
}
#endif
bli_thrcomm_barrier_atomic( comm, t_id );
}
#else
void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads)
void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads)
{
if ( communicator == NULL ) return;
communicator->sent_object = NULL;
communicator->n_threads = n_threads;
communicator->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads );
bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, communicator->barriers, 0 );
if ( comm == NULL ) return;
comm->sent_object = NULL;
comm->n_threads = n_threads;
comm->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads );
bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, comm->barriers, 0 );
}
//Tree barrier used for Intel Xeon Phi
@@ -145,14 +148,14 @@ barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_
return me;
}
void bli_thrcomm_cleanup( thrcomm_t* communicator )
void bli_thrcomm_cleanup( thrcomm_t* comm )
{
if ( communicator == NULL ) return;
for ( dim_t i = 0; i < communicator->n_threads; i++ )
if ( comm == NULL ) return;
for ( dim_t i = 0; i < comm->n_threads; i++ )
{
bli_thrcomm_tree_barrier_free( communicator->barriers[i] );
bli_thrcomm_tree_barrier_free( comm->barriers[i] );
}
bli_free_intl( communicator->barriers );
bli_free_intl( comm->barriers );
}
void bli_thrcomm_tree_barrier_free( barrier_t* barrier )
@@ -204,6 +207,7 @@ void bli_thrcomm_tree_barrier( barrier_t* barack )
void bli_l3_thread_decorator
(
l3int_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* b,
@@ -231,7 +235,7 @@ void bli_l3_thread_decorator
thrinfo_t* thread;
// Create a default control tree for the operation, if needed.
bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use );
bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use );
// Create the root node of the current thread's thrinfo_t structure.
bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread );
@@ -249,7 +253,7 @@ void bli_l3_thread_decorator
);
// Free the control tree, if one was created locally.
bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread );
bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread );
#ifdef PRINT_THRINFO
threads[id] = thread;

View File

@@ -60,11 +60,12 @@ struct thrcomm_s
#else
struct thrcomm_s
{
void* sent_object;
dim_t n_threads;
void* sent_object;
dim_t n_threads;
volatile bool_t barrier_sense;
dim_t barrier_threads_arrived;
//volatile bool_t barrier_sense;
bool_t barrier_sense;
dim_t barrier_threads_arrived;
};
#endif

View File

@@ -43,81 +43,84 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads )
return comm;
}
void bli_thrcomm_free( thrcomm_t* communicator )
void bli_thrcomm_free( thrcomm_t* comm )
{
if ( communicator == NULL ) return;
bli_thrcomm_cleanup( communicator );
bli_free_intl( communicator );
if ( comm == NULL ) return;
bli_thrcomm_cleanup( comm );
bli_free_intl( comm );
}
#ifdef BLIS_USE_PTHREAD_BARRIER
void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads)
void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads)
{
if ( communicator == NULL ) return;
communicator->sent_object = NULL;
communicator->n_threads = n_threads;
pthread_barrier_init( &communicator->barrier, NULL, n_threads );
if ( comm == NULL ) return;
comm->sent_object = NULL;
comm->n_threads = n_threads;
pthread_barrier_init( &comm->barrier, NULL, n_threads );
}
void bli_thrcomm_cleanup( thrcomm_t* communicator )
void bli_thrcomm_cleanup( thrcomm_t* comm )
{
if ( communicator == NULL ) return;
pthread_barrier_destroy( &communicator->barrier );
if ( comm == NULL ) return;
pthread_barrier_destroy( &comm->barrier );
}
void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id )
void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id )
{
pthread_barrier_wait( &communicator->barrier );
pthread_barrier_wait( &comm->barrier );
}
#else
void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads)
void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads)
{
if ( communicator == NULL ) return;
communicator->sent_object = NULL;
communicator->n_threads = n_threads;
communicator->sense = 0;
communicator->threads_arrived = 0;
if ( comm == NULL ) return;
comm->sent_object = NULL;
comm->n_threads = n_threads;
comm->barrier_sense = 0;
comm->barrier_threads_arrived = 0;
#ifdef BLIS_USE_PTHREAD_MUTEX
pthread_mutex_init( &communicator->mutex, NULL );
#endif
//#ifdef BLIS_USE_PTHREAD_MUTEX
// pthread_mutex_init( &comm->mutex, NULL );
//#endif
}
void bli_thrcomm_cleanup( thrcomm_t* communicator )
void bli_thrcomm_cleanup( thrcomm_t* comm )
{
#ifdef BLIS_USE_PTHREAD_MUTEX
if ( communicator == NULL ) return;
pthread_mutex_destroy( &communicator->mutex );
#endif
//#ifdef BLIS_USE_PTHREAD_MUTEX
// if ( comm == NULL ) return;
// pthread_mutex_destroy( &comm->mutex );
//#endif
}
void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id )
void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id )
{
if ( communicator == NULL || communicator->n_threads == 1 ) return;
bool_t my_sense = communicator->sense;
#if 0
if ( comm == NULL || comm->n_threads == 1 ) return;
bool_t my_sense = comm->sense;
dim_t my_threads_arrived;
#ifdef BLIS_USE_PTHREAD_MUTEX
pthread_mutex_lock( &communicator->mutex );
my_threads_arrived = ++(communicator->threads_arrived);
pthread_mutex_unlock( &communicator->mutex );
pthread_mutex_lock( &comm->mutex );
my_threads_arrived = ++(comm->threads_arrived);
pthread_mutex_unlock( &comm->mutex );
#else
my_threads_arrived = __sync_add_and_fetch(&(communicator->threads_arrived), 1);
my_threads_arrived = __sync_add_and_fetch(&(comm->threads_arrived), 1);
#endif
if ( my_threads_arrived == communicator->n_threads )
if ( my_threads_arrived == comm->n_threads )
{
communicator->threads_arrived = 0;
communicator->sense = !communicator->sense;
comm->threads_arrived = 0;
comm->sense = !comm->sense;
}
else
{
volatile bool_t* listener = &communicator->sense;
volatile bool_t* listener = &comm->sense;
while( *listener == my_sense ) {}
}
#endif
bli_thrcomm_barrier_atomic( comm, t_id );
}
#endif
@@ -129,6 +132,7 @@ void* bli_l3_thread_entry( void* data_void );
typedef struct thread_data
{
l3int_t func;
opid_t family;
obj_t* alpha;
obj_t* a;
obj_t* b;
@@ -145,6 +149,7 @@ void* bli_l3_thread_entry( void* data_void )
{
thread_data_t* data = data_void;
opid_t family = data->family;
obj_t* alpha = data->alpha;
obj_t* a = data->a;
obj_t* b = data->b;
@@ -159,7 +164,7 @@ void* bli_l3_thread_entry( void* data_void )
thrinfo_t* thread;
// Create a default control tree for the operation, if needed.
bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use );
bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use );
// Create the root node of the current thread's thrinfo_t structure.
bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread );
@@ -177,7 +182,7 @@ void* bli_l3_thread_entry( void* data_void )
);
// Free the control tree, if one was created locally.
bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread );
bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread );
// Free the current thread's thrinfo_t structure.
bli_l3_thrinfo_free( thread );
@@ -188,6 +193,7 @@ void* bli_l3_thread_entry( void* data_void )
void bli_l3_thread_decorator
(
l3int_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* b,
@@ -214,6 +220,7 @@ void bli_l3_thread_decorator
{
// Set up thread data for additional threads (beyond thread 0).
datas[id].func = func;
datas[id].family = family;
datas[id].alpha = alpha;
datas[id].a = a;
datas[id].b = b;

Some files were not shown because too many files have changed in this diff Show More