From 66dbe69a0f9359bf1e39b5672ee365213de2e3ee Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 25 May 2018 15:45:53 -0500 Subject: [PATCH 01/37] Converted macros to static funcs in _packm_cntl.h. Details: - Converted various macros in frame/1m/packm/bli_packm_cntl.h (designed to access fields of a packm_params_t struct) to static functions. --- frame/1m/packm/bli_packm_cntl.h | 56 +++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index ab22e8621..c982aa0b6 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -46,37 +46,45 @@ struct packm_params_s }; typedef struct packm_params_s packm_params_t; -#define bli_cntl_packm_params_var_func( cntl ) \ -\ - ( ( (packm_params_t*)(cntl)->params )->var_func ) +static packm_voft bli_cntl_packm_params_var_func( cntl_t* cntl ) +{ + packm_params_t* ppp = cntl->params; return ppp->var_func; +} -#define bli_cntl_packm_params_bmid_m( cntl ) \ -\ - ( ( (packm_params_t*)(cntl)->params )->bmid_m ) +static bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) +{ + packm_params_t* ppp = cntl->params; return ppp->bmid_m; +} -#define bli_cntl_packm_params_bmid_n( cntl ) \ -\ - ( ( (packm_params_t*)(cntl)->params )->bmid_n ) +static bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl ) +{ + packm_params_t* ppp = cntl->params; return ppp->bmid_n; +} -#define bli_cntl_packm_params_does_invert_diag( cntl ) \ -\ - ( ( (packm_params_t*)(cntl)->params )->does_invert_diag ) +static bool_t bli_cntl_packm_params_does_invert_diag( cntl_t* cntl ) +{ + packm_params_t* ppp = cntl->params; return ppp->does_invert_diag; +} -#define bli_cntl_packm_params_rev_iter_if_upper( cntl ) \ -\ - ( ( (packm_params_t*)(cntl)->params )->rev_iter_if_upper ) +static bool_t bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl ) +{ + packm_params_t* ppp = cntl->params; return ppp->rev_iter_if_upper; +} -#define 
bli_cntl_packm_params_rev_iter_if_lower( cntl ) \ -\ - ( ( (packm_params_t*)(cntl)->params )->rev_iter_if_lower ) +static bool_t bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl ) +{ + packm_params_t* ppp = cntl->params; return ppp->rev_iter_if_lower; +} -#define bli_cntl_packm_params_pack_schema( cntl ) \ -\ - ( ( (packm_params_t*)(cntl)->params )->pack_schema ) +static pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl ) +{ + packm_params_t* ppp = cntl->params; return ppp->pack_schema; +} -#define bli_cntl_packm_params_pack_buf_type( cntl ) \ -\ - ( ( (packm_params_t*)(cntl)->params )->pack_buf_type ) +static packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) +{ + packm_params_t* ppp = cntl->params; return ppp->pack_buf_type; +} // ----------------------------------------------------------------------------- From 469727d4f8a976d8713afb4d0b6235c322498db0 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 25 May 2018 16:17:13 -0500 Subject: [PATCH 02/37] Very minor comment updates. --- frame/base/bli_cntx.c | 4 ++-- frame/ind/oapi/bli_l3_nat_oapi.c | 19 +++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index db2d73953..a6604ed45 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -934,8 +934,8 @@ void bli_cntx_set_thrloop_from_env if ( l3_op == BLIS_TRMM ) { - // We reconfigure the paralelism from trmm_r due to a dependency in - // the jc loop. (NOTE: This dependency does not exist for trmm3 ) + // We reconfigure the parallelism from trmm_r due to a dependency in + // the jc loop. (NOTE: This dependency does not exist for trmm3.) 
if ( bli_is_right( side ) ) { bli_cntx_set_thrloop diff --git a/frame/ind/oapi/bli_l3_nat_oapi.c b/frame/ind/oapi/bli_l3_nat_oapi.c index 7a8aa3971..c6d3bdb21 100644 --- a/frame/ind/oapi/bli_l3_nat_oapi.c +++ b/frame/ind/oapi/bli_l3_nat_oapi.c @@ -60,15 +60,18 @@ void PASTEMAC(opname,imeth) \ /* Obtain a valid (native) context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - /* Invoke the operation's front end with the appropriate control - tree. */ \ + /* Invoke the operation's front end. */ \ PASTEMAC(opname,_front) \ ( \ alpha, a, b, beta, c, cntx, NULL \ ); \ } +// If a sandbox was enabled, do not define bli_gemmnat() since it will be +// defined in the sandbox environment. +#ifndef BLIS_ENABLE_SANDBOX GENFRONT( gemm, gemm, nat ) +#endif GENFRONT( her2k, gemm, nat ) GENFRONT( syr2k, gemm, nat ) @@ -94,8 +97,7 @@ void PASTEMAC(opname,imeth) \ /* Obtain a valid (native) context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - /* Invoke the operation's front end with the appropriate control - tree. */ \ + /* Invoke the operation's front end. */ \ PASTEMAC(opname,_front) \ ( \ side, alpha, a, b, beta, c, cntx, NULL \ @@ -126,8 +128,7 @@ void PASTEMAC(opname,imeth) \ /* Obtain a valid (native) context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - /* Invoke the operation's front end with the appropriate control - tree. */ \ + /* Invoke the operation's front end. */ \ PASTEMAC(opname,_front) \ ( \ alpha, a, beta, c, cntx, NULL \ @@ -157,8 +158,7 @@ void PASTEMAC(opname,imeth) \ /* Obtain a valid (native) context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - /* Invoke the operation's front end with the appropriate control - tree. */ \ + /* Invoke the operation's front end. 
*/ \ PASTEMAC(opname,_front) \ ( \ side, alpha, a, b, cntx, NULL \ @@ -187,8 +187,7 @@ void PASTEMAC(opname,imeth) \ /* Obtain a valid (native) context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - /* Invoke the operation's front end with the appropriate control - tree. */ \ + /* Invoke the operation's front end. */ \ PASTEMAC(opname,_front) \ ( \ side, alpha, a, b, cntx, NULL \ From 850a8a46c0a569a2652d8c200e5c53b61bcf988d Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 29 May 2018 13:51:21 -0500 Subject: [PATCH 03/37] Test all x86_64 configurations*... (#212) * Add custom SDE cpuid files. * Set up testing of all x86_64 architectures (except bulldozer) using SDE. * Update .travis.yml [ci skip] * Update do_testsuite.sh [ci skip] * Updated .travis.yml with my secret token. Details: - Replaced Devin's temporary secret token with my own, which is used by Travis when accessing the Intel SDE via Dropbox. * Work around CPUID dispatch in glibc/libm by patching ld.so. * Detect path of loader at runtime. * Attempt to make SDE run on Travis * Allow unpatched ld.so if we don't know how to patch it. I *think* this only happens for older glibc without the multi-arch stuff (e.g. Ubuntu 14.04 on Travis), but who knows? * Upgrade Travis to gcc-6 and binutils-2.26. * Try to get Travis to use the right assembler. * Apparently you need ld-2.26 too. * Try to also patch ld.so from Ubuntu 14.04. * Take the nuclear option. * Account for non-absolute dependencies in ldd output. * String manipulation fail. * Update patch-ld-so.py * Add Zen to SDE testing. * Removed dead variable from travis/do_testsuite.sh. Details: - Removed 'BLIS_ENABLE_TEST_OUTPUT=yes' from make invocations in travis/do_testsuite.sh. This variable is no longer present in the BLIS build system (if it ever was?), and therefore has no effect. 
--- .travis.yml | 129 +++++++++++++------------------- config/excavator/make_defs.mk | 4 +- config/piledriver/make_defs.mk | 4 +- config/steamroller/make_defs.mk | 4 +- travis/cpuid/excavator.def | 78 +++++++++++++++++++ travis/cpuid/haswell.def | 65 ++++++++++++++++ travis/cpuid/penryn.def | 52 +++++++++++++ travis/cpuid/piledriver.def | 82 ++++++++++++++++++++ travis/cpuid/sandybridge.def | 55 ++++++++++++++ travis/cpuid/skx.def | 82 ++++++++++++++++++++ travis/cpuid/skx1.def | 82 ++++++++++++++++++++ travis/cpuid/steamroller.def | 80 ++++++++++++++++++++ travis/cpuid/zen.def | 83 ++++++++++++++++++++ travis/do_sde.sh | 47 ++++++++++++ travis/do_testsuite.sh | 15 ++++ travis/patch-ld-so.py | 16 ++++ 16 files changed, 794 insertions(+), 84 deletions(-) create mode 100644 travis/cpuid/excavator.def create mode 100644 travis/cpuid/haswell.def create mode 100644 travis/cpuid/penryn.def create mode 100644 travis/cpuid/piledriver.def create mode 100644 travis/cpuid/sandybridge.def create mode 100644 travis/cpuid/skx.def create mode 100644 travis/cpuid/skx1.def create mode 100644 travis/cpuid/steamroller.def create mode 100644 travis/cpuid/zen.def create mode 100755 travis/do_sde.sh create mode 100755 travis/do_testsuite.sh create mode 100755 travis/patch-ld-so.py diff --git a/.travis.yml b/.travis.yml index d1dcce71d..adcd16abf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,89 +1,62 @@ language: c -sudo: false - -os: - - linux - - osx - -compiler: - - gcc - - clang - +sudo: required +dist: trusty env: - - OOT=1 TEST=0 THR="none" CONF="auto" - - OOT=0 TEST=1 THR="none" CONF="auto" - - OOT=0 TEST=1 THR="none" CONF="penryn" - - OOT=0 TEST=0 THR="none" CONF="sandybridge" - - OOT=0 TEST=0 THR="none" CONF="haswell" - - OOT=0 TEST=0 THR="none" CONF="knl" - - OOT=0 TEST=0 THR="none" CONF="bulldozer" - - OOT=0 TEST=0 THR="none" CONF="piledriver" - - OOT=0 TEST=0 THR="none" CONF="steamroller" - - OOT=0 TEST=0 THR="none" CONF="excavator" - - OOT=0 TEST=0 THR="none" CONF="zen" - 
- OOT=0 TEST=0 THR="openmp" CONF="auto" - - OOT=0 TEST=0 THR="pthreads" CONF="auto" - + global: + secure: "Ty3PM1xGhXwxfJG6YyY9bUZyXzw98ekHxQEqU9VnrMXTZb28IxfocPCXHjL34r9HTGosO5Pmierhal1Cs3ZKE5ZAJqJhCfck+kwlH21Uay5CNYglDtSmy2qxtbbDG4AxpEZ1UKlIZr1pNh/x+pRemSmnMEnQp/E7QJqdkhm4+aMX2bWKyLPtrdL+B9QXLVT2nT6/Fw3i05aBhpcFJpSPfvYX2KoCZYdJOSKcKci4T8nAfP/c0olkz+jAkBZxZFgO9Ptrt/lvHtVPrkh5o29GvHg2i/4vucbsMltoxlV31/2eYpdr17Ngtt41MMVn2fHV4lVhLmENc04nlm084fBtg73T6b8hNy5JlcA44xI/UrPJsQAJ+0A0ds9BbBQKPxOmaF/O8WGXhwiwdKT6DGS9lj05f3S+yZfeNE3pQhLEcvwXLO5SW3VvKXMj0t/lZyG+XCkvFjD7KEPQV4g+BZc2zzD9TwDx3ydn8Uzd6zZlq1erQUzCnODP24wuwfrNP8nqxFYG0VtI8oZW62IC9U2hcnAF5QNXXW3yDYD65k3BHbigfI28gu9iO9G8RxOglR27J7Whdqkqw3AMRaqyHt2tdbz7tM2dLZ0EatT5m8esjC+LP4EshW9C59jP2U9vJ/94YEgOfwiqk8+e6fL/7dJvOumbwu1RclRI9DS88PPYb3Q=" matrix: - allow_failures: - - env: OOT=0 TEST=0 THR="none" CONF="knl" - - os: osx - env: OOT=0 TEST=1 THR="none" CONF="auto" - exclude: - - os: linux - compiler: clang - - os: osx - compiler: gcc - - os: osx - env: OOT=1 TEST=0 THR="none" CONF="auto" - - os: osx - env: OOT=0 TEST=1 THR="none" CONF="penryn" - - os: osx - env: OOT=0 TEST=0 THR="none" CONF="sandybridge" - - os: osx - env: OOT=0 TEST=0 THR="none" CONF="haswell" - - os: osx - env: OOT=0 TEST=0 THR="none" CONF="knl" - - os: osx - env: OOT=0 TEST=0 THR="none" CONF="bulldozer" - - os: osx - env: OOT=0 TEST=0 THR="none" CONF="piledriver" - - os: osx - env: OOT=0 TEST=0 THR="none" CONF="steamroller" - - os: osx - env: OOT=0 TEST=0 THR="none" CONF="excavator" - - os: osx - env: OOT=0 TEST=0 THR="none" CONF="zen" - - os: osx - env: OOT=0 TEST=0 THR="openmp" CONF="auto" - + include: + # full testsuite + - os: linux + compiler: gcc + env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto" + # test x86_64 ukrs with SDE + - os: linux + compiler: gcc + env: OOT=0 TEST=0 SDE=1 THR="none" CONF="x86_64" + # openmp build + - os: linux + compiler: gcc + env: OOT=0 TEST=0 SDE=0 THR="openmp" CONF="auto" + # pthreads build + - os: linux + compiler: gcc + 
env: OOT=0 TEST=0 SDE=0 THR="pthreads" CONF="auto" + # out-of-tree build + - os: linux + compiler: gcc + env: OOT=1 TEST=0 SDE=0 THR="none" CONF="auto" + # clang build + - os: linux + compiler: clang + env: OOT=0 TEST=0 SDE=0 THR="none" CONF="auto" + # macOS with system compiler (clang) + - os: osx + compiler: clang + env: OOT=0 TEST=0 SDE=0 THR="none" CONF="auto" install: - - if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-5"; fi - +- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/as; fi +- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/as /usr/bin/as; fi +- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/ld; fi +- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/ld /usr/bin/ld; fi +- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-6"; fi addons: apt: sources: - ubuntu-toolchain-r-test packages: - - gcc-5 + - gcc-6 + - binutils-2.26 - clang - script: - - export DIST_PATH=. - - pwd - - if [ $OOT -eq 1 ]; then mkdir oot; cd oot; export DIST_PATH=..; fi - - pwd - - $DIST_PATH/configure -t $THR CC=$CC $CONF - - pwd - - ls -l - - $CC --version - - make -j 2 - - export BLIS_IC_NT=2 - - export BLIS_JC_NT=1 - - export BLIS_IR_NT=1 - - export BLIS_JR_NT=1 - - if [ $TEST -eq 1 ]; then travis_wait 30 make BLIS_ENABLE_TEST_OUTPUT=yes testblis; fi - - if [ $TEST -eq 1 ]; then $DIST_PATH/build/check-blistest.sh ./output.testsuite; fi - - if [ $TEST -eq 1 ]; then make BLIS_ENABLE_TEST_OUTPUT=yes testblas; fi - - if [ $TEST -eq 1 ]; then $DIST_PATH/build/check-blastest.sh; fi - +- export DIST_PATH=. 
+- pwd +- if [ $OOT -eq 1 ]; then mkdir oot; cd oot; export DIST_PATH=..; fi +- pwd +- $DIST_PATH/configure -t $THR CC=$CC $CONF +- pwd +- ls -l +- $CC --version +- make -j 2 +- if [ $TEST -eq 1 ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi +- if [ $SDE -eq 1 ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi diff --git a/config/excavator/make_defs.mk b/config/excavator/make_defs.mk index cebaa30df..12d5add0c 100644 --- a/config/excavator/make_defs.mk +++ b/config/excavator/make_defs.mk @@ -63,10 +63,10 @@ endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver4 +CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver4 else ifeq ($(CC_VENDOR),clang) -CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver4 +CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver4 else $(error gcc or clang are required for this configuration.) endif diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index 2e58143ec..395b8b9b5 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -63,10 +63,10 @@ endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 +CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver2 else ifeq ($(CC_VENDOR),clang) -CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 +CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver2 else $(error gcc or clang are required for this configuration.) endif diff --git a/config/steamroller/make_defs.mk b/config/steamroller/make_defs.mk index cbd9064cc..adb6ebe2e 100644 --- a/config/steamroller/make_defs.mk +++ b/config/steamroller/make_defs.mk @@ -63,10 +63,10 @@ endif # Flags specific to optimized kernels. 
CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 +CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver3 else ifeq ($(CC_VENDOR),clang) -CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 +CKVECFLAGS := -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver3 else $(error gcc or clang are required for this configuration.) endif diff --git a/travis/cpuid/excavator.def b/travis/cpuid/excavator.def new file mode 100644 index 000000000..2479cdd44 --- /dev/null +++ b/travis/cpuid/excavator.def @@ -0,0 +1,78 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# CPU: AMD A12-8870, 4000 MHz +# +00000000 ******** => 0000000D 68747541 444D4163 69746E65 +00000001 ******** => 00660F51 00040800 7ED8320B 178BFBFF +00000002 ******** => 00000000 00000000 00000000 00000000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000005 ******** => 00000040 00000040 00000003 00000000 +00000006 ******** => 00000004 00000000 00000001 00000000 +00000007 ******** => 00000000 000001A9 00000000 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 +0000000A ******** => 00000000 00000000 00000000 00000000 +0000000C ******** => 00000000 00000000 00000000 00000000 +0000000D 00000000 => 00000007 00000340 000003C0 40000000 +0000000D 00000001 => 00000001 00000000 00000000 00000000 +0000000D 00000002 => 00000100 00000240 00000000 00000000 +0000000D 0000003E => 00000080 00000340 00000000 00000000 +80000000 ******** => 8000001E 68747541 444D4163 69746E65 +80000001 ******** => 00660F51 20000000 2FABBFFF 2FD3FBFF +80000002 ******** => 20444D41 204F5250 2D323141 30373838 +80000003 ******** => 2C375220 20323120 504D4F43 20455455 +80000004 ******** => 45524F43 43342053 2047382B 00202020 +80000005 ******** => FF40FF18 FF40FF30 20080140 60030140 +80000006 ******** => 64006400 64004200 04008140 00000000 +80000007 ******** => 00000000 00000005 00000400 000037D9 +80000008 ******** => 00003030 00000000 00004003 00000000 +80000009 ******** => 00000000 
00000000 00000000 00000000 +8000000A ******** => 00000001 00008000 00000000 0001BCFF +8000000B ******** => 00000000 00000000 00000000 00000000 +8000000C ******** => 00000000 00000000 00000000 00000000 +8000000D ******** => 00000000 00000000 00000000 00000000 +8000000E ******** => 00000000 00000000 00000000 00000000 +8000000F ******** => 00000000 00000000 00000000 00000000 +80000010 ******** => 00000000 00000000 00000000 00000000 +80000011 ******** => 00000000 00000000 00000000 00000000 +80000012 ******** => 00000000 00000000 00000000 00000000 +80000013 ******** => 00000000 00000000 00000000 00000000 +80000014 ******** => 00000000 00000000 00000000 00000000 +80000015 ******** => 00000000 00000000 00000000 00000000 +80000016 ******** => 00000000 00000000 00000000 00000000 +80000017 ******** => 00000000 00000000 00000000 00000000 +80000018 ******** => 00000000 00000000 00000000 00000000 +80000019 ******** => F040F018 64006400 00000000 00000000 +8000001A ******** => 00000003 00000000 00000000 00000000 +8000001B ******** => 000007FF 00000000 00000000 00000000 +8000001C ******** => 00000000 80032013 00010200 E000000F +8000001E ******** => 00000010 00000100 00000000 00000000 diff --git a/travis/cpuid/haswell.def b/travis/cpuid/haswell.def new file mode 100644 index 000000000..7bbd0c1b5 --- /dev/null +++ b/travis/cpuid/haswell.def @@ -0,0 +1,65 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# CPU: Intel Xeon E5-2660 v3, 2600 MHz +# +00000000 ******** => 0000000F 756E6547 6C65746E 49656E69 +00000001 ******** => 000306F2 00200800 7FFEFBFF BFEBFBFF +00000002 ******** => 76036301 00F0B5FF 00000000 00C10000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000004 00000000 => 3C004121 01C0003F 0000003F 00000000 +00000004 00000001 => 3C004122 01C0003F 0000003F 00000000 +00000004 00000002 => 3C004143 01C0003F 000001FF 00000000 +00000004 00000003 => 3C07C163 04C0003F 00004FFF 00000006 +00000005 ******** => 00000040 00000040 00000003 00002120 +00000006 ******** => 00000075 00000002 00000009 00000000 +00000007 ******** => 00000000 000037AB 00000000 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000001 00000000 00000000 00000000 +0000000A ******** => 07300403 00000000 00000000 00000603 +0000000B 00000000 => 00000001 00000002 00000100 00000000 +0000000B 00000001 => 00000005 00000014 00000201 00000000 +0000000C ******** => 00000000 00000000 00000000 00000000 +0000000D 00000000 => 00000007 00000340 00000340 00000000 +0000000D 00000001 => 00000001 00000000 00000000 00000000 +0000000D 00000002 => 00000100 00000240 00000000 00000000 +0000000E ******** => 00000000 00000000 00000000 00000000 +0000000F 00000000 => 00000000 00000027 00000000 00000002 +0000000F 00000001 => 00000000 0000A000 00000027 00000001 +80000000 ******** => 80000008 00000000 00000000 00000000 +80000001 ******** => 00000000 00000000 00000021 2C100000 +80000002 ******** => 65746E49 2952286C 6F655820 2952286E +80000003 ******** => 55504320 2D354520 30363632 20337620 +80000004 ******** => 2E322040 48473036 0000007A 00000000 +80000005 ******** => 00000000 00000000 00000000 00000000 +80000006 ******** => 00000000 00000000 01006040 00000000 +80000007 ******** => 00000000 00000000 00000000 00000100 +80000008 ******** => 0000302E 00000000 00000000 00000000 diff --git a/travis/cpuid/penryn.def b/travis/cpuid/penryn.def new file mode 100644 index 
000000000..3f12be1af --- /dev/null +++ b/travis/cpuid/penryn.def @@ -0,0 +1,52 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# CPU: Intel Xeon X5550, 2666 MHz +# +00000000 ******** => 0000000B 756E6547 6C65746E 49656E69 +00000001 ******** => 000106A2 00100800 00BCE3BD BFEBFBFF +00000002 ******** => 55035A01 00F0B2E4 00000000 09CA212C +00000003 ******** => 00000000 00000000 00000000 00000000 +00000005 ******** => 00000040 00000040 00000003 00021120 +00000006 ******** => 00000003 00000002 00000001 00000000 +00000007 ******** => 00000000 00000000 00000000 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 +0000000A ******** => 07300403 00000000 00000000 00000603 +80000000 ******** => 80000008 00000000 00000000 00000000 +80000001 ******** => 00000000 00000000 00000001 28100000 +80000002 ******** => 756E6547 20656E69 65746E49 2952286C +80000003 ******** => 55504320 20202020 20202020 40202020 +80000004 ******** => 30303020 20402030 37362E32 007A4847 +80000005 ******** => 00000000 00000000 00000000 00000000 +80000006 ******** => 00000000 00000000 01006040 00000000 +80000007 ******** => 00000000 00000000 00000000 00000100 +80000008 ******** => 00003028 00000000 00000000 00000000 diff --git a/travis/cpuid/piledriver.def b/travis/cpuid/piledriver.def new file mode 100644 index 000000000..06c64b1c1 --- /dev/null +++ b/travis/cpuid/piledriver.def @@ -0,0 +1,82 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# CPU: AMD A10-6800K, 4300 MHz +# +00000000 ******** => 0000000D 68747541 444D4163 69746E65 +00000001 ******** => 00610F31 00040800 3E98320B 178BFBFF +00000002 ******** => 00000000 00000000 00000000 00000000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000005 ******** => 00000040 00000040 00000003 00000000 +00000006 ******** => 00000000 00000000 00000001 00000000 +00000007 ******** => 00000000 00000008 00000000 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 +0000000A ******** => 00000000 00000000 00000000 00000000 +0000000C ******** => 00000000 00000000 00000000 00000000 +0000000D 00000000 => 00000007 00000340 000003C0 40000000 +0000000D 00000001 => 00000000 00000000 00000000 00000000 +0000000D 00000002 => 00000100 00000240 00000000 00000000 +0000000D 0000003E => 00000080 00000340 00000000 00000000 +80000000 ******** => 8000001E 68747541 444D4163 69746E65 +80000001 ******** => 00610F31 20000000 01EBBFFF 2FD3FBFF +80000002 ******** => 20444D41 2D303141 30303836 5041204B +80000003 ******** => 69772055 52206874 6F656461 6D74286E +80000004 ******** => 44482029 61724720 63696870 00202073 +80000005 ******** => FF40FF18 FF40FF30 10040140 40020140 +80000006 ******** => 64006400 64004200 08008140 00000000 +80000007 ******** => 00000000 00000000 00000000 000007D9 +80000008 ******** => 00003030 00000000 00004003 00000000 +80000009 ******** => 00000000 00000000 00000000 00000000 +8000000A ******** => 00000001 00010000 00000000 00001CFF +8000000B ******** => 00000000 00000000 00000000 00000000 +8000000C ******** => 00000000 00000000 00000000 00000000 +8000000D ******** => 00000000 00000000 00000000 00000000 +8000000E ******** => 00000000 00000000 00000000 00000000 +8000000F ******** => 00000000 00000000 00000000 00000000 +80000010 ******** => 00000000 00000000 00000000 00000000 +80000011 ******** => 00000000 00000000 00000000 00000000 +80000012 ******** => 00000000 00000000 
00000000 00000000 +80000013 ******** => 00000000 00000000 00000000 00000000 +80000014 ******** => 00000000 00000000 00000000 00000000 +80000015 ******** => 00000000 00000000 00000000 00000000 +80000016 ******** => 00000000 00000000 00000000 00000000 +80000017 ******** => 00000000 00000000 00000000 00000000 +80000018 ******** => 00000000 00000000 00000000 00000000 +80000019 ******** => F040F018 64006400 00000000 00000000 +8000001A ******** => 00000003 00000000 00000000 00000000 +8000001B ******** => 000000FF 00000000 00000000 00000000 +8000001C ******** => 00000000 80032013 00010200 8000000F +8000001D 00000001 => 00000121 00C0003F 0000003F 00000000 +8000001D 00000002 => 00004122 0040003F 000001FF 00000000 +8000001D 00000003 => 00004143 03C0003F 000007FF 00000001 +8000001E ******** => 00000010 00000100 00000000 00000000 +8FFFFFFF ******** => 00000000 00000000 00000000 00000000 diff --git a/travis/cpuid/sandybridge.def b/travis/cpuid/sandybridge.def new file mode 100644 index 000000000..7faf93b9f --- /dev/null +++ b/travis/cpuid/sandybridge.def @@ -0,0 +1,55 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# CPU: Intel Xeon E3-1230 v2, 3700 MHz +# +00000000 ******** => 0000000D 756E6547 6C65746E 49656E69 +00000001 ******** => 000306A9 00100800 7FBAE3FF BFEBFBFF +00000002 ******** => 76035A01 00F0B2FF 00000000 00CA0000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000005 ******** => 00000040 00000040 00000003 00001120 +00000006 ******** => 00000077 00000002 00000009 00000000 +00000007 ******** => 00000000 00000281 00000000 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 +0000000A ******** => 07300403 00000000 00000000 00000603 +0000000C ******** => 00000000 00000000 00000000 00000000 +0000000D 00000000 => 00000007 00000240 00000340 00000000 +0000000D 00000001 => 00000100 00000240 00000000 00000000 +80000000 ******** => 80000008 00000000 00000000 00000000 +80000001 ******** => 00000000 00000000 00000001 28100000 +80000002 ******** => 20202020 6E492020 286C6574 58202952 +80000003 ******** => 286E6F65 43202952 45205550 32312D33 +80000004 ******** => 56203033 20402032 30332E33 007A4847 +80000005 ******** => 00000000 00000000 00000000 00000000 +80000006 ******** => 00000000 00000000 01006040 00000000 
+80000007 ******** => 00000000 00000000 00000000 00000100 +80000008 ******** => 00003024 00000000 00000000 00000000 diff --git a/travis/cpuid/skx.def b/travis/cpuid/skx.def new file mode 100644 index 000000000..adff98e3f --- /dev/null +++ b/travis/cpuid/skx.def @@ -0,0 +1,82 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# CPU: Intel Xeon Platinum 8180, 2500 MHz +# +00000000 ******** => 00000016 756E6547 6C65746E 49656E69 +00000001 ******** => 00050654 00400800 7FFEFBFF BFEBFBFF +00000002 ******** => 76036301 00F0B5FF 00000000 00C30000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000004 00000000 => 7C004121 01C0003F 0000003F 00000000 +00000004 00000001 => 7C004122 01C0003F 0000003F 00000000 +00000004 00000002 => 7C004143 03C0003F 000003FF 00000000 +00000004 00000003 => 7C0FC163 0280003F 0000DFFF 00000004 +00000005 ******** => 00000040 00000040 00000003 00002020 +00000006 ******** => 00000EF7 00000002 00000009 00000000 +00000007 ******** => 00000000 D39FFFFB 00000008 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 +0000000A ******** => 07300404 00000000 00000000 00000603 +0000000B 00000000 => 00000001 00000002 00000100 00000000 +0000000B 00000001 => 00000006 00000038 00000201 00000000 +0000000C ******** => 00000000 00000000 00000000 00000000 +0000000D 00000000 => 000002FF 00000A80 00000A88 00000000 +0000000D 00000001 => 0000000F 00000A00 00000100 00000000 +0000000D 00000002 => 00000100 00000240 00000000 00000000 +0000000D 00000003 => 00000040 000003C0 00000000 00000000 +0000000D 00000004 => 00000040 00000400 00000000 00000000 +0000000D 00000005 => 00000040 00000440 00000000 00000000 +0000000D 00000006 => 00000200 00000480 00000000 00000000 +0000000D 00000007 => 00000400 00000680 00000000 00000000 +0000000D 00000008 => 00000080 00000000 00000001 00000000 +0000000D 00000009 => 00000008 00000A80 00000000 00000000 +0000000E ******** => 00000000 00000000 00000000 00000000 +0000000F 00000000 => 00000000 000000DF 00000000 00000002 +0000000F 00000001 => 00000000 0001C000 000000DF 00000007 +00000010 00000000 => 00000000 0000000A 00000000 00000000 +00000010 00000001 => 0000000A 00000600 00000004 0000000F +00000011 ******** => 00000000 00000000 00000000 00000000 +00000012 00000000 => 00000000 
00000000 00000000 00000000 +00000012 00000001 => 00000000 00000000 00000000 00000000 +00000013 ******** => 00000000 00000000 00000000 00000000 +00000014 00000000 => 00000001 0000000F 00000007 00000000 +00000014 00000001 => 02490002 003F3FFF 00000000 00000000 +00000015 ******** => 00000002 000000C8 00000000 00000000 +00000016 ******** => 000009C4 00000ED8 00000064 00000000 +80000000 ******** => 80000008 00000000 00000000 00000000 +80000001 ******** => 00000000 00000000 00000121 2C100000 +80000002 ******** => 65746E49 2952286C 6F655820 2952286E +80000003 ******** => 616C5020 756E6974 3138206D 43203038 +80000004 ******** => 40205550 352E3220 7A484730 00000000 +80000005 ******** => 00000000 00000000 00000000 00000000 +80000006 ******** => 00000000 00000000 01006040 00000000 +80000007 ******** => 00000000 00000000 00000000 00000100 +80000008 ******** => 0000302E 00000000 00000000 00000000 diff --git a/travis/cpuid/skx1.def b/travis/cpuid/skx1.def new file mode 100644 index 000000000..d28b69432 --- /dev/null +++ b/travis/cpuid/skx1.def @@ -0,0 +1,82 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# CPU: Intel Xeon Bronze 3106 +# +00000000 ******** => 00000016 756E6547 6C65746E 49656E69 +00000001 ******** => 00050654 00100800 7FFEFBFF BFEBFBFF +00000002 ******** => 76036301 00F0B6FF 00000000 00C30000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000004 00000000 => 1C004121 01C0003F 0000003F 00000000 +00000004 00000001 => 1C004122 01C0003F 0000003F 00000000 +00000004 00000002 => 1C004143 03C0003F 000003FF 00000000 +00000004 00000003 => 1C03C163 0280003F 00002FFF 00000004 +00000005 ******** => 00000040 00000040 00000003 00002020 +00000006 ******** => 00000EF5 00000002 00000009 00000000 +00000007 ******** => 00000000 D39FFFFB 00000008 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 +0000000A ******** => 07300804 00000000 00000000 00000603 +0000000B 00000000 => 00000001 00000001 00000100 00000000 +0000000B 00000001 => 00000004 00000006 00000201 00000000 +0000000C ******** => 00000000 00000000 00000000 00000000 +0000000D 00000000 => 000002FF 00000340 00000A88 00000000 +0000000D 00000001 => 0000000F 00000340 00000100 00000000 +0000000D 00000002 => 00000100 00000240 00000000 00000000 +0000000D 
00000003 => 00000040 000003C0 00000000 00000000 +0000000D 00000004 => 00000040 00000400 00000000 00000000 +0000000D 00000005 => 00000040 00000440 00000000 00000000 +0000000D 00000006 => 00000200 00000480 00000000 00000000 +0000000D 00000007 => 00000400 00000680 00000000 00000000 +0000000D 00000008 => 00000080 00000000 00000001 00000000 +0000000D 00000009 => 00000008 00000A80 00000000 00000000 +0000000E ******** => 00000000 00000000 00000000 00000000 +0000000F 00000000 => 00000000 0000002F 00000000 00000002 +0000000F 00000001 => 00000000 00006000 0000002F 00000007 +00000010 00000000 => 00000000 0000000A 00000000 00000000 +00000010 00000001 => 0000000A 00000600 00000004 0000000F +00000011 ******** => 00000000 00000000 00000000 00000000 +00000012 00000000 => 00000000 00000000 00000000 00000000 +00000012 00000001 => 00000000 00000000 00000000 00000000 +00000013 ******** => 00000000 00000000 00000000 00000000 +00000014 00000000 => 00000001 0000000F 00000007 00000000 +00000014 00000001 => 02490002 003F3FFF 00000000 00000000 +00000015 ******** => 00000002 00000088 00000000 00000000 +00000016 ******** => 000006A4 000006A4 00000064 00000000 +80000000 ******** => 80000008 00000000 00000000 00000000 +80000001 ******** => 00000000 00000000 00000121 2C100000 +80000002 ******** => 65746E49 2952286C 6F655820 2952286E +80000003 ******** => 6F724220 20657A6E 34303133 55504320 +80000004 ******** => 31204020 4730372E 00007A48 00000000 +80000005 ******** => 00000000 00000000 00000000 00000000 +80000006 ******** => 00000000 00000000 01006040 00000000 +80000007 ******** => 00000000 00000000 00000000 00000100 +80000008 ******** => 0000302E 00000000 00000000 00000000 diff --git a/travis/cpuid/steamroller.def b/travis/cpuid/steamroller.def new file mode 100644 index 000000000..c56d6104a --- /dev/null +++ b/travis/cpuid/steamroller.def @@ -0,0 +1,80 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. 
+# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# CPU: AMD A10-7850K, 4000 MHz +# +00000000 ******** => 0000000D 68747541 444D4163 69746E65 +00000001 ******** => 00630F01 00040800 3E98320B 178BFBFF +00000002 ******** => 00000000 00000000 00000000 00000000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000005 ******** => 00000040 00000040 00000003 00000000 +00000006 ******** => 00000000 00000000 00000001 00000000 +00000007 ******** => 00000000 00000009 00000000 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 +0000000A ******** => 00000000 00000000 00000000 00000000 +0000000C ******** => 00000000 00000000 00000000 00000000 +0000000D 00000000 => 00000007 00000340 000003C0 40000000 +0000000D 00000001 => 00000100 00000240 00000000 00000000 +80000000 ******** => 8000001E 68747541 444D4163 69746E65 +80000001 ******** => 00630F01 10000000 0FEBBFFF 2FD3FBFF +80000002 ******** => 20444D41 2D303141 30353837 5041204B +80000003 ******** => 69772055 52206874 6F656461 4D54286E +80000004 ******** => 37522029 61724720 63696870 00202073 +80000005 ******** => FF40FF18 FF40FF30 10040140 60030140 +80000006 ******** => 64006400 64004200 08008140 00000000 +80000007 ******** => 00000000 00000001 00000000 000027D9 +80000008 ******** => 00003030 00000000 00004003 00000000 +80000009 ******** => 00000000 00000000 00000000 00000000 +8000000A ******** => 00000001 00010000 00000000 00001CFF +8000000B ******** => 00000000 00000000 00000000 00000000 +8000000C ******** => 00000000 00000000 00000000 00000000 +8000000D ******** => 00000000 00000000 00000000 00000000 +8000000E ******** => 00000000 00000000 00000000 00000000 +8000000F ******** => 00000000 00000000 00000000 00000000 +80000010 ******** => 00000000 00000000 00000000 00000000 +80000011 ******** => 00000000 00000000 00000000 00000000 +80000012 ******** => 00000000 00000000 00000000 00000000 +80000013 ******** => 00000000 00000000 00000000 00000000 +80000014 ******** => 00000000 00000000 
00000000 00000000 +80000015 ******** => 00000000 00000000 00000000 00000000 +80000016 ******** => 00000000 00000000 00000000 00000000 +80000017 ******** => 00000000 00000000 00000000 00000000 +80000018 ******** => 00000000 00000000 00000000 00000000 +80000019 ******** => F040F018 64006400 00000000 00000000 +8000001A ******** => 00000003 00000000 00000000 00000000 +8000001B ******** => 000001FF 00000000 00000000 00000000 +8000001C ******** => 00000000 80032013 00010200 E000000F +8000001D 00000000 => 00000121 00C0003F 0000003F 00000000 +8000001D 00000001 => 00004122 0080003F 000001FF 00000000 +8000001D 00000002 => 00004143 03C0003F 000007FF 00000001 +8000001E ******** => 00000010 00000100 00000000 00000000 +8FFFFFFF ******** => 00000000 00000000 00000000 00000000 diff --git a/travis/cpuid/zen.def b/travis/cpuid/zen.def new file mode 100644 index 000000000..2c20714fb --- /dev/null +++ b/travis/cpuid/zen.def @@ -0,0 +1,83 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# CPU: AMD EPYC 7551P, 3000 MHz +# +00000000 ******** => 0000000D 68747541 444D4163 69746E65 +00000001 ******** => 00800F12 00400800 7ED8320B 178BFBFF +00000002 ******** => 00000000 00000000 00000000 00000000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000005 ******** => 00000040 00000040 00000003 00000011 +00000006 ******** => 00000004 00000000 00000001 00000000 +00000007 ******** => 00000000 209C01A9 00000000 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 +0000000A ******** => 00000000 00000000 00000000 00000000 +0000000C ******** => 00000000 00000000 00000000 00000000 +0000000D 00000000 => 00000007 00000340 00000340 00000000 +0000000D 00000001 => 0000000F 00000340 00000000 00000000 +0000000D 00000002 => 00000100 00000240 00000000 00000000 +80000000 ******** => 8000001F 68747541 444D4163 69746E65 +80000001 ******** => 00800F12 40000000 35C233FF 2FD3FBFF +80000002 ******** => 20444D41 43595045 35353720 33205031 +80000003 ******** => 6F432D32 50206572 65636F72 726F7373 +80000004 ******** => 20202020 20202020 20202020 00202020 +80000005 ******** => FF40FF40 FF40FF40 20080140 40040140 
+80000006 ******** => 36006400 56006400 02006140 0200C140 +80000007 ******** => 00000000 0000001B 00000000 00006799 +80000008 ******** => 00003030 00000007 0000603F 00000000 +80000009 ******** => 00000000 00000000 00000000 00000000 +8000000A ******** => 00000001 00008000 00000000 0001BCFF +8000000B ******** => 00000000 00000000 00000000 00000000 +8000000C ******** => 00000000 00000000 00000000 00000000 +8000000D ******** => 00000000 00000000 00000000 00000000 +8000000E ******** => 00000000 00000000 00000000 00000000 +8000000F ******** => 00000000 00000000 00000000 00000000 +80000010 ******** => 00000000 00000000 00000000 00000000 +80000011 ******** => 00000000 00000000 00000000 00000000 +80000012 ******** => 00000000 00000000 00000000 00000000 +80000013 ******** => 00000000 00000000 00000000 00000000 +80000014 ******** => 00000000 00000000 00000000 00000000 +80000015 ******** => 00000000 00000000 00000000 00000000 +80000016 ******** => 00000000 00000000 00000000 00000000 +80000017 ******** => 00000000 00000000 00000000 00000000 +80000018 ******** => 00000000 00000000 00000000 00000000 +80000019 ******** => F040F040 00000000 00000000 00000000 +8000001A ******** => 00000003 00000000 00000000 00000000 +8000001B ******** => 000003FF 00000000 00000000 00000000 +8000001C ******** => 00000000 00000000 00000000 00000000 +8000001D 00000000 => 00004121 01C0003F 0000003F 00000000 +8000001D 00000001 => 00004122 00C0003F 000000FF 00000000 +8000001D 00000002 => 00004143 01C0003F 000003FF 00000002 +8000001D 00000003 => 0001C163 03C0003F 00001FFF 00000001 +8000001E ******** => 00000000 00000100 00000300 00000000 +8000001F ******** => 0000000F 0000016F 0000000F 00000001 +8FFFFFFF ******** => 00000000 00000000 00000000 00000000 diff --git a/travis/do_sde.sh b/travis/do_sde.sh new file mode 100755 index 000000000..3c13d6be1 --- /dev/null +++ b/travis/do_sde.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +set -e +set -x + +SDE_VERSION=sde-external-8.16.0-2018-01-30-lin 
+SDE_TARBALL=$SDE_VERSION.tar.bz2 +SDE=$SDE_VERSION/sde64 + +set +x +curl -s -X POST https://content.dropboxapi.com/2/files/download -H "Authorization: Bearer $DROPBOX_TOKEN" -H "Dropbox-API-Arg: {\"path\": \"/$SDE_TARBALL\"}" > $SDE_TARBALL +set -x +tar xvf $SDE_TARBALL + +make -j2 testsuite-bin +cp $DIST_PATH/testsuite/input.general.fast input.general +cp $DIST_PATH/testsuite/input.operations.fast input.operations + +TMP=`ldd ./test_libblis.x | grep ld | sed 's/^.*=> //'` +LD_SO=${TMP%% *} +TMP=`ldd ./test_libblis.x | grep libc | sed 's/^.*=> //'` +LIBC_SO=${TMP%% *} +TMP=`ldd ./test_libblis.x | grep libm | sed 's/^.*=> //'` +LIBM_SO=${TMP%% *} +for LIB in $LD_SO $LIBC_SO $LIBM_SO; do + $DIST_PATH/travis/patch-ld-so.py $LIB .tmp + chmod a+x .tmp + sudo mv .tmp $LIB +done + +for ARCH in penryn sandybridge haswell skx knl piledriver steamroller excavator zen; do + if [ "$ARCH" = "knl" ]; then + $SDE -knl -- ./test_libblis.x > output.testsuite + else + $SDE -cpuid_in $DIST_PATH/travis/cpuid/$ARCH.def -- ./test_libblis.x > output.testsuite + fi + $DIST_PATH/build/check-blistest.sh ./output.testsuite + TMP=`grep "active sub-configuration" output.testsuite` + CONFIG=${TMP##* } + if [ "$CONFIG" != "$ARCH" ]; then + echo "Wrong configuration chosen:" + echo " Expected: $ARCH" + echo " Got: $CONFIG" + exit 1 + fi +done + diff --git a/travis/do_testsuite.sh b/travis/do_testsuite.sh new file mode 100755 index 000000000..0fa8341ec --- /dev/null +++ b/travis/do_testsuite.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -e +set -x + +export BLIS_IC_NT=2 +export BLIS_JC_NT=1 +export BLIS_IR_NT=1 +export BLIS_JR_NT=1 + +make testblis +$DIST_PATH/build/check-blistest.sh ./output.testsuite +make testblas +$DIST_PATH/build/check-blastest.sh + diff --git a/travis/patch-ld-so.py b/travis/patch-ld-so.py new file mode 100755 index 000000000..72e580d74 --- /dev/null +++ b/travis/patch-ld-so.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python + +# +# Patch ld.so to disable runtime CPUID detection +# Taken 
from https://stackoverflow.com/a/44483482 +# + +import re +import sys + +infile, outfile = sys.argv[1:] +d = open(infile, 'rb').read() +# Match CPUID(eax=0), "xor eax,eax" followed closely by "cpuid" +o = re.sub(b'(\x31\xc0.{0,32})\x0f\xa2', b'\\1\x66\x90', d) +#assert d != o +open(outfile, 'wb').write(o) From 34f974d1a83a7d29ba09f67e392d361231fdf99c Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 29 May 2018 17:11:52 -0500 Subject: [PATCH 04/37] More tweaks/updates to sandbox/ref99/README.md. --- sandbox/ref99/README.md | 73 ++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/sandbox/ref99/README.md b/sandbox/ref99/README.md index 63ff433c0..5449fe1e1 100644 --- a/sandbox/ref99/README.md +++ b/sandbox/ref99/README.md @@ -12,9 +12,9 @@ when you think of implementing the gemm operation: a series of loops around an optimized (usually assembly-based) microkernel with some packing functions thrown in at various levels.) -Why sandboxes? Sometimes, you just want to experiment with tweaks or changes -to the gemm operation, but you want to do so in a simple environment rather -than the somewhat obfuscated and highly macroized and refactored code of the +Why sandboxes? Sometimes you want to experiment with tweaks or changes to +the gemm operation, but you want to do so in a simple environment rather than +the highly macroized and refactored (and somewhat obfuscated) code of the core framework (which, I will remind everyone, is highly macroized and refactored mostly so that all floating-point datatypes and all level-3 operations are supported with minimal source code). By building a BLIS sandbox, @@ -56,16 +56,16 @@ implementation. Like any decent sandbox, there are rules for playing here. Please follow these guidelines for the best sandbox developer experience. -0. Don't bother worrying about makefiles. We've already taken care of the +1. Don't bother worrying about makefiles. 
We've already taken care of the boring/annoying/headache-inducing build system stuff for you. :) By configuring -BLIS with a sandbox enabled, `make` will scan your directory and compile all -of its source code using similar compilation rules as were used for the rest +BLIS with a sandbox enabled, `make` will scan your sandbox directory and compile +all of its source code using similar compilation rules as were used for the rest of the framework. In addition, the compilation command line will automatically contain one `-I` option for every subdirectory in your sandbox, so it doesn't matter where in your sandbox you place your header files. They will be found! -1. Your sandbox must be written in C99 or C++11. If you write your sandbox in +2. Your sandbox must be written in C99 or C++11. If you write your sandbox in C++11, you must use one of the BLIS-approved file extensions for your source files (`.cc`, `.cpp`, `.cxx`) and your header files (`.hh`, `.hpp`, `.hxx`). Note that `blis.h` @@ -73,13 +73,13 @@ already contains all of its definitions inside of an `extern "C"` block, so you should be able to `#include "blis.h"` from your C++11 source code without any issues. -2. All of your code to replace BLIS's default implementation of `bli_gemmnat()` +3. All of your code to replace BLIS's default implementation of `bli_gemmnat()` should reside in the named sandbox directory, or some directory therein. (Obviously.) For example, this `README.md` file is located in the `ref99` sandbox, located in `sandbox/ref99`. All of the code associated with this sandbox will be contained within `sandbox/ref99`. -3. The *only* header file that is required of your sandbox is `bli_sandbox.h`. +4. The *only* header file that is required of your sandbox is `bli_sandbox.h`. It must be named `bli_sandbox.h` because `blis.h` will `#include` this file when the sandbox is enabled at configure-time. That said, you will probably want to keep the file empty. 
Why require a file that is supposed to be empty? @@ -93,7 +93,7 @@ Usually, neither of these situations will require any of your local definitions since those definitions are only needed to define your sandbox implementation of `bli_gemmnat()`, and this function is already prototyped by BLIS. -4. Your definition of `bli_gemmnat()` should be the *only* function you define +5. Your definition of `bli_gemmnat()` should be the *only* function you define in your sandbox that begins with `bli_`. If you define other functions that begin with `bli_`, you risk a namespace collision with existing framework functions. To guarantee safety, please prefix your locally-defined sandbox @@ -113,32 +113,51 @@ working with the existing BLIS infrastructure. For example, with a BLIS sandbox you **can** do the following kinds of things: - use a different gemm algorithmic partitioning path than the default Goto-like algorithm; -- experiment with different implementations of `packm` kernels; +- experiment with different implementations of `packm` (not just `packm` +kernels, which can already be customized within each sub-configuration); - try inlining your functions manually; - pivot away from using `obj_t` objects at higher algorithmic level (such as immediately after calling `bli_gemmnat()`) to try to avoid some overhead; +- create experimental implementations of new BLAS-like operations (provided +that you also provide an implementation of `bli_gemmnat()`). -You **cannot**, however, do the following kinds of things: +You **cannot**, however, use a sandbox to do the following kinds of things: - define new datatypes (half-precision, quad-precision, short integer, etc.) and expect the rest of BLIS to "know" how to handle them; -- use a sandbox to implement a different level-3 operation, such as Hermitian -rank-k update; -- define a new BLAS-like operation. 
+- use a sandbox to replace the default implementation of a different level-3 + operation, such as Hermitian rank-k update; +- change the existing BLIS APIs. Another important limitation is the fact that the build system currently uses "framework `CFLAGS`" when compiling the sandbox source files. These are the same -`CFLAGS` used when compiling general framework source code, which are likely -more general-purpose than the `CFLAGS` used for, say, optimized kernels or even -reference kernels. (To see precisely which flags are being employed for any -given file, enable verbosity at compile-time via `make V=1`.) Compiling -sandboxes with these more versatile `CFLAGS` compiler options means that we -only need to compile one instance of each sandbox source file, even when -targeting multiple configurations (for example, via `./configure x86_64`). -However, it also means that sandboxes are not ideal for microkernels, as they -usually need additional compiler flags not included in the set used for -framework `CFLAGS` in order to yield the highest performance. If you have a -new microkernel you would like to use within a sandbox, it's best to formally -register it along with a new configuration, which will allow you to specify +`CFLAGS` used when compiling general framework source code, +``` +# Example framework CFLAGS used by 'haswell' sub-configuration +-O3 -Wall -Wno-unused-function -Wfatal-errors -fPIC -std=c99 +-D_POSIX_C_SOURCE=200112L -I./include/haswell -I./frame/3/ +-I./frame/ind/ukernels/ -I./frame/1m/ -I./frame/1f/ -I./frame/1/ +-I./frame/include -DBLIS_VERSION_STRING=\"0.3.2-51\" +``` +which are likely more general-purpose than the `CFLAGS` used for, say, +optimized kernels or even reference kernels. 
+``` +# Example optimized kernel CFLAGS used by 'haswell' sub-configuration +-O3 -mavx2 -mfma -mfpmath=sse -march=core-avx2 -Wall -Wno-unused-function +-Wfatal-errors -fPIC -std=c99 -D_POSIX_C_SOURCE=200112L -I./include/haswell +-I./frame/3/ -I./frame/ind/ukernels/ -I./frame/1m/ -I./frame/1f/ -I./frame/1/ +-I./frame/include -DBLIS_VERSION_STRING=\"0.3.2-51\" +``` +(To see precisely which flags are being employed for any given file, enable +verbosity at compile-time via `make V=1`.) Compiling sandboxes with these more +versatile `CFLAGS` compiler options means that we only need to compile one +instance of each sandbox source file, even when targeting multiple +configurations (for example, via `./configure x86_64`). However, it also means +that sandboxes are not ideal for microkernels, as they sometimes need additional +compiler flags not included in the set used for framework `CFLAGS` in order to +yield the highest performance. If you have a new microkernel you would like to +use within a sandbox, you can always prototype it within a sandbox. However, +once it is stable and ready for use by others, it's best to formally register +the kernel(s) along with a new configuration, which will allow you to specify kernel-specific compiler flags to be used when compiling your microkernel. Please see the [Configuration wiki](https://github.com/flame/blis/wiki/ConfigurationHowTo) From 9588625c43c86ef1bde8140f620a30f52420e6a6 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 30 May 2018 15:19:53 -0500 Subject: [PATCH 05/37] Renamed "next micropanel" macros in _l3_thrinfo.h. Details: - Renamed several macros defined in bli_l3_thrinfo.h designed to compute the values of a_next and b_next to insert into an auxinfo_t struct in level-3 macrokernels. (Previously, the macros did not use a bli_ prefix.) - Updated instances of above macro usage within various macrokernels. 
--- frame/3/bli_l3_thrinfo.h | 18 +++++++++--------- frame/3/gemm/bli_gemm_ker_var2.c | 4 ++-- frame/3/gemm/ind/bli_gemm4mb_ker_var2.c | 4 ++-- frame/3/gemm/ind/old/bli_gemm3m2_ker_var2.c | 4 ++-- frame/3/herk/bli_herk_l_ker_var2.c | 4 ++-- frame/3/herk/bli_herk_u_ker_var2.c | 4 ++-- frame/3/trmm/bli_trmm_ll_ker_var2.c | 6 +++--- frame/3/trmm/bli_trmm_lu_ker_var2.c | 6 +++--- frame/3/trmm/bli_trmm_rl_ker_var2.c | 8 ++++---- frame/3/trmm/bli_trmm_ru_ker_var2.c | 8 ++++---- frame/3/trsm/bli_trsm_ll_ker_var2.c | 2 +- frame/3/trsm/bli_trsm_lu_ker_var2.c | 2 +- frame/3/trsm/bli_trsm_rl_ker_var2.c | 4 ++-- frame/3/trsm/bli_trsm_ru_ker_var2.c | 4 ++-- 14 files changed, 39 insertions(+), 39 deletions(-) diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index fcf1f507d..58733bcf5 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -38,24 +38,24 @@ // gemm -#define gemm_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way ) -#define gemm_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way ) +#define bli_gemm_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way ) +#define bli_gemm_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way ) // herk -#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way ) -#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way ) +#define bli_herk_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way ) +#define bli_herk_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way ) // trmm -#define trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id 
% thread->n_way ) +#define bli_trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define bli_trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define bli_trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define bli_trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm -#define trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +#define bli_trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 3f54ef031..2e05deaad 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -295,11 +295,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \ + a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \ + b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c index c57144392..7cb809904 100644 --- a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c +++ b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c @@ -291,11 +291,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \ + a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \ + b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/gemm/ind/old/bli_gemm3m2_ker_var2.c b/frame/3/gemm/ind/old/bli_gemm3m2_ker_var2.c index 9ecb4cf5a..d711fd946 100644 --- a/frame/3/gemm/ind/old/bli_gemm3m2_ker_var2.c +++ b/frame/3/gemm/ind/old/bli_gemm3m2_ker_var2.c @@ -300,11 +300,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \ + a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \ + b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index 05a0e300e..3ee105140 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -312,11 +312,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \ + a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \ + b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 51600d839..b58f600e5 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -312,11 +312,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \ + a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \ + b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 94dd233b0..8d5c4d0f0 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -322,7 +322,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ - if ( trmm_l_jr_my_iter( j, jr_thread ) ) { \ + if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \ \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -364,7 +364,7 @@ void PASTEMAC(ch,varname) \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 
1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ - if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ \ b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ \ @@ -434,7 +434,7 @@ void PASTEMAC(ch,varname) \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ - if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ \ ctype* restrict a2; \ \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 924e75d4f..e54d7d582 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -329,7 +329,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ - if ( trmm_l_jr_my_iter( j, jr_thread ) ) { \ + if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \ \ ctype* restrict a1; \ ctype* restrict c11; \ @@ -371,7 +371,7 @@ void PASTEMAC(ch,varname) \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ - if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ \ b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ \ @@ -441,7 +441,7 @@ void PASTEMAC(ch,varname) \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ - if ( trmm_l_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \ \ ctype* restrict a2; \ \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 26ef9a13e..919eab1a3 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -361,7 +361,7 @@ void PASTEMAC(ch,varname) \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 
1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ - if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \ + if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ @@ -370,7 +370,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ @@ -446,7 +446,7 @@ void PASTEMAC(ch,varname) \ } \ else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ { \ - if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \ + if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ @@ -455,7 +455,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ \ ctype* restrict a2; \ \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 4c9af0757..bc4907772 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -361,7 +361,7 @@ void PASTEMAC(ch,varname) \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ - if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \ + if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ @@ -370,7 +370,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). 
*/ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ @@ -446,7 +446,7 @@ void PASTEMAC(ch,varname) \ } \ else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ { \ - if ( trmm_r_jr_my_iter( j, jr_thread ) ) { \ + if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ @@ -455,7 +455,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \ + if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \ \ ctype* restrict a2; \ \ diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 17c4b6d7a..bf7f647de 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -340,7 +340,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ - if( trsm_my_iter( j, thread ) ) { \ + if( bli_trsm_my_iter( j, thread ) ) { \ \ ctype* restrict a1; \ ctype* restrict c11; \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index f12bbb194..4b3c4c4b3 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -348,7 +348,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ - if( trsm_my_iter( j, thread ) ) { \ + if( bli_trsm_my_iter( j, thread ) ) { \ \ ctype* restrict a1; \ ctype* restrict c11; \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index be7904936..7f4b93bd3 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -422,7 +422,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). 
*/ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if( trsm_my_iter( i, thread ) ){ \ + if( bli_trsm_my_iter( i, thread ) ){ \ \ ctype* restrict a11; \ ctype* restrict a12; \ @@ -508,7 +508,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if( trsm_my_iter( i, thread ) ){ \ + if( bli_trsm_my_iter( i, thread ) ){ \ \ ctype* restrict a2; \ \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index e1f2694b0..d91b4b0d0 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -415,7 +415,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if( trsm_my_iter( i, thread ) ){ \ + if( bli_trsm_my_iter( i, thread ) ){ \ \ ctype* restrict a10; \ ctype* restrict a11; \ @@ -501,7 +501,7 @@ void PASTEMAC(ch,varname) \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ - if( trsm_my_iter( i, thread ) ){ \ + if( bli_trsm_my_iter( i, thread ) ){ \ \ ctype* restrict a2; \ \ From 8749fa0b48a7710f4115023e2c46bc80167bc8f9 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 31 May 2018 12:34:01 -0500 Subject: [PATCH 06/37] Cleanups to ref99/README.md, test/3m4m/Makefile. Details: - Minor edits to sandbox/ref99/README.md. - Removed cpp guards in sandbox/ref99/thread/blx_gemm_thread.h to be consistent with other headers in sandbox/ref99. - Additional targets and related cleanups in test/3m4m/Makefile. --- sandbox/ref99/README.md | 14 ++++--- sandbox/ref99/thread/blx_gemm_thread.h | 4 -- test/3m4m/Makefile | 56 +++++++++++++++++--------- 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/sandbox/ref99/README.md b/sandbox/ref99/README.md index 5449fe1e1..fd1f3ae75 100644 --- a/sandbox/ref99/README.md +++ b/sandbox/ref99/README.md @@ -112,21 +112,23 @@ working with the existing BLIS infrastructure. 
For example, with a BLIS sandbox you **can** do the following kinds of things: - use a different gemm algorithmic partitioning path than the default Goto-like -algorithm; + algorithm; - experiment with different implementations of `packm` (not just `packm` -kernels, which can already be customized within each sub-configuration); + kernels, which can already be customized within each sub-configuration); - try inlining your functions manually; - pivot away from using `obj_t` objects at higher algorithmic level (such as -immediately after calling `bli_gemmnat()`) to try to avoid some overhead; + immediately after calling `bli_gemmnat()`) to try to avoid some overhead; - create experimental implementations of new BLAS-like operations (provided -that you also provide an implementation of `blis_gemmnat()`). + that you also provide an implementation of `blis_gemmnat()`). You **cannot**, however, use a sandbox to do the following kinds of things: - define new datatypes (half-precision, quad-precision, short integer, etc.) -and expect the rest of BLIS to "know" how to handle them; + and expect the rest of BLIS to "know" how to handle them; - use a sandbox to replace the default implementation of a different level-3 operation, such as Hermitian rank-k update; -- change the existing BLIS APIs. +- change the existing BLIS APIs; +- remove support for one or more BLIS datatypes (to cut down on library size, + for example). Another important limitation is the fact that the build system currently uses "framework `CFLAGS`" when compiling the sandbox source files. 
These are the same diff --git a/sandbox/ref99/thread/blx_gemm_thread.h b/sandbox/ref99/thread/blx_gemm_thread.h index 265d53e1f..903f590f1 100644 --- a/sandbox/ref99/thread/blx_gemm_thread.h +++ b/sandbox/ref99/thread/blx_gemm_thread.h @@ -32,9 +32,6 @@ */ -#ifndef BLIS_GEMM_THREAD_H -#define BLIS_GEMM_THREAD_H - // gemm internal function type typedef void (*gemmint_t) ( @@ -57,4 +54,3 @@ void blx_gemm_thread cntl_t* cntl ); -#endif diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index e5aa84dfa..ca0ac7721 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -214,22 +214,45 @@ PDEF_MT := -DP_BEGIN=200 \ # --- Targets/rules ------------------------------------------------------------ # -all-st: blis-st openblas-st mkl-st -all-mt: blis-mt openblas-mt mkl-mt +all: all-st all-mt +blis: blis-st blis-mt +blis-nat: blis-nat-st blis-nat-mt +openblas: openblas-st openblas-mt +mkl: mkl-st mkl-mt -blis-st: blis-gemm-st -blis-mt: blis-gemm-mt +all-st: blis-st openblas-st mkl-st +all-mt: blis-mt openblas-mt mkl-mt -openblas-st: openblas-gemm-st -openblas-mt: openblas-gemm-mt +blis-st: blis-gemm-st +blis-mt: blis-gemm-mt -mkl-st: mkl-gemm-st -mkl-mt: mkl-gemm-mt +blis-nat-st: blis-gemm-nat-st +blis-nat-mt: blis-gemm-nat-mt -blis-gemm-st: \ +openblas-st: openblas-gemm-st +openblas-mt: openblas-gemm-mt + +mkl-st: mkl-gemm-st +mkl-mt: mkl-gemm-mt + +blis-gemm-st: blis-gemm-nat-st \ + blis-gemm-ind-st +blis-gemm-mt: blis-gemm-nat-mt \ + blis-gemm-ind-mt + +blis-gemm-nat-st: \ test_sgemm_asm_blis_st.x \ test_dgemm_asm_blis_st.x \ - \ + test_cgemm_asm_blis_st.x \ + test_zgemm_asm_blis_st.x + +blis-gemm-nat-mt: \ + test_sgemm_asm_blis_mt.x \ + test_dgemm_asm_blis_mt.x \ + test_cgemm_asm_blis_mt.x \ + test_zgemm_asm_blis_mt.x + +blis-gemm-ind-st: \ test_cgemm_3mhw_blis_st.x \ test_zgemm_3mhw_blis_st.x \ test_cgemm_3m1_blis_st.x \ @@ -241,14 +264,9 @@ blis-gemm-st: \ test_cgemm_4m1a_blis_st.x \ test_zgemm_4m1a_blis_st.x \ test_cgemm_1m_blis_st.x \ - test_zgemm_1m_blis_st.x \ 
- test_cgemm_asm_blis_st.x \ - test_zgemm_asm_blis_st.x + test_zgemm_1m_blis_st.x -blis-gemm-mt: \ - test_sgemm_asm_blis_mt.x \ - test_dgemm_asm_blis_mt.x \ - \ +blis-gemm-ind-mt: \ test_cgemm_3mhw_blis_mt.x \ test_zgemm_3mhw_blis_mt.x \ test_cgemm_3m1_blis_mt.x \ @@ -260,9 +278,7 @@ blis-gemm-mt: \ test_cgemm_4m1a_blis_mt.x \ test_zgemm_4m1a_blis_mt.x \ test_cgemm_1m_blis_mt.x \ - test_zgemm_1m_blis_mt.x \ - test_cgemm_asm_blis_mt.x \ - test_zgemm_asm_blis_mt.x + test_zgemm_1m_blis_mt.x openblas-gemm-st: \ test_sgemm_openblas_st.x \ From 965db85d29977d228ea744581edf2b682eb8e8a8 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 1 Jun 2018 12:32:15 -0500 Subject: [PATCH 07/37] Updated macro invocations in bli_gemm_ker_var2.c. Details: - Updated "get next a/b micropanel" macro invocations in bli_gemm_ker_var2.c according to changes in 9588625. - Comment update in bli_cntx.c. --- frame/base/bli_cntx.c | 2 +- sandbox/ref99/vars/blx_gemm_ker_var2.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index db2d73953..981b01c3e 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -988,7 +988,7 @@ void bli_cntx_set_thrloop_from_env ); } } - else // if ( l3_op == BLIS_TRSM ) + else // if ( l3_op == BLIS_GEMM || l3_op == BLIS_HERK ) { bli_cntx_set_thrloop ( diff --git a/sandbox/ref99/vars/blx_gemm_ker_var2.c b/sandbox/ref99/vars/blx_gemm_ker_var2.c index bfc248d12..1942f987c 100644 --- a/sandbox/ref99/vars/blx_gemm_ker_var2.c +++ b/sandbox/ref99/vars/blx_gemm_ker_var2.c @@ -283,11 +283,11 @@ void PASTECH2(blx_,ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \ + a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \ + b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ From f97a86f322a6e3e31f33c89befc66189b0b8c64f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 2 Jun 2018 20:28:20 -0500 Subject: [PATCH 08/37] Updated setting/querying pack schema (cntx->cntl). - Query pack schemas in level-3 bli_*_front() functions and store those values in the schema bitfields of the corresponding obj_t's when the cntx's method is not BLIS_NAT. (When method is BLIS_NAT, the default native schemas are stored to the obj_t's.) - In bli_l3_cntl_create_if(), query the schemas stored to the obj_t's in bli_*_front(), clear the schema bitfields, and pass the queried values into bli_gemm_cntl_create() and bli_trsm_cntl_create(). - Updated APIs for bli_gemm_cntl_create() and bli_trsm_cntl_create() to take schemas for A and B, and use these values to initialize the appropriate control tree nodes. (Also cpp-disabled the panel-block cntl tree creation variant, bli_gemmpb_cntl_create(), as it has not been employed by BLIS in quite some time.) - Simplified querying of schema in bli_packm_init() thanks to above changes. - Updated openmp and pthreads definitions of bli_l3_thread_decorator() so that thread-local aliases of matrix operands are guaranteed, even if aliasing is disabled within the internal back-end functions (e.g. bli_gemm_int.c). Also added a comment to bli_thrcomm_single.c explaining why the extra aliasing is not needed there. - Change bli_gemm() and level-3 friends so that the operation's ind() function is called only if all matrix operands have the same datatype, and only if that datatype is complex.
The former condition is needed in preparation for work related to mixed domain operands, while the latter helps with readability, especially for those who don't want to venture into frame/ind. - Reshuffled arguments in bli_cntx_set_thrloop_from_env() to be consistent with BLIS calling conventions (modified argument(s) are last), and updated all invocations in the level-3 _front() functions. - Comment updates to bli_cntx_set_thrloop_from_env(). --- frame/1m/packm/bli_packm_init.c | 53 +++++++++---- frame/3/bli_l3_cntl.c | 19 ++++- frame/3/bli_l3_oapi.c | 114 ++++++++++++++++++---------- frame/3/gemm/bli_gemm_cntl.c | 19 +++-- frame/3/gemm/bli_gemm_cntl.h | 12 ++- frame/3/gemm/bli_gemm_front.c | 32 +++++++- frame/3/hemm/bli_hemm_front.c | 32 +++++++- frame/3/her2k/bli_her2k_front.c | 36 ++++++++- frame/3/herk/bli_herk_front.c | 32 +++++++- frame/3/symm/bli_symm_front.c | 32 +++++++- frame/3/syr2k/bli_syr2k_front.c | 36 ++++++++- frame/3/syrk/bli_syrk_front.c | 32 +++++++- frame/3/trmm/bli_trmm_front.c | 32 +++++++- frame/3/trmm3/bli_trmm3_front.c | 32 +++++++- frame/3/trsm/bli_trsm_cntl.c | 24 +++--- frame/3/trsm/bli_trsm_cntl.h | 10 ++- frame/3/trsm/bli_trsm_front.c | 32 +++++++- frame/base/bli_cntx.c | 6 +- frame/base/bli_cntx.h | 15 ++-- frame/thread/bli_thrcomm_openmp.c | 17 +++-- frame/thread/bli_thrcomm_pthreads.c | 17 +++-- frame/thread/bli_thrcomm_single.c | 6 ++ 22 files changed, 503 insertions(+), 137 deletions(-) diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 404498d60..0437b722a 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -56,8 +56,8 @@ siz_t bli_packm_init bool_t does_invert_diag; bool_t rev_iter_if_upper; bool_t rev_iter_if_lower; - //pack_t pack_schema; - packbuf_t pack_buf_type; + pack_t schema; + //packbuf_t pack_buf_type; siz_t size_needed; // Check parameters. 
@@ -70,8 +70,8 @@ siz_t bli_packm_init does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl ); rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl ); rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl ); - //pack_schema = bli_cntl_packm_params_pack_schema( cntl ); - pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); + schema = bli_cntl_packm_params_pack_schema( cntl ); + //pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); #if 0 // Let us now check to see if the object has already been packed. First @@ -112,30 +112,51 @@ siz_t bli_packm_init return 0; } - // We now ignore the pack_schema field in the control tree and - // extract the schema from the context, depending on whether we are - // preparing to pack a block of A or panel of B. For A and B, we must - // obtain the schema from the context since the induced methods reuse - // the same control trees used by native execution, and those induced - // methods specify the schema used by the current execution phase - // within the context (whereas the control tree does not change). +#if 0 pack_t schema; - if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) + if ( bli_cntx_method( cntx ) != BLIS_NAT ) { - schema = bli_cntx_schema_a_block( cntx ); + // We now ignore the pack_schema field in the control tree and + // extract the schema from the context, depending on whether we are + // preparing to pack a block of A or panel of B. For A and B, we must + // obtain the schema from the context since the induced methods reuse + // the same control trees used by native execution, and those induced + // methods specify the schema used by the current execution phase + // within the context (whereas the control tree does not change). 
+ + if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) + { + schema = bli_cntx_schema_a_block( cntx ); + } + else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) + { + schema = bli_cntx_schema_b_panel( cntx ); + } + else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) + { + schema = bli_cntl_packm_params_pack_schema( cntl ); + } } - else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) + else // ( bli_cntx_method( cntx ) == BLIS_NAT ) { - schema = bli_cntx_schema_b_panel( cntx ); + // For native execution, we obtain the schema from the control tree + // node. (Notice that it doesn't matter if the pack_buf_type is for + // A or B.) + schema = bli_cntl_packm_params_pack_schema( cntl ); } - else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) + // This is no longer needed now that we branch between native and + // non-native cases above. +#if 0 + if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) { // If we get a request to pack C for some reason, it is likely // not part of an induced method, and so it would be safe (and // necessary) to read the pack schema from the control tree. schema = bli_cntl_packm_params_pack_schema( cntl ); } +#endif +#endif // Prepare a few other variables based on properties of the control // tree. diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index 0ea06715a..33c64edcb 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -45,6 +45,21 @@ void bli_l3_cntl_create_if cntl_t** cntl_use ) { + // This is part of a hack to support mixed domain in bli_gemm_front(). + // Sometimes we need to specify a non-standard schema for A and B, and + // we decided to transmit them via the schema field in the obj_t's + // rather than pass them in as function parameters. Once the values + // have been read, we immediately reset them back to their expected + // values for unpacked objects. 
Notice that we do this even if the + // caller passed in a custom control tree; that's because we still need + // to reset the pack schema of a and b, which were modified by the + // operation's _front() function. + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); + // If the control tree pointer is NULL, we construct a default // tree as a function of the operation family. if ( cntl_orig == NULL ) @@ -53,7 +68,7 @@ void bli_l3_cntl_create_if family == BLIS_HERK || family == BLIS_TRMM ) { - *cntl_use = bli_gemm_cntl_create( family ); + *cntl_use = bli_gemm_cntl_create( family, schema_a, schema_b ); } else // if ( family == BLIS_TRSM ) { @@ -62,7 +77,7 @@ void bli_l3_cntl_create_if if ( bli_obj_is_triangular( a ) ) side = BLIS_LEFT; else side = BLIS_RIGHT; - *cntl_use = bli_trsm_cntl_create( side ); + *cntl_use = bli_trsm_cntl_create( side, schema_a, schema_b ); } } else diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c index 5f4bc9932..94e563c24 100644 --- a/frame/3/bli_l3_oapi.c +++ b/frame/3/bli_l3_oapi.c @@ -57,20 +57,25 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_CNTX_DECL \ \ - /* Invoke the operation's "ind" function--its induced method front-end. - This function will call native execution for real domain problems. - For complex problems, it calls the highest priority induced method - that is available (ie: implemented and enabled), and if none are - enabled, it calls native execution. */ \ - PASTEMAC(opname,ind) \ - ( \ - alpha, \ - a, \ - b, \ - beta, \ - c, \ - cntx \ - ); \ + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). 
*/ \ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ + bli_obj_dt( b ) == bli_obj_dt( c ) && \ + bli_obj_is_complex( c ) ) \ + { \ + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) */ \ + PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx ); \ + } \ + else \ + { \ + PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx ); \ + } \ } GENFRONT( gemm ) @@ -96,16 +101,25 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_CNTX_DECL \ \ - PASTEMAC(opname,ind) \ - ( \ - side, \ - alpha, \ - a, \ - b, \ - beta, \ - c, \ - cntx \ - ); \ + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ \ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ + bli_obj_dt( b ) == bli_obj_dt( c ) && \ + bli_obj_is_complex( c ) ) \ + { \ + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) */ \ + PASTEMAC(opname,ind)( side, alpha, a, b, beta, c, cntx ); \ + } \ + else \ + { \ + PASTEMAC(opname,nat)( side, alpha, a, b, beta, c, cntx ); \ + } \ } GENFRONT( hemm ) @@ -129,14 +143,24 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_CNTX_DECL \ \ - PASTEMAC(opname,ind) \ - ( \ - alpha, \ - a, \ - beta, \ - c, \ - cntx \ - ); \ + /* Only proceed with an induced method if all operands have the same + (complex) datatype. 
If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ \ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ + bli_obj_is_complex( c ) ) \ + { \ + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) */ \ + PASTEMAC(opname,ind)( alpha, a, beta, c, cntx ); \ + } \ + else \ + { \ + PASTEMAC(opname,nat)( alpha, a, beta, c, cntx ); \ + } \ } GENFRONT( herk ) @@ -159,14 +183,24 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_CNTX_DECL \ \ - PASTEMAC(opname,ind) \ - ( \ - side, \ - alpha, \ - a, \ - b, \ - cntx \ - ); \ + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ \ + if ( bli_obj_dt( a ) == bli_obj_dt( b ) && \ + bli_obj_is_complex( b ) ) \ + { \ + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) 
*/ \ + PASTEMAC(opname,ind)( side, alpha, a, b, cntx ); \ + } \ + else \ + { \ + PASTEMAC(opname,nat)( side, alpha, a, b, cntx ); \ + } \ } GENFRONT( trmm ) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index b17ce10ac..3e13f23fa 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -36,17 +36,21 @@ cntl_t* bli_gemm_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ) { - return bli_gemmbp_cntl_create( family ); + return bli_gemmbp_cntl_create( family, schema_a, schema_b ); } // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ) { void* macro_kernel_p = bli_gemm_ker_var2; @@ -82,7 +86,7 @@ cntl_t* bli_gemmbp_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, + schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, gemm_cntl_bp_bu ); @@ -106,7 +110,7 @@ cntl_t* bli_gemmbp_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, + schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, gemm_cntl_op_bp ); @@ -134,6 +138,10 @@ cntl_t* bli_gemmbp_cntl_create // ----------------------------------------------------------------------------- +// This control tree creation function is disabled because it is no longer used. +// (It was originally created in the run up to publishing the 1m journal article, +// but was disabled to reduce complexity.) 
+#if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family @@ -223,6 +231,7 @@ cntl_t* bli_gemmpb_cntl_create return gemm_cntl_vl_mm; } +#endif // ----------------------------------------------------------------------------- diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 3b643e1fc..3b3cb1cf2 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -34,20 +34,26 @@ cntl_t* bli_gemm_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ); +#if 0 cntl_t* bli_gemmpb_cntl_create ( - opid_t family + opid_t family, ); +#endif // ----------------------------------------------------------------------------- diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index f2600d791..8aae5b476 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -87,10 +87,34 @@ void bli_gemm_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init().
+ if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 2406ee1d5..b12424d63 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -88,10 +88,34 @@ void bli_hemm_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_HEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end.
bli_l3_thread_decorator diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index 9448b881e..15ee65fad 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -106,10 +106,38 @@ void bli_her2k_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_HER2K, + BLIS_LEFT, // ignored for her[2]k/syr[2]k + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &bh_local ); + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &b_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &ah_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &bh_local ); + bli_obj_set_pack_schema( schema_a, &b_local ); + bli_obj_set_pack_schema( schema_b, &ah_local ); + } // Invoke herk twice, using beta only the first time. diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 8b2379a66..f6e5b55a3 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -86,10 +86,34 @@ void bli_herk_front } // Record the threading for each level within the context.
- bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_HERK, + BLIS_LEFT, // ignored for her[2]k/syr[2]k + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &ah_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &ah_local ); + } // Invoke the internal back-end. bli_l3_thread_decorator diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 0c229ef9b..84263bc9d 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -87,10 +87,34 @@ void bli_symm_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_SYMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()).
This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end. bli_l3_thread_decorator diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 32981cb89..769ca56a0 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -87,10 +87,38 @@ void bli_syr2k_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_SYR2K, + BLIS_LEFT, // ignored for her[2]k/syr[2]k + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init().
+ if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &bt_local ); + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &b_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &at_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &bt_local ); + bli_obj_set_pack_schema( schema_a, &b_local ); + bli_obj_set_pack_schema( schema_b, &at_local ); + } // Invoke herk twice, using beta only the first time. diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index eed5f0ebc..7a66ad68a 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -80,10 +80,34 @@ void bli_syrk_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_SYRK, + BLIS_LEFT, // ignored for her[2]k/syr[2]k + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init().
+ if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &at_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &at_local ); + } // Invoke the internal back-end. bli_l3_thread_decorator diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index afdabbbd2..935972442 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -130,10 +130,34 @@ void bli_trmm_front bli_obj_set_as_root( &c_local ); // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_TRMM, + side, + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end.
bli_l3_thread_decorator diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 664a7fd51..0f772f0fb 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -129,10 +129,34 @@ void bli_trmm3_front bli_obj_set_as_root( &c_local ); // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_TRMM3, + side, + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end.
bli_l3_thread_decorator diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index e05fc3d20..df9b831a3 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -36,16 +36,21 @@ cntl_t* bli_trsm_cntl_create ( - side_t side + side_t side, + pack_t schema_a, + pack_t schema_b ) { - if ( bli_is_left( side ) ) return bli_trsm_l_cntl_create(); - else return bli_trsm_r_cntl_create(); + if ( bli_is_left( side ) ) + return bli_trsm_l_cntl_create( schema_a, schema_b ); + else + return bli_trsm_r_cntl_create( schema_a, schema_b ); } cntl_t* bli_trsm_l_cntl_create ( - void + pack_t schema_a, + pack_t schema_b ) { void* macro_kernel_p = bli_trsm_xx_ker_var2; @@ -79,7 +84,7 @@ cntl_t* bli_trsm_l_cntl_create TRUE, // do NOT invert diagonal TRUE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, + schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, trsm_cntl_bp_bu ); @@ -103,7 +108,7 @@ cntl_t* bli_trsm_l_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, + schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, trsm_cntl_op_bp ); @@ -131,7 +136,8 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* bli_trsm_r_cntl_create ( - void + pack_t schema_a, + pack_t schema_b ) { void* macro_kernel_p = bli_trsm_xx_ker_var2; @@ -165,7 +171,7 @@ cntl_t* bli_trsm_r_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, + schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, trsm_cntl_bp_bu ); @@ -189,7 +195,7 @@ cntl_t* bli_trsm_r_cntl_create TRUE, // do NOT invert diagonal FALSE, // reverse iteration if upper? TRUE, // reverse iteration if lower? 
- BLIS_PACKED_COL_PANELS, + schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, trsm_cntl_op_bp ); diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index cfd20cad3..77c36aec2 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -34,17 +34,21 @@ cntl_t* bli_trsm_cntl_create ( - side_t side + side_t side, + pack_t schema_a, + pack_t schema_b ); cntl_t* bli_trsm_l_cntl_create ( - void + pack_t schema_a, + pack_t schema_b ); cntl_t* bli_trsm_r_cntl_create ( - void + pack_t schema_a, + pack_t schema_b ); void bli_trsm_cntl_free diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 2bc6d0186..081a2c284 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -121,10 +121,34 @@ void bli_trsm_front bli_obj_set_as_root( &c_local ); // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_TRSM, + side, + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_trsm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init().
+ if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end. bli_l3_thread_decorator diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 981b01c3e..d36a20ded 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -869,10 +869,10 @@ void bli_cntx_set_thrloop_from_env ( opid_t l3_op, side_t side, - cntx_t* cntx, dim_t m, dim_t n, - dim_t k + dim_t k, + cntx_t* cntx ) { dim_t jc, pc, ic, jr, ir; @@ -988,7 +988,7 @@ void bli_cntx_set_thrloop_from_env ); } } - else // if ( l3_op == BLIS_GEMM || l3_op == BLIS_HERK ) + else // any other level-3 operation besides trmm/trsm { bli_cntx_set_thrloop ( diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index ac43312bc..4aaec97c4 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -674,12 +674,15 @@ void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); -void bli_cntx_set_thrloop_from_env( opid_t l3_op, - side_t side, - cntx_t* cntx, - dim_t m, - dim_t n, - dim_t k ); +void bli_cntx_set_thrloop_from_env + ( + opid_t l3_op, + side_t side, + dim_t m, + dim_t n, + dim_t k, + cntx_t* cntx + ); void bli_cntx_print( cntx_t* cntx ); diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 131f70973..f2197597f 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -231,11 +231,18 @@ void bli_l3_thread_decorator { dim_t id = omp_get_thread_num(); + obj_t a_t, b_t, c_t; cntl_t* cntl_use; thrinfo_t* thread; + // Alias thread-local copies of A, B, and C. These will be the objects + // we pass into the thread functions. + bli_obj_alias_to( a, &a_t ); + bli_obj_alias_to( b, &b_t ); + bli_obj_alias_to( c, &c_t ); + // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, &a_t, &b_t, &c_t, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -243,17 +250,17 @@ void bli_l3_thread_decorator func ( alpha, - a, - b, + &a_t, + &b_t, beta, - c, + &c_t, cntx, cntl_use, thread ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( &a_t, &b_t, &c_t, cntl, cntl_use, thread ); #ifdef PRINT_THRINFO threads[id] = thread; diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index e2fa35c35..132fb6740 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -161,11 +161,18 @@ void* bli_l3_thread_entry( void* data_void ) dim_t id = data->id; thrcomm_t* gl_comm = data->gl_comm; + obj_t a_t, b_t, c_t; cntl_t* cntl_use; thrinfo_t* thread; + // Alias thread-local copies of A, B, and C. 
These will be the objects + // we pass into the thread function. + bli_obj_alias_to( a, &a_t ); + bli_obj_alias_to( b, &b_t ); + bli_obj_alias_to( c, &c_t ); + // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, &a_t, &b_t, &c_t, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -173,17 +180,17 @@ void* bli_l3_thread_entry( void* data_void ) func ( alpha, - a, - b, + &a_t, + &b_t, beta, - c, + &c_t, cntx, cntl_use, thread ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( &a_t, &b_t, &c_t, cntl, cntl_use, thread ); // Free the current thread's thrinfo_t structure. bli_l3_thrinfo_free( thread ); diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index cb0bc2ae4..068b7eda5 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -94,6 +94,12 @@ void bli_l3_thread_decorator cntl_t* cntl_use; thrinfo_t* thread; + // NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't + // need to alias objects for A, B, and C since they were already aliased + // in bli_*_front(). (We only needed thread-local copies so each could + // safely reset their internal (beta) scalars on c after the first + // iteration of the pc (kc) loop.) + // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); From d7fb32682057c7458c8891c0eedafc374fd9beef Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sun, 3 Jun 2018 13:20:37 -0500 Subject: [PATCH 09/37] Fixed syntax artifacts from 4b36e85 in examples. 
Details: - Fixed artifacts of malformed recursive sed expressions used when preparing 4b36e85, in which most function-like macros were converted to static functions. The syntactically defective code was contained entirely in examples/oapi. Thanks to Tony Skjellum for reporting this issue. - Update to CREDITS file. --- CREDITS | 1 + examples/oapi/0obj_basic.c | 8 ++++---- examples/oapi/2obj_ij.c | 14 +++++++------- examples/oapi/6level1m_diag.c | 8 ++++---- examples/oapi/7level2.c | 8 ++++---- examples/oapi/8level3.c | 8 ++++---- examples/oapi/9util.c | 12 ++++++------ 7 files changed, 30 insertions(+), 29 deletions(-) diff --git a/CREDITS b/CREDITS index 342f930f6..4382a5e6d 100644 --- a/CREDITS +++ b/CREDITS @@ -48,6 +48,7 @@ but many others have contributed code and feedback, including Rene Sitt Martin Schatz (The University of Texas at Austin) Nico Schlömer + Tony Skjellum (The University of Tennessee at Chattanooga) Mikhail Smelyanskiy (Intel, Parallel Computing Lab) Shaden Smith Tyler Smith (The University of Texas at Austin) diff --git a/examples/oapi/0obj_basic.c b/examples/oapi/0obj_basic.c index 38c2e7047..4cf1565b6 100644 --- a/examples/oapi/0obj_basic.c +++ b/examples/oapi/0obj_basic.c @@ -122,7 +122,7 @@ int main( int argc, char** argv ) // Let's inspect the amount of padding inserted for alignment. Note // the difference between the m dimension and the column stride. 
printf( "datatype %s\n", bli_dt_string( bli_obj_dt( &a8 ) ) ); - printf( "datatype size %d bytes\n", bli_dt_size( bli_obj_dt( &a8 ) ) ); + printf( "datatype size %d bytes\n", ( int )bli_dt_size( bli_obj_dt( &a8 ) ) ); printf( "m dim (# of rows): %d\n", ( int )bli_obj_length( &a8 ) ); printf( "n dim (# of cols): %d\n", ( int )bli_obj_width( &a8 ) ); printf( "row stride: %d\n", ( int )bli_obj_row_stride( &a8 ) ); @@ -140,7 +140,7 @@ int main( int argc, char** argv ) bli_obj_create( BLIS_DCOMPLEX, 3, 5, 0, 0, &a11 ); printf( "datatype %s\n", bli_dt_string( bli_obj_dt( &a9 ) ) ); - printf( "datatype size %d bytes\n", bli_dt_size( bli_obj_dt( &a9 ) ) ); + printf( "datatype size %d bytes\n", ( int )bli_dt_size( bli_obj_dt( &a9 ) ) ); printf( "m dim (# of rows): %d\n", ( int )bli_obj_length( &a9 ) ); printf( "n dim (# of cols): %d\n", ( int )bli_obj_width( &a9 ) ); printf( "row stride: %d\n", ( int )bli_obj_row_stride( &a9 ) ); @@ -148,7 +148,7 @@ int main( int argc, char** argv ) printf( "\n" ); printf( "datatype %s\n", bli_dt_string( bli_obj_dt( &a10 ) ) ); - printf( "datatype size %d bytes\n", bli_dt_size( bli_obj_dt( &a10 ) ) ); + printf( "datatype size %d bytes\n", ( int )bli_dt_size( bli_obj_dt( &a10 ) ) ); printf( "m dim (# of rows): %d\n", ( int )bli_obj_length( &a10 ) ); printf( "n dim (# of cols): %d\n", ( int )bli_obj_width( &a10 ) ); printf( "row stride: %d\n", ( int )bli_obj_row_stride( &a10 ) ); @@ -156,7 +156,7 @@ int main( int argc, char** argv ) printf( "\n" ); printf( "datatype %s\n", bli_dt_string( bli_obj_dt( &a11 ) ) ); - printf( "datatype size %d bytes\n", bli_dt_size( bli_obj_dt( &a11 ) ) ); + printf( "datatype size %d bytes\n", ( int )bli_dt_size( bli_obj_dt( &a11 ) ) ); printf( "m dim (# of rows): %d\n", ( int )bli_obj_length( &a11 ) ); printf( "n dim (# of cols): %d\n", ( int )bli_obj_width( &a11 ) ); printf( "row stride: %d\n", ( int )bli_obj_row_stride( &a11 ) ); diff --git a/examples/oapi/2obj_ij.c b/examples/oapi/2obj_ij.c index 
0a15ac8a4..322b7eff5 100644 --- a/examples/oapi/2obj_ij.c +++ b/examples/oapi/2obj_ij.c @@ -83,18 +83,18 @@ int main( int argc, char** argv ) bli_getijm( i, j, &a1, &alpha_r, &alpha_i ); // Here, we print out the element "returned" by bli_getijm(). - printf( "element (%2d,%2d) of matrix 'a1' (real + imag): %5.1f + %5.1f\n", i, j, alpha_r, alpha_i ); + printf( "element (%2d,%2d) of matrix 'a1' (real + imag): %5.1f + %5.1f\n", ( int )i, ( int )j, alpha_r, alpha_i ); // Let's query a few more elements. i = 0; j = 2; bli_getijm( i, j, &a1, &alpha_r, &alpha_i ); - printf( "element (%2d,%2d) of matrix 'a1' (real + imag): %5.1f + %5.1f\n", i, j, alpha_r, alpha_i ); + printf( "element (%2d,%2d) of matrix 'a1' (real + imag): %5.1f + %5.1f\n", ( int )i, ( int )j, alpha_r, alpha_i ); i = 3; j = 4; bli_getijm( i, j, &a1, &alpha_r, &alpha_i ); - printf( "element (%2d,%2d) of matrix 'a1' (real + imag): %5.1f + %5.1f\n", i, j, alpha_r, alpha_i ); + printf( "element (%2d,%2d) of matrix 'a1' (real + imag): %5.1f + %5.1f\n", ( int )i, ( int )j, alpha_r, alpha_i ); printf( "\n" ); @@ -224,8 +224,8 @@ void init_dmatrix_by_cols( dim_t m, dim_t n, double* a, inc_t rs, inc_t cs ) void init_dobj_by_cols( obj_t* a ) { - dim_t m = bli_obj_length( *a ); - dim_t n = bli_obj_width( *a ); + dim_t m = bli_obj_length( a ); + dim_t n = bli_obj_width( a ); dim_t i, j; double alpha = 0.0; @@ -245,8 +245,8 @@ void init_dobj_by_cols( obj_t* a ) void init_zobj_by_cols( obj_t* a ) { - dim_t m = bli_obj_length( *a ); - dim_t n = bli_obj_width( *a ); + dim_t m = bli_obj_length( a ); + dim_t n = bli_obj_width( a ); dim_t i, j; double alpha = 0.0; diff --git a/examples/oapi/6level1m_diag.c b/examples/oapi/6level1m_diag.c index 130311a6c..14ee8d902 100644 --- a/examples/oapi/6level1m_diag.c +++ b/examples/oapi/6level1m_diag.c @@ -59,7 +59,7 @@ int main( int argc, char** argv ) bli_obj_create( dt, m, n, rs, cs, &a ); // First, we mark the matrix structure as triangular. 
- bli_obj_set_struc( BLIS_TRIANGULAR, &a ) + bli_obj_set_struc( BLIS_TRIANGULAR, &a ); // Next, we specify whether the lower part or the upper part is to be // recognized as the "stored" region (which we call the uplo field). The @@ -89,7 +89,7 @@ int main( int argc, char** argv ) bli_obj_create( dt, m, n, rs, cs, &b ); // Set structure and uplo. - bli_obj_set_struc( BLIS_TRIANGULAR, &b ) + bli_obj_set_struc( BLIS_TRIANGULAR, &b ); bli_obj_set_uplo( BLIS_UPPER, &b ); // Create an alias, 'bl', of the original object 'b'. Both objects will @@ -245,7 +245,7 @@ int main( int argc, char** argv ) bli_obj_alias_to( &e, &el ); // Set structure and uplo of 'el'. - bli_obj_set_struc( BLIS_TRIANGULAR, &el ) + bli_obj_set_struc( BLIS_TRIANGULAR, &el ); bli_obj_set_uplo( BLIS_LOWER, &el ); // Digression: Notice that "triangular" structure does not require that @@ -290,7 +290,7 @@ int main( int argc, char** argv ) bli_obj_set_diag_offset( -1, &h ); // Set the structure and uplo of 'h'. - bli_obj_set_struc( BLIS_TRIANGULAR, &h ) + bli_obj_set_struc( BLIS_TRIANGULAR, &h ); bli_obj_set_uplo( BLIS_UPPER, &h ); // Randomize the elements on and above the first subdiagonal. diff --git a/examples/oapi/7level2.c b/examples/oapi/7level2.c index 6008a3a9b..1954f42e0 100644 --- a/examples/oapi/7level2.c +++ b/examples/oapi/7level2.c @@ -157,7 +157,7 @@ int main( int argc, char** argv ) // Mark matrix 'a' as symmetric and stored in the lower triangle, and // then randomize that lower triangle. - bli_obj_set_struc( BLIS_SYMMETRIC, &a ) + bli_obj_set_struc( BLIS_SYMMETRIC, &a ); bli_obj_set_uplo( BLIS_LOWER, &a ); bli_randm( &a ); @@ -200,7 +200,7 @@ int main( int argc, char** argv ) // Mark matrix 'a' as symmetric and stored in the upper triangle, and // then randomize that upper triangle. 
- bli_obj_set_struc( BLIS_SYMMETRIC, &a ) + bli_obj_set_struc( BLIS_SYMMETRIC, &a ); bli_obj_set_uplo( BLIS_UPPER, &a ); bli_randm( &a ); @@ -242,7 +242,7 @@ int main( int argc, char** argv ) // Mark matrix 'a' as triangular and stored in the lower triangle, and // then randomize that lower triangle. - bli_obj_set_struc( BLIS_TRIANGULAR, &a ) + bli_obj_set_struc( BLIS_TRIANGULAR, &a ); bli_obj_set_uplo( BLIS_LOWER, &a ); bli_randm( &a ); @@ -283,7 +283,7 @@ int main( int argc, char** argv ) // Mark matrix 'a' as triangular and stored in the lower triangle, and // then randomize that lower triangle. - bli_obj_set_struc( BLIS_TRIANGULAR, &a ) + bli_obj_set_struc( BLIS_TRIANGULAR, &a ); bli_obj_set_uplo( BLIS_LOWER, &a ); bli_randm( &a ); diff --git a/examples/oapi/8level3.c b/examples/oapi/8level3.c index a1fd55bce..ff850e2a4 100644 --- a/examples/oapi/8level3.c +++ b/examples/oapi/8level3.c @@ -148,7 +148,7 @@ int main( int argc, char** argv ) // Mark matrix 'c' as symmetric and stored in the lower triangle, and // then randomize that lower triangle. - bli_obj_set_struc( BLIS_SYMMETRIC, &c ) + bli_obj_set_struc( BLIS_SYMMETRIC, &c ); bli_obj_set_uplo( BLIS_LOWER, &c ); bli_randm( &c ); @@ -194,7 +194,7 @@ int main( int argc, char** argv ) // Mark matrix 'a' as symmetric and stored in the upper triangle, and // then randomize that upper triangle. - bli_obj_set_struc( BLIS_SYMMETRIC, &a ) + bli_obj_set_struc( BLIS_SYMMETRIC, &a ); bli_obj_set_uplo( BLIS_UPPER, &a ); bli_randm( &a ); @@ -241,7 +241,7 @@ int main( int argc, char** argv ) // Mark matrix 'a' as triangular and stored in the lower triangle, and // then randomize that lower triangle. - bli_obj_set_struc( BLIS_TRIANGULAR, &a ) + bli_obj_set_struc( BLIS_TRIANGULAR, &a ); bli_obj_set_uplo( BLIS_LOWER, &a ); bli_randm( &a ); @@ -286,7 +286,7 @@ int main( int argc, char** argv ) // Mark matrix 'a' as triangular and stored in the lower triangle, and // then randomize that lower triangle. 
- bli_obj_set_struc( BLIS_TRIANGULAR, &a ) + bli_obj_set_struc( BLIS_TRIANGULAR, &a ); bli_obj_set_uplo( BLIS_LOWER, &a ); bli_randm( &a ); diff --git a/examples/oapi/9util.c b/examples/oapi/9util.c index 61042c39f..55366b0f6 100644 --- a/examples/oapi/9util.c +++ b/examples/oapi/9util.c @@ -147,7 +147,7 @@ int main( int argc, char** argv ) bli_setm( &BLIS_MINUS_ONE, &c ); // Set the structure and uplo of 'c'. - bli_obj_set_struc( BLIS_SYMMETRIC, &c ) + bli_obj_set_struc( BLIS_SYMMETRIC, &c ); bli_obj_set_uplo( BLIS_LOWER, &c ); // Randomize the lower triangle of 'c'. @@ -170,7 +170,7 @@ int main( int argc, char** argv ) // Initialize all of 'd' to -1.0 to simulate junk values. bli_setm( &BLIS_MINUS_ONE, &d ); - bli_obj_set_struc( BLIS_HERMITIAN, &d ) + bli_obj_set_struc( BLIS_HERMITIAN, &d ); bli_obj_set_uplo( BLIS_LOWER, &d ); // Randomize the lower triangle of 'd'. @@ -185,7 +185,7 @@ int main( int argc, char** argv ) bli_printm( "d (after mkherm):", &d, "%4.1f", "" ); // Set the structure and uplo of 'd'. - bli_obj_set_struc( BLIS_HERMITIAN, &d ) + bli_obj_set_struc( BLIS_HERMITIAN, &d ); bli_obj_set_uplo( BLIS_LOWER, &d ); // @@ -203,7 +203,7 @@ int main( int argc, char** argv ) bli_setm( &BLIS_MINUS_ONE, &e ); // Set the structure and uplo of 'e'. - bli_obj_set_struc( BLIS_SYMMETRIC, &e ) + bli_obj_set_struc( BLIS_SYMMETRIC, &e ); bli_obj_set_uplo( BLIS_UPPER, &e ); // Randomize the upper triangle of 'e'. @@ -221,7 +221,7 @@ int main( int argc, char** argv ) bli_setm( &BLIS_MINUS_ONE, &f ); // Set the structure and uplo of 'f'. - bli_obj_set_struc( BLIS_HERMITIAN, &f ) + bli_obj_set_struc( BLIS_HERMITIAN, &f ); bli_obj_set_uplo( BLIS_UPPER, &f ); // Randomize the upper triangle of 'f'. @@ -249,7 +249,7 @@ int main( int argc, char** argv ) bli_setm( &BLIS_MINUS_ONE, &g ); // Set the structure and uplo of 'g'. 
- bli_obj_set_struc( BLIS_TRIANGULAR, &g ) + bli_obj_set_struc( BLIS_TRIANGULAR, &g ); bli_obj_set_uplo( BLIS_LOWER, &g ); // Randomize the lower triangle of 'g'. From 7a207e8f2c5046f8b295a78e029ff2de765c7409 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sun, 3 Jun 2018 18:04:27 -0500 Subject: [PATCH 10/37] Disabled indirect blacklisting (issue #214). Details: - Return early from function, pass_config_kernel_registries(), that implements indirect blacklisting of subconfigurations (during pass 0). In short, I realized that indirect blacklisting is not needed in the situations I envisioned, and can actually cause problems under certain circumstances. Thanks to Tony Skjellum for reporting the issue (#214) that led to this commit, and to Devin Matthews for prompting me to realize that indirect blacklisting was unnecessary, at least as originally envisioned. --- configure | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/configure b/configure index d1ee5527a..c09bbff78 100755 --- a/configure +++ b/configure @@ -291,16 +291,15 @@ pass_config_kernel_registries() passnum="$2" # Initialize a list of indirect blacklisted configurations for the - # current iteration. These are configurations that are invalidated - # by the removal of blacklisted configurations. For example, if - # haswell is registered as needing the 'haswell' and 'zen' kernel - # sets: + # current iteration. These are configurations that are invalidated by + # the removal of blacklisted configurations. For example, if haswell + # is registered as needing the 'haswell' and 'zen' kernel sets: # # haswell: haswell/haswell/zen # - # and 'zen' was blacklisted because of the compiler version, then - # the 'haswell' configuration must be omitted from the registry, - # as it no longer has all of the kernel sets it was expecting. 
+ # and 'zen' was blacklisted because of the compiler version, then the + # 'haswell' configuration must be omitted from the registry, as it no + # longer has all of the kernel sets it was expecting. if [ "${passnum}" == "0" ]; then indirect_blist="" fi @@ -310,6 +309,22 @@ pass_config_kernel_registries() # indirect_blist is still empty. all_blist="${config_blist} ${indirect_blist}" + # Disable support for indirect blacklisting by returning early during + # pass 0. See issue #214 for details [1]. Basically, I realized that + # indirect blacklisting is not needed in the use case that I envisioned + # in the real-life example above. If a subconfiguration such as haswell + # is defined to require the zen kernel set, it implies that the zen + # kernels can be compiled with haswell compiler flags. That is, just + # because the zen subconfig (and its compiler flags) is blacklisted + # does not mean that the haswell subconfig cannot compile the zen + # kernels with haswell-specific flags. + # + # [1] https://github.com/flame/blis/issues/214 + # + if [ "${passnum}" == "0" ]; then + return + fi + while read -r line do curline="${line}" @@ -1117,7 +1132,8 @@ check_compiler() # [1] While gcc 6.0 or newer is needed for zen support (-march=znver1), # we relax this compiler version constraint a bit by targeting bdver4 # and then disabling the instruction sets that were removed in the - # transition from bdver4 to znver1. + # transition from bdver4 to znver1. (See config/zen/make_defs.mk for + # the specific compiler flags used.) # [2] https://github.com/devinamatthews/tblis/ # @@ -1756,6 +1772,9 @@ main() echo "done." # Report if additional configurations needed to be blacklisted. + # NOTE: This branch should never execute so long as indirect blacklisting + # is disabled. See comment regarding issue #214 in the definition of + # pass_config_kernel_registries(). 
if [ -n "${indirect_blist}" ]; then echo "${script_name}: needed to indirectly blacklist additional configurations:" echo "${script_name}: ${indirect_blist}" From 2c6d99b99e50d70f904da298a0c59be16cc5c180 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sun, 3 Jun 2018 18:13:36 -0500 Subject: [PATCH 11/37] Fixed names out of alphabetical order in CREDITS. --- CREDITS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CREDITS b/CREDITS index 4382a5e6d..28770a8bd 100644 --- a/CREDITS +++ b/CREDITS @@ -10,9 +10,9 @@ The BLIS framework was primarily authored by but many others have contributed code and feedback, including Murtaza Ali (Texas Instruments) + Erling Andersen Alex Arslan Vernon Austel (IBM, T.J. Watson Research Center) - Erling Andersen Jed Brown (Argonne National Laboratory) Johannes Dieterich Krzysztof Drewniak @@ -39,15 +39,15 @@ but many others have contributed code and feedback, including Devangi Parikh (The University of Texas at Austin) Elmar Peise (RWTH-Aachen) Clément Pernet - Jack Poulson (Stanford) Ilya Polkovnichenko + Jack Poulson (Stanford) Michael Rader Pradeep Rao (AMD) Aleksei Rechinskii Karl Rupp - Rene Sitt Martin Schatz (The University of Texas at Austin) Nico Schlömer + Rene Sitt Tony Skjellum (The University of Tennessee at Chattanooga) Mikhail Smelyanskiy (Intel, Parallel Computing Lab) Shaden Smith From bd02c4e9f7fe07487276e61507335d48c8e05f35 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 4 Jun 2018 13:42:17 -0500 Subject: [PATCH 12/37] Cleanups to testsuite, input.operations format. Details: - Removed the line in each operation entry in input.operations titled "test sequential front-end" and the corresponding support for the lines in the testsuite input parsing code. This line was included in some of the earliest versions of the testsuite, back when I intended to eventually have separate multithreaded APIs.
Specifically, I envisioned that multithreaded and sequential testing could be enabled or disabled on an operation level. However, BLIS evolved in a different direction and still does not have multithreaded-specific APIs (even if it will eventually someday). But even if it did have such APIs, I doubt I would allow the user to enable/disable them on an operation level. Thus, this was a zombie future parameter that was never used and never made sense to begin with. The one instance of the front_seq variable, used in the various libblis_test_() functions to guard the call to the operation test driver, that remains was commented out instead of deleted so that someday it could be easily changed via sed, if desired. - Various minor cleanups to the testsuite code, including consolidating use of DISABLE and DISABLE_ALL and reexpressing certain conditional expressions in the libblis_test_() functions in terms of boolean functions. --- testsuite/input.operations | 58 +------------------------- testsuite/input.operations.fast | 55 +------------------------ testsuite/src/test_addm.c | 6 +-- testsuite/src/test_addv.c | 6 +-- testsuite/src/test_amaxv.c | 6 +-- testsuite/src/test_axpbyv.c | 6 +-- testsuite/src/test_axpy2v.c | 6 +-- testsuite/src/test_axpyf.c | 6 +-- testsuite/src/test_axpym.c | 6 +-- testsuite/src/test_axpyv.c | 6 +-- testsuite/src/test_copym.c | 6 +-- testsuite/src/test_copyv.c | 6 +-- testsuite/src/test_dotaxpyv.c | 6 +-- testsuite/src/test_dotv.c | 6 +-- testsuite/src/test_dotxaxpyf.c | 6 +-- testsuite/src/test_dotxf.c | 6 +-- testsuite/src/test_dotxv.c | 6 +-- testsuite/src/test_gemm.c | 6 +-- testsuite/src/test_gemm_ukr.c | 6 +-- testsuite/src/test_gemmtrsm_ukr.c | 6 +-- testsuite/src/test_gemv.c | 6 +-- testsuite/src/test_ger.c | 6 +-- testsuite/src/test_hemm.c | 6 +-- testsuite/src/test_hemv.c | 6 +-- testsuite/src/test_her.c | 6 +-- testsuite/src/test_her2.c | 6 +-- testsuite/src/test_her2k.c | 6 +-- testsuite/src/test_herk.c | 6 +-- 
testsuite/src/test_libblis.c | 67 +++++++++++++++++++++++-------- testsuite/src/test_libblis.h | 12 +++++- testsuite/src/test_normfm.c | 6 +-- testsuite/src/test_normfv.c | 6 +-- testsuite/src/test_randm.c | 6 +-- testsuite/src/test_randv.c | 6 +-- testsuite/src/test_scal2m.c | 6 +-- testsuite/src/test_scal2v.c | 6 +-- testsuite/src/test_scalm.c | 6 +-- testsuite/src/test_scalv.c | 6 +-- testsuite/src/test_setm.c | 6 +-- testsuite/src/test_setv.c | 6 +-- testsuite/src/test_subm.c | 6 +-- testsuite/src/test_subv.c | 6 +-- testsuite/src/test_symm.c | 6 +-- testsuite/src/test_symv.c | 6 +-- testsuite/src/test_syr.c | 6 +-- testsuite/src/test_syr2.c | 6 +-- testsuite/src/test_syr2k.c | 6 +-- testsuite/src/test_syrk.c | 6 +-- testsuite/src/test_trmm.c | 6 +-- testsuite/src/test_trmm3.c | 6 +-- testsuite/src/test_trmv.c | 6 +-- testsuite/src/test_trsm.c | 6 +-- testsuite/src/test_trsm_ukr.c | 6 +-- testsuite/src/test_trsv.c | 6 +-- testsuite/src/test_xpbyv.c | 6 +-- 55 files changed, 216 insertions(+), 282 deletions(-) diff --git a/testsuite/input.operations b/testsuite/input.operations index e3cd20503..c3e6d6f16 100644 --- a/testsuite/input.operations +++ b/testsuite/input.operations @@ -21,12 +21,7 @@ # determined by its local switch. For example, if the level-1v section # override is set to 1, and there is a 1 on the line marked "addv", # then the addv operation will be tested. Similarly, a 0 would cause -# addv to not be tested. NOTE: You may ignore the lines marked "test -# sequential front-end." These lines are for future use, to -# distinguish tests of the sequential implementation from tests of -# the multithreaded implementation. For now, BLIS does not contain -# separate APIs for multithreaded execution, even though -# multithreading is supported. So, these should be left set to 1. +# addv to not be tested. 
# # ENABLING ONLY SELECT OPERATIONS # If you would like to enable just a few (or even just one) operation @@ -105,75 +100,60 @@ # --- Utility -------------------------------------------------------------- 1 # randv -1 # test sequential front-end -1 # dimensions: m 1 # randm -1 # test sequential front-end -1 -1 # dimensions: m n # --- Level-1v ------------------------------------------------------------- 1 # addv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx 1 # amaxv -1 # test sequential front-end -1 # dimensions: m 1 # axpbyv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx 1 # axpyv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx 1 # copyv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx 1 # dotv -1 # test sequential front-end -1 # dimensions: m ?? # parameters: conjx conjy 1 # dotxv -1 # test sequential front-end -1 # dimensions: m ?? # parameters: conjx conjy 1 # normfv -1 # test sequential front-end -1 # dimensions: m 1 # scalv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjbeta 1 # scal2v -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx 1 # setv -1 # test sequential front-end -1 # dimensions: m 1 # subv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx 1 # xpbyv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx @@ -181,40 +161,32 @@ # --- Level-1m ------------------------------------------------------------- 1 # addm -1 # test sequential front-end -1 -2 # dimensions: m n ? # parameters: transa 1 # axpym -1 # test sequential front-end -1 -1 # dimensions: m n ? # parameters: transa 1 # copym -1 # test sequential front-end -1 -2 # dimensions: m n ? # parameters: transa 1 # normfm -1 # test sequential front-end -1 -2 # dimensions: m n 1 # scalm -1 # test sequential front-end -1 -2 # dimensions: m n ? 
# parameters: conjbeta 1 # scal2m -1 # test sequential front-end -1 -2 # dimensions: m n ? # parameters: transa 1 # setm -1 # test sequential front-end -1 -2 # dimensions: m n 1 # subm -1 # test sequential front-end -1 -2 # dimensions: m n ? # parameters: transa @@ -222,27 +194,22 @@ # --- Level-1f kernels ----------------------------------------------------- 1 # axpy2v -1 # test sequential front-end -1 # dimensions: m ?? # parameters: conjx conjy 1 # dotaxpyv -1 # test sequential front-end -1 # dimensions: m ??? # parameters: conjxt conjx conjy 1 # axpyf -1 # test sequential front-end -1 # dimensions: m ?? # parameters: conja conjx 1 # dotxf -1 # test sequential front-end -1 # dimensions: m ?? # parameters: conjat conjx 1 # dotxaxpyf -1 # test sequential front-end -1 # dimensions: m ???? # parameters: conjat conja conjw conjx @@ -250,52 +217,42 @@ # --- Level-2 -------------------------------------------------------------- 1 # gemv -1 # test sequential front-end -1 -2 # dimensions: m n ?? # parameters: transa conjx 1 # ger -1 # test sequential front-end -1 -2 # dimensions: m n ?? # parameters: conjx conjy 1 # hemv -1 # test sequential front-end -1 # dimensions: m ??? # parameters: uploa conja conjx 1 # her -1 # test sequential front-end -1 # dimensions: m ?? # parameters: uploc conjx 1 # her2 -1 # test sequential front-end -1 # dimensions: m ??? # parameters: uploc conjx conjy 1 # symv -1 # test sequential front-end -1 # dimensions: m ??? # parameters: uploa conja conjx 1 # syr -1 # test sequential front-end -1 # dimensions: m ?? # parameters: uploc conjx 1 # syr2 -1 # test sequential front-end -1 # dimensions: m ??? # parameters: uploc conjx conjy 1 # trmv -1 # test sequential front-end -1 # dimensions: m ??? # parameters: uploa transa diaga 1 # trsv -1 # test sequential front-end -1 # dimensions: m ??? 
# parameters: uploa transa diaga @@ -303,15 +260,12 @@ # --- Level-3 micro-kernels ------------------------------------------------ 1 # gemm -1 # test sequential micro-kernel -1 # dimensions: k 1 # trsm -1 # test sequential micro-kernel ? # parameters: uploa 1 # gemmtrsm -1 # test sequential micro-kernel -1 # dimensions: k ? # parameters: uploa @@ -319,52 +273,42 @@ # --- Level-3 -------------------------------------------------------------- 1 # gemm -1 # test sequential front-end -1 -1 -1 # dimensions: m n k ?? # parameters: transa transb 1 # hemm -1 # test sequential front-end -1 -1 # dimensions: m n ???? # parameters: side uploa conja transb 1 # herk -1 # test sequential front-end -1 -1 # dimensions: m k ?? # parameters: uploc transa 1 # her2k -1 # test sequential front-end -1 -1 # dimensions: m k ??? # parameters: uploc transa transb 1 # symm -1 # test sequential front-end -1 -1 # dimensions: m n ???? # parameters: side uploa conja transb 1 # syrk -1 # test sequential front-end -1 -1 # dimensions: m k ?? # parameters: uploc transa 1 # syr2k -1 # test sequential front-end -1 -1 # dimensions: m k ??? # parameters: uploc transa transb 1 # trmm -1 # test sequential front-end -1 -1 # dimensions: m n ???? # parameters: side uploa transa diaga 1 # trmm3 -1 # test sequential front-end -1 -1 # dimensions: m n ????n # parameters: side uploa transa diaga transb 1 # trsm -1 # test sequential front-end -1 -1 # dimensions: m n ???? # parameters: side uploa transa diaga diff --git a/testsuite/input.operations.fast b/testsuite/input.operations.fast index 3cf2ce52f..d86de6ecc 100644 --- a/testsuite/input.operations.fast +++ b/testsuite/input.operations.fast @@ -21,12 +21,7 @@ # determined by its local switch. For example, if the level-1v section # override is set to 1, and there is a 1 on the line marked "addv", # then the addv operation will be tested. Similarly, a 0 would cause -# addv to not be tested. NOTE: You may ignore the lines marked "test -# sequential front-end." 
These lines are for future use, to -# distinguish tests of the sequential implementation from tests of -# the multithreaded implementation. For now, BLIS does not contain -# separate APIs for multithreaded execution, even though -# multithreading is supported. So, these should be left set to 1. +# addv to not be tested. # # ENABLING ONLY SELECT OPERATIONS # If you would like to enable just a few (or even just one) operation @@ -105,75 +100,60 @@ # --- Utility -------------------------------------------------------------- 1 # randv -1 # test sequential front-end -1 # dimensions: m 1 # randm -1 # test sequential front-end -1 -1 # dimensions: m n # --- Level-1v ------------------------------------------------------------- 1 # addv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx 1 # amaxv -1 # test sequential front-end -1 # dimensions: m 1 # axpbyv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx 1 # axpyv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx 1 # copyv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx 1 # dotv -1 # test sequential front-end -1 # dimensions: m ?? # parameters: conjx conjy 1 # dotxv -1 # test sequential front-end -1 # dimensions: m ?? # parameters: conjx conjy 1 # normfv -1 # test sequential front-end -1 # dimensions: m 1 # scalv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjbeta 1 # scal2v -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx 1 # setv -1 # test sequential front-end -1 # dimensions: m 1 # subv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx 1 # xpbyv -1 # test sequential front-end -1 # dimensions: m ? # parameters: conjx @@ -181,40 +161,32 @@ # --- Level-1m ------------------------------------------------------------- 1 # addm -1 # test sequential front-end -1 -2 # dimensions: m n ? # parameters: transa 1 # axpym -1 # test sequential front-end -1 -1 # dimensions: m n ? 
# parameters: transa 1 # copym -1 # test sequential front-end -1 -2 # dimensions: m n ? # parameters: transa 1 # normfm -1 # test sequential front-end -1 -2 # dimensions: m n 1 # scalm -1 # test sequential front-end -1 -2 # dimensions: m n ? # parameters: conjbeta 1 # scal2m -1 # test sequential front-end -1 -2 # dimensions: m n ? # parameters: transa 1 # setm -1 # test sequential front-end -1 -2 # dimensions: m n 1 # subm -1 # test sequential front-end -1 -2 # dimensions: m n ? # parameters: transa @@ -222,27 +194,22 @@ # --- Level-1f kernels ----------------------------------------------------- 1 # axpy2v -1 # test sequential front-end -1 # dimensions: m ?? # parameters: conjx conjy 1 # dotaxpyv -1 # test sequential front-end -1 # dimensions: m ??? # parameters: conjxt conjx conjy 1 # axpyf -1 # test sequential front-end -1 # dimensions: m ?? # parameters: conja conjx 1 # dotxf -1 # test sequential front-end -1 # dimensions: m ?? # parameters: conjat conjx 1 # dotxaxpyf -1 # test sequential front-end -1 # dimensions: m ???? # parameters: conjat conja conjw conjx @@ -250,52 +217,42 @@ # --- Level-2 -------------------------------------------------------------- 1 # gemv -1 # test sequential front-end -1 -2 # dimensions: m n ?? # parameters: transa conjx 1 # ger -1 # test sequential front-end -1 -2 # dimensions: m n ?? # parameters: conjx conjy 1 # hemv -1 # test sequential front-end -1 # dimensions: m ??? # parameters: uploa conja conjx 1 # her -1 # test sequential front-end -1 # dimensions: m ?? # parameters: uploc conjx 1 # her2 -1 # test sequential front-end -1 # dimensions: m ??? # parameters: uploc conjx conjy 1 # symv -1 # test sequential front-end -1 # dimensions: m ??? # parameters: uploa conja conjx 1 # syr -1 # test sequential front-end -1 # dimensions: m ?? # parameters: uploc conjx 1 # syr2 -1 # test sequential front-end -1 # dimensions: m ??? # parameters: uploc conjx conjy 1 # trmv -1 # test sequential front-end -1 # dimensions: m ??? 
# parameters: uploa transa diaga 1 # trsv -1 # test sequential front-end -1 # dimensions: m ??? # parameters: uploa transa diaga @@ -319,52 +276,42 @@ # --- Level-3 -------------------------------------------------------------- 1 # gemm -1 # test sequential front-end -1 -1 -1 # dimensions: m n k nn # parameters: transa transb 1 # hemm -1 # test sequential front-end -1 -1 # dimensions: m n ??nn # parameters: side uploa conja transb 1 # herk -1 # test sequential front-end -1 -1 # dimensions: m k ?n # parameters: uploc transa 1 # her2k -1 # test sequential front-end -1 -1 # dimensions: m k ?nn # parameters: uploc transa transb 1 # symm -1 # test sequential front-end -1 -1 # dimensions: m n ??nn # parameters: side uploa conja transb 1 # syrk -1 # test sequential front-end -1 -1 # dimensions: m k ?n # parameters: uploc transa 1 # syr2k -1 # test sequential front-end -1 -1 # dimensions: m k ?nn # parameters: uploc transa transb 1 # trmm -1 # test sequential front-end -1 -1 # dimensions: m n ??n? # parameters: side uploa transa diaga 0 # trmm3 -1 # test sequential front-end -1 -1 # dimensions: m n ??n?n # parameters: side uploa transa diaga transb 1 # trsm -1 # test sequential front-end -1 -1 # dimensions: m n ??n? # parameters: side uploa transa diaga diff --git a/testsuite/src/test_addm.c b/testsuite/src/test_addm.c index 2cca94caa..b49783231 100644 --- a/testsuite/src/test_addm.c +++ b/testsuite/src/test_addm.c @@ -104,17 +104,17 @@ void libblis_test_addm { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1m_over == DISABLE_ALL ) return; + libblis_test_l1m_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_addm_deps( params, op ); // Execute the test driver for each implementation requested. 
- if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_addv.c b/testsuite/src/test_addv.c index 048af87a3..97afcc792 100644 --- a/testsuite/src/test_addv.c +++ b/testsuite/src/test_addv.c @@ -103,17 +103,17 @@ void libblis_test_addv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_addv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_amaxv.c b/testsuite/src/test_amaxv.c index 5befee328..7ce0ef506 100644 --- a/testsuite/src/test_amaxv.c +++ b/testsuite/src/test_amaxv.c @@ -107,17 +107,17 @@ void libblis_test_amaxv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_amaxv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_axpbyv.c b/testsuite/src/test_axpbyv.c index 737384c0a..460909eb5 100644 --- a/testsuite/src/test_axpbyv.c +++ b/testsuite/src/test_axpbyv.c @@ -114,17 +114,17 @@ void libblis_test_axpbyv { // Return early if this test has already been done. 
- if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_axpbyv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c index 6319cc29d..4f1f2c8c6 100644 --- a/testsuite/src/test_axpy2v.c +++ b/testsuite/src/test_axpy2v.c @@ -114,17 +114,17 @@ void libblis_test_axpy2v { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1f_over == DISABLE_ALL ) return; + libblis_test_l1f_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_axpy2v_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c index 61397ab3d..e7c7ad69f 100644 --- a/testsuite/src/test_axpyf.c +++ b/testsuite/src/test_axpyf.c @@ -112,17 +112,17 @@ void libblis_test_axpyf { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1f_over == DISABLE_ALL ) return; + libblis_test_l1f_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_axpyf_deps( params, op ); // Execute the test driver for each implementation requested. 
- if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_axpym.c b/testsuite/src/test_axpym.c index 04992a0de..53250106e 100644 --- a/testsuite/src/test_axpym.c +++ b/testsuite/src/test_axpym.c @@ -109,17 +109,17 @@ void libblis_test_axpym { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1m_over == DISABLE_ALL ) return; + libblis_test_l1m_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_axpym_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_axpyv.c b/testsuite/src/test_axpyv.c index e616c3350..cb3415692 100644 --- a/testsuite/src/test_axpyv.c +++ b/testsuite/src/test_axpyv.c @@ -109,17 +109,17 @@ void libblis_test_axpyv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_axpyv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_copym.c b/testsuite/src/test_copym.c index 166e2efe8..859f34fda 100644 --- a/testsuite/src/test_copym.c +++ b/testsuite/src/test_copym.c @@ -103,17 +103,17 @@ void libblis_test_copym { // Return early if this test has already been done. 
- if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1m_over == DISABLE_ALL ) return; + libblis_test_l1m_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_copym_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_copyv.c b/testsuite/src/test_copyv.c index c9e77e6dc..a7f0b9aec 100644 --- a/testsuite/src/test_copyv.c +++ b/testsuite/src/test_copyv.c @@ -103,17 +103,17 @@ void libblis_test_copyv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_copyv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c index 41a6cea89..26c1e0288 100644 --- a/testsuite/src/test_dotaxpyv.c +++ b/testsuite/src/test_dotaxpyv.c @@ -116,17 +116,17 @@ void libblis_test_dotaxpyv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1f_over == DISABLE_ALL ) return; + libblis_test_l1f_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_dotaxpyv_deps( params, op ); // Execute the test driver for each implementation requested. 
- if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_dotv.c b/testsuite/src/test_dotv.c index f6a177a42..79368ad3e 100644 --- a/testsuite/src/test_dotv.c +++ b/testsuite/src/test_dotv.c @@ -105,17 +105,17 @@ void libblis_test_dotv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_dotv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c index e9160af37..85b819b79 100644 --- a/testsuite/src/test_dotxaxpyf.c +++ b/testsuite/src/test_dotxaxpyf.c @@ -122,17 +122,17 @@ void libblis_test_dotxaxpyf { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1f_over == DISABLE_ALL ) return; + libblis_test_l1f_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_dotxaxpyf_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c index ff3032a55..c6a1d0977 100644 --- a/testsuite/src/test_dotxf.c +++ b/testsuite/src/test_dotxf.c @@ -114,17 +114,17 @@ void libblis_test_dotxf { // Return early if this test has already been done. 
- if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1f_over == DISABLE_ALL ) return; + libblis_test_l1f_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_dotxf_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_dotxv.c b/testsuite/src/test_dotxv.c index 796574220..82d876190 100644 --- a/testsuite/src/test_dotxv.c +++ b/testsuite/src/test_dotxv.c @@ -110,17 +110,17 @@ void libblis_test_dotxv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_dotxv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c index e692a54e4..061f0f825 100644 --- a/testsuite/src/test_gemm.c +++ b/testsuite/src/test_gemm.c @@ -116,17 +116,17 @@ void libblis_test_gemm { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3_over == DISABLE_ALL ) return; + libblis_test_l3_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_gemm_deps( params, op ); // Execute the test driver for each implementation requested. 
- if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index c86cfcafe..df79e15a9 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -117,17 +117,17 @@ void libblis_test_gemm_ukr { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3ukr_over == DISABLE_ALL ) return; + libblis_test_l3ukr_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_gemm_ukr_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index a24fdf896..db142487f 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -132,17 +132,17 @@ void libblis_test_gemmtrsm_ukr { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3ukr_over == DISABLE_ALL ) return; + libblis_test_l3ukr_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_gemmtrsm_ukr_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_gemv.c b/testsuite/src/test_gemv.c index ac2cf9b69..a7be2860d 100644 --- a/testsuite/src/test_gemv.c +++ b/testsuite/src/test_gemv.c @@ -113,17 +113,17 @@ void libblis_test_gemv { // Return early if this test has already been done. 
- if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l2_over == DISABLE_ALL ) return; + libblis_test_l2_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_gemv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_ger.c b/testsuite/src/test_ger.c index 35054793f..220d1dcf9 100644 --- a/testsuite/src/test_ger.c +++ b/testsuite/src/test_ger.c @@ -111,17 +111,17 @@ void libblis_test_ger { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l2_over == DISABLE_ALL ) return; + libblis_test_l2_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_ger_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_hemm.c b/testsuite/src/test_hemm.c index b2265a468..535450262 100644 --- a/testsuite/src/test_hemm.c +++ b/testsuite/src/test_hemm.c @@ -119,17 +119,17 @@ void libblis_test_hemm { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3_over == DISABLE_ALL ) return; + libblis_test_l3_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_hemm_deps( params, op ); // Execute the test driver for each implementation requested. 
- if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_hemv.c b/testsuite/src/test_hemv.c index a4ddefda1..0cae6044d 100644 --- a/testsuite/src/test_hemv.c +++ b/testsuite/src/test_hemv.c @@ -114,17 +114,17 @@ void libblis_test_hemv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l2_over == DISABLE_ALL ) return; + libblis_test_l2_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_hemv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_her.c b/testsuite/src/test_her.c index 06e258925..c0e857387 100644 --- a/testsuite/src/test_her.c +++ b/testsuite/src/test_her.c @@ -111,17 +111,17 @@ void libblis_test_her { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l2_over == DISABLE_ALL ) return; + libblis_test_l2_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_her_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_her2.c b/testsuite/src/test_her2.c index c0210c0b9..827a723ce 100644 --- a/testsuite/src/test_her2.c +++ b/testsuite/src/test_her2.c @@ -113,17 +113,17 @@ void libblis_test_her2 { // Return early if this test has already been done. 
- if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l2_over == DISABLE_ALL ) return; + libblis_test_l2_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_her2_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_her2k.c b/testsuite/src/test_her2k.c index d79f56698..b708559cb 100644 --- a/testsuite/src/test_her2k.c +++ b/testsuite/src/test_her2k.c @@ -117,17 +117,17 @@ void libblis_test_her2k { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3_over == DISABLE_ALL ) return; + libblis_test_l3_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_her2k_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_herk.c b/testsuite/src/test_herk.c index 1db6dc113..8b3bb74b6 100644 --- a/testsuite/src/test_herk.c +++ b/testsuite/src/test_herk.c @@ -115,17 +115,17 @@ void libblis_test_herk { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3_over == DISABLE_ALL ) return; + libblis_test_l3_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_herk_deps( params, op ); // Execute the test driver for each implementation requested. 
- if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 20a69254d..419fadbcb 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -498,10 +498,6 @@ void libblis_test_read_op_info( test_ops_t* ops, ops->indiv_over = TRUE; } - // Read the line for the sequential front-end/micro-kernel interface. - libblis_test_read_next_line( buffer, input_stream ); - sscanf( buffer, "%d ", &(op->front_seq) ); - op->n_dims = libblis_test_get_n_dims_from_dimset( dimset ); op->dimset = dimset; @@ -569,12 +565,6 @@ void libblis_test_read_op_info( test_ops_t* ops, // Initialize the parent pointer. op->ops = ops; - - // Disable operation if requested. - if ( op->op_switch == DISABLE_ALL ) - { - op->front_seq = DISABLE; - } } @@ -975,9 +965,7 @@ void libblis_test_output_op_struct( FILE* os, test_op_t* op, char* op_str ) { dimset_t dimset = op->dimset; - libblis_test_fprintf_c( os, "test %s seq front-end? %d\n", op_str, op->front_seq ); - - if ( dimset == BLIS_TEST_DIMS_MNK ) + if ( dimset == BLIS_TEST_DIMS_MNK ) { libblis_test_fprintf_c( os, "%s m n k %d %d %d\n", op_str, op->dim_spec[0], op->dim_spec[1], op->dim_spec[2] ); @@ -2434,7 +2422,7 @@ int libblis_test_op_is_disabled( test_op_t* op ) // If there was at least one individual override, then an op test is // disabled if it is NOT equal to ENABLE_ONLY. If there were no // individual overrides, then an op test is disabled if it is equal - // to DISABLE_ALL. + // to DISABLE. 
if ( op->ops->indiv_over == TRUE ) { if ( op->op_switch != ENABLE_ONLY ) r_val = TRUE; @@ -2442,9 +2430,56 @@ int libblis_test_op_is_disabled( test_op_t* op ) } else // if ( op->ops->indiv_over == FALSE ) { - if ( op->op_switch == DISABLE_ALL ) r_val = TRUE; - else r_val = FALSE; + if ( op->op_switch == DISABLE ) r_val = TRUE; + else r_val = FALSE; } return r_val; } + +int libblis_test_op_is_done( test_op_t* op ) +{ + return op->test_done; +} + +int libblis_test_util_is_disabled( test_op_t* op ) +{ + if ( op->ops->util_over == DISABLE ) return TRUE; + else return FALSE; +} + +int libblis_test_l1v_is_disabled( test_op_t* op ) +{ + if ( op->ops->l1v_over == DISABLE ) return TRUE; + else return FALSE; +} + +int libblis_test_l1m_is_disabled( test_op_t* op ) +{ + if ( op->ops->l1m_over == DISABLE ) return TRUE; + else return FALSE; +} + +int libblis_test_l1f_is_disabled( test_op_t* op ) +{ + if ( op->ops->l1f_over == DISABLE ) return TRUE; + else return FALSE; +} + +int libblis_test_l2_is_disabled( test_op_t* op ) +{ + if ( op->ops->l2_over == DISABLE ) return TRUE; + else return FALSE; +} + +int libblis_test_l3ukr_is_disabled( test_op_t* op ) +{ + if ( op->ops->l3ukr_over == DISABLE ) return TRUE; + else return FALSE; +} + +int libblis_test_l3_is_disabled( test_op_t* op ) +{ + if ( op->ops->l3_over == DISABLE ) return TRUE; + else return FALSE; +} diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 69b51e333..ac49f8da1 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -86,8 +86,6 @@ #define SECONDS_TO_SLEEP 3 -#define DISABLE_ALL 0 -#define SPECIFY 1 #define DISABLE 0 #define ENABLE 1 #define ENABLE_ONLY 2 @@ -187,7 +185,9 @@ typedef struct opid_t opid; int op_switch; +#if 0 int front_seq; +#endif unsigned int n_dims; dimset_t dimset; int dim_spec[ MAX_NUM_DIMENSIONS ]; @@ -430,6 +430,14 @@ void libblis_test_parse_command_line( int argc, char** argv ); void libblis_test_check_empty_problem( obj_t* c, double* 
perf, double* resid ); int libblis_test_op_is_disabled( test_op_t* op ); +int libblis_test_op_is_done( test_op_t* op ); +int libblis_test_util_is_disabled( test_op_t* op ); +int libblis_test_l1v_is_disabled( test_op_t* op ); +int libblis_test_l1m_is_disabled( test_op_t* op ); +int libblis_test_l1f_is_disabled( test_op_t* op ); +int libblis_test_l2_is_disabled( test_op_t* op ); +int libblis_test_l3ukr_is_disabled( test_op_t* op ); +int libblis_test_l3_is_disabled( test_op_t* op ); // // --- Test module headers ----------------------------------------------------- diff --git a/testsuite/src/test_normfm.c b/testsuite/src/test_normfm.c index 1bee0756e..ba11f015c 100644 --- a/testsuite/src/test_normfm.c +++ b/testsuite/src/test_normfm.c @@ -102,17 +102,17 @@ void libblis_test_normfm { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1m_over == DISABLE_ALL ) return; + libblis_test_l1m_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_normfm_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_normfv.c b/testsuite/src/test_normfv.c index c5224cd06..7fb05e29d 100644 --- a/testsuite/src/test_normfv.c +++ b/testsuite/src/test_normfv.c @@ -102,17 +102,17 @@ void libblis_test_normfv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. 
if ( TRUE ) libblis_test_normfv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_randm.c b/testsuite/src/test_randm.c index 9eda02a34..8cb51e872 100644 --- a/testsuite/src/test_randm.c +++ b/testsuite/src/test_randm.c @@ -99,17 +99,17 @@ void libblis_test_randm { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->util_over == DISABLE_ALL ) return; + libblis_test_util_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_randm_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_randv.c b/testsuite/src/test_randv.c index 7ac693c9a..a7f10947e 100644 --- a/testsuite/src/test_randv.c +++ b/testsuite/src/test_randv.c @@ -99,17 +99,17 @@ void libblis_test_randv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->util_over == DISABLE_ALL ) return; + libblis_test_util_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_randv_deps( params, op ); // Execute the test driver for each implementation requested. 
- if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_scal2m.c b/testsuite/src/test_scal2m.c index dfbbdb64a..06a8ff0ab 100644 --- a/testsuite/src/test_scal2m.c +++ b/testsuite/src/test_scal2m.c @@ -108,17 +108,17 @@ void libblis_test_scal2m { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1m_over == DISABLE_ALL ) return; + libblis_test_l1m_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_scal2m_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_scal2v.c b/testsuite/src/test_scal2v.c index 9ab18d317..379c2179b 100644 --- a/testsuite/src/test_scal2v.c +++ b/testsuite/src/test_scal2v.c @@ -108,17 +108,17 @@ void libblis_test_scal2v { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_scal2v_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_scalm.c b/testsuite/src/test_scalm.c index 2d6f53e80..f4c73bc47 100644 --- a/testsuite/src/test_scalm.c +++ b/testsuite/src/test_scalm.c @@ -104,17 +104,17 @@ void libblis_test_scalm { // Return early if this test has already been done. 
- if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1m_over == DISABLE_ALL ) return; + libblis_test_l1m_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_scalm_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_scalv.c b/testsuite/src/test_scalv.c index b019cd3c7..045ca0f2c 100644 --- a/testsuite/src/test_scalv.c +++ b/testsuite/src/test_scalv.c @@ -105,17 +105,17 @@ void libblis_test_scalv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_scalv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_setm.c b/testsuite/src/test_setm.c index 781ec4aa5..de1bd3636 100644 --- a/testsuite/src/test_setm.c +++ b/testsuite/src/test_setm.c @@ -101,17 +101,17 @@ void libblis_test_setm { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1m_over == DISABLE_ALL ) return; + libblis_test_l1m_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_setm_deps( params, op ); // Execute the test driver for each implementation requested. 
- if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_setv.c b/testsuite/src/test_setv.c index 456aca6cb..912a4885f 100644 --- a/testsuite/src/test_setv.c +++ b/testsuite/src/test_setv.c @@ -101,17 +101,17 @@ void libblis_test_setv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_setv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_subm.c b/testsuite/src/test_subm.c index 950c1fc18..9821329fd 100644 --- a/testsuite/src/test_subm.c +++ b/testsuite/src/test_subm.c @@ -104,17 +104,17 @@ void libblis_test_subm { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1m_over == DISABLE_ALL ) return; + libblis_test_l1m_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_subm_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_subv.c b/testsuite/src/test_subv.c index 0189d321f..1c1152ae5 100644 --- a/testsuite/src/test_subv.c +++ b/testsuite/src/test_subv.c @@ -104,17 +104,17 @@ void libblis_test_subv { // Return early if this test has already been done. 
- if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_subv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_symm.c b/testsuite/src/test_symm.c index 0ba63d82a..04b8ffe82 100644 --- a/testsuite/src/test_symm.c +++ b/testsuite/src/test_symm.c @@ -119,17 +119,17 @@ void libblis_test_symm { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3_over == DISABLE_ALL ) return; + libblis_test_l3_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_symm_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_symv.c b/testsuite/src/test_symv.c index 4d5e0f386..99343cff6 100644 --- a/testsuite/src/test_symv.c +++ b/testsuite/src/test_symv.c @@ -114,17 +114,17 @@ void libblis_test_symv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l2_over == DISABLE_ALL ) return; + libblis_test_l2_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_symv_deps( params, op ); // Execute the test driver for each implementation requested. 
- if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_syr.c b/testsuite/src/test_syr.c index dbc1f4d26..c8f496d1f 100644 --- a/testsuite/src/test_syr.c +++ b/testsuite/src/test_syr.c @@ -111,17 +111,17 @@ void libblis_test_syr { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l2_over == DISABLE_ALL ) return; + libblis_test_l2_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_syr_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_syr2.c b/testsuite/src/test_syr2.c index 9edad1b69..9ee68db9b 100644 --- a/testsuite/src/test_syr2.c +++ b/testsuite/src/test_syr2.c @@ -113,17 +113,17 @@ void libblis_test_syr2 { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l2_over == DISABLE_ALL ) return; + libblis_test_l2_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_syr2_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_syr2k.c b/testsuite/src/test_syr2k.c index fcf837575..0ff10cdc2 100644 --- a/testsuite/src/test_syr2k.c +++ b/testsuite/src/test_syr2k.c @@ -117,17 +117,17 @@ void libblis_test_syr2k { // Return early if this test has already been done. 
- if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3_over == DISABLE_ALL ) return; + libblis_test_l3_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_syr2k_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_syrk.c b/testsuite/src/test_syrk.c index 8b1ecc535..3cd5e2c48 100644 --- a/testsuite/src/test_syrk.c +++ b/testsuite/src/test_syrk.c @@ -115,17 +115,17 @@ void libblis_test_syrk { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3_over == DISABLE_ALL ) return; + libblis_test_l3_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_syrk_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c index e13ad08a6..e10237ef0 100644 --- a/testsuite/src/test_trmm.c +++ b/testsuite/src/test_trmm.c @@ -115,17 +115,17 @@ void libblis_test_trmm { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3_over == DISABLE_ALL ) return; + libblis_test_l3_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_trmm_deps( params, op ); // Execute the test driver for each implementation requested. 
- if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_trmm3.c b/testsuite/src/test_trmm3.c index 162731bf6..741678c1c 100644 --- a/testsuite/src/test_trmm3.c +++ b/testsuite/src/test_trmm3.c @@ -119,17 +119,17 @@ void libblis_test_trmm3 { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3_over == DISABLE_ALL ) return; + libblis_test_l3_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_trmm3_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_trmv.c b/testsuite/src/test_trmv.c index 75a3d1acf..5030ef37c 100644 --- a/testsuite/src/test_trmv.c +++ b/testsuite/src/test_trmv.c @@ -110,17 +110,17 @@ void libblis_test_trmv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l2_over == DISABLE_ALL ) return; + libblis_test_l2_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_trmv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_trsm.c b/testsuite/src/test_trsm.c index d4debc516..39860fa6d 100644 --- a/testsuite/src/test_trsm.c +++ b/testsuite/src/test_trsm.c @@ -115,17 +115,17 @@ void libblis_test_trsm { // Return early if this test has already been done. 
- if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3_over == DISABLE_ALL ) return; + libblis_test_l3_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_trsm_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 5bbec8dd0..e4f6edb75 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -115,17 +115,17 @@ void libblis_test_trsm_ukr { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l3ukr_over == DISABLE_ALL ) return; + libblis_test_l3ukr_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_trsm_ukr_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_trsv.c b/testsuite/src/test_trsv.c index 0872af5b9..11255d5d1 100644 --- a/testsuite/src/test_trsv.c +++ b/testsuite/src/test_trsv.c @@ -110,17 +110,17 @@ void libblis_test_trsv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l2_over == DISABLE_ALL ) return; + libblis_test_l2_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_trsv_deps( params, op ); // Execute the test driver for each implementation requested. 
- if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, diff --git a/testsuite/src/test_xpbyv.c b/testsuite/src/test_xpbyv.c index 34129a5fd..3578f51f8 100644 --- a/testsuite/src/test_xpbyv.c +++ b/testsuite/src/test_xpbyv.c @@ -108,17 +108,17 @@ void libblis_test_xpbyv { // Return early if this test has already been done. - if ( op->test_done == TRUE ) return; + if ( libblis_test_op_is_done( op ) ) return; // Return early if operation is disabled. if ( libblis_test_op_is_disabled( op ) || - op->ops->l1v_over == DISABLE_ALL ) return; + libblis_test_l1v_is_disabled( op ) ) return; // Call dependencies first. if ( TRUE ) libblis_test_xpbyv_deps( params, op ); // Execute the test driver for each implementation requested. - if ( op->front_seq == ENABLE ) + //if ( op->front_seq == ENABLE ) { libblis_test_op_driver( params, op, From 96d2774b4cb44ff1e8b5798d7cfc83154a607624 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 5 Jun 2018 14:17:39 +0200 Subject: [PATCH 13/37] Make bli_auxinfo_next_b() return b_next, not a_next (#216) --- frame/base/bli_auxinfo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frame/base/bli_auxinfo.h b/frame/base/bli_auxinfo.h index 29facd2ae..3fbc8dbdb 100644 --- a/frame/base/bli_auxinfo.h +++ b/frame/base/bli_auxinfo.h @@ -53,7 +53,7 @@ static void* bli_auxinfo_next_a( auxinfo_t* ai ) } static void* bli_auxinfo_next_b( auxinfo_t* ai ) { - return ai->a_next; + return ai->b_next; } static inc_t bli_auxinfo_is_a( auxinfo_t* ai ) From 1b9af85ec98d91bb2b27aadaa3df344d18faff35 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 5 Jun 2018 16:07:13 -0500 Subject: [PATCH 14/37] Updated ref99 call to _cntx_set_thrloop_from_env(). Details: - Reordered the arguments in the ref99 sandbox's call to bli_cntx_set_thrloop_from_env() to be consistent with the updated function signature from f97a86f. Thanks to Devangi Parikh for reporting this issue. 
--- sandbox/ref99/blx_gemm_front.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sandbox/ref99/blx_gemm_front.c b/sandbox/ref99/blx_gemm_front.c index 2010011d4..13615109b 100644 --- a/sandbox/ref99/blx_gemm_front.c +++ b/sandbox/ref99/blx_gemm_front.c @@ -97,10 +97,15 @@ void blx_gemm_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); // Invoke the internal back-end via the thread handler. blx_gemm_thread From 3f48c38164b4135515b5c752c506fdccc4480be2 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 5 Jun 2018 16:52:35 -0500 Subject: [PATCH 15/37] Cosmetic fix to configure output in config.mk. Details: - Fixed configure so that MK_ENABLE_MEMKIND is assigned "no" when the option is disabled due to libmemkind not being present. This wasn't affecting anything since the one use of the variable (in common.mk) was formulated as "ifeq ($(MK_ENABLE_MEMKIND),yes)". That is, the variable being empty was effectively equivalent to it being set to "no". - Comment updates to build/config.mk.in, common.mk. --- build/config.mk.in | 5 +++-- common.mk | 6 ++++-- configure | 6 ++++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/build/config.mk.in b/build/config.mk.in index ed0485e16..8c2dced21 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -118,8 +118,9 @@ MK_ENABLE_CBLAS := @enable_cblas@ # Whether libblis will depend on libmemkind for certain memory allocations. MK_ENABLE_MEMKIND := @enable_memkind@ -# Whether an alternative gemm implementation will be compiled and included -# in BLIS. +# The name of a sandbox defining an alternative gemm implementation. 
If empty, +# no sandbox will be used and the conventional gemm implementation will remain +# enabled. SANDBOX := @sandbox@ # end of ifndef CONFIG_MK_INCLUDED conditional block diff --git a/common.mk b/common.mk index 955adbd30..3104b8ba8 100644 --- a/common.mk +++ b/common.mk @@ -648,7 +648,10 @@ PARENT_PATH := $(DIST_PATH) # -- sandbox -- # Construct paths to each sandbox. (At present, there can be only one.) -SANDBOX_PATHS := := $(addprefix $(SANDBOX_PATH)/, $(SANDBOX)) +# NOTE: If $(SANDBOX) is empty (because no sandbox was enabled at configure- +# time) then $(SANDBOX_PATHS) will also be empty, which will cause no +# fragments to be included. +SANDBOX_PATHS := $(addprefix $(SANDBOX_PATH)/, $(SANDBOX)) # This variable is used by the include statements as they recursively include # one another. For the 'sandbox' directory, we initialize it to that directory @@ -658,7 +661,6 @@ PARENT_PATH := $(DIST_PATH)/$(SANDBOX_DIR) # Recursively include the makefile fragments in the sandbox sub-directory. -include $(addsuffix /$(FRAGMENT_MK), $(SANDBOX_PATHS)) - # Create a list of the makefile fragments using the variable into which each # of the above include statements accumulated their directory paths. MAKEFILE_FRAGMENTS := $(addsuffix /$(FRAGMENT_MK), $(FRAGMENT_DIR_PATHS)) diff --git a/configure b/configure index 8894835b3..69365cad0 100755 --- a/configure +++ b/configure @@ -2301,9 +2301,10 @@ main() fi if [ "x${has_memkind}" = "xyes" ]; then # If no explicit option was given for libmemkind one way or the other, - # default to the value previously returned by has_libmemkind(). + # we use the value returned previously by has_libmemkind() to determine + # the default. if [ "x${enable_memkind}" = "x" ]; then - enable_memkind=${has_memkind} + enable_memkind="yes" fi echo "${script_name}: libmemkind found; default is to enable use." 
if [ "x${enable_memkind}" = "xyes" ]; then @@ -2318,6 +2319,7 @@ main() if [ "x${enable_memkind}" = "xyes" ]; then echo "${script_name}: cannot honor explicit request to enable libmemkind." fi + enable_memkind="no" enable_memkind_01=0 fi if [ "x${enable_blas}" = "xyes" ]; then From 3df39b37a0134befa34b6b6259db98467c7bc965 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 6 Jun 2018 15:35:05 -0500 Subject: [PATCH 16/37] Fixed recently broken input.operations.fast. Details: - Removed "test sequential front-end" lines from microkernel test entries of input.operations.fast. This change was meant for inclusion in bd02c4e but was missed due to slightly different wording of the comment (I used "sed //d" to remove the lines). This fixes the broken 'make checkblis-fast' (and 'make check') targets. --- testsuite/input.operations.fast | 3 --- 1 file changed, 3 deletions(-) diff --git a/testsuite/input.operations.fast b/testsuite/input.operations.fast index d86de6ecc..d2a44276e 100644 --- a/testsuite/input.operations.fast +++ b/testsuite/input.operations.fast @@ -260,15 +260,12 @@ # --- Level-3 micro-kernels ------------------------------------------------ 1 # gemm -1 # test sequential micro-kernel -1 # dimensions: k 1 # trsm -1 # test sequential micro-kernel ? # parameters: uploa 1 # gemmtrsm -1 # test sequential micro-kernel -1 # dimensions: k ? # parameters: uploa From 0a4a27e1a4487480410bc0b1bb034bcf97583214 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 6 Jun 2018 19:02:29 -0500 Subject: [PATCH 17/37] Defined/implemented bli_projm(). Details: - Defined a new operation in frame/base/bli_proj.c, bli_projm(), which behaves like bli_copym(), except that operands a and b are allowed to contain data of differing domains (e.g. a is real while b is complex, or vice versa). The file is named bli_proj.c, rather than bli_projm.c, with the intention that a 'v' vector version of the function may be added to the same file (at some point in the future). 
- Added supporting bli_check_*() functions in bli_check.c to confirm consistent precisions between two datatypes/objects, as well as the appropriate error message in bli_error.c and a new error code in bli_type_defs.h. - Wrote a bli_projm_check() function to go along with bli_projm(). - Defined static function bli_obj_real_part() in bli_obj_macro_defs.h, which will initialize an obj_t alias to the real part of the source object. - Fixed a bug in the static function bli_dt_proj_to_complex(), found in bli_param_macro_defs.h. Thankfully, there were no calls to the function to produce buggy behavior. --- frame/base/bli_check.c | 34 +++++++++++ frame/base/bli_check.h | 2 + frame/base/bli_error.c | 2 + frame/base/bli_proj.c | 87 ++++++++++++++++++++++++++++ frame/base/bli_proj.h | 41 +++++++++++++ frame/base/check/bli_proj_check.c | 75 ++++++++++++++++++++++++ frame/base/check/bli_proj_check.h | 39 +++++++++++++ frame/include/bli_obj_macro_defs.h | 26 ++++++++- frame/include/bli_param_macro_defs.h | 2 +- frame/include/bli_type_defs.h | 1 + frame/include/blis.h | 1 + 11 files changed, 306 insertions(+), 4 deletions(-) create mode 100644 frame/base/bli_proj.c create mode 100644 frame/base/bli_proj.h create mode 100644 frame/base/check/bli_proj_check.c create mode 100644 frame/base/check/bli_proj_check.h diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c index 4c9993b94..3e42758ba 100644 --- a/frame/base/bli_check.c +++ b/frame/base/bli_check.c @@ -342,6 +342,40 @@ err_t bli_check_real_valued_object( obj_t* a ) return e_val; } +err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ) +{ + err_t e_val = BLIS_SUCCESS; + + if ( dt_a == BLIS_FLOAT ) + { + if ( dt_b != BLIS_FLOAT && + dt_b != BLIS_SCOMPLEX ) + e_val = BLIS_INCONSISTENT_PRECISIONS; + } + else if ( dt_a == BLIS_DOUBLE ) + { + if ( dt_b != BLIS_DOUBLE && + dt_b != BLIS_DCOMPLEX ) + e_val = BLIS_INCONSISTENT_PRECISIONS; + } + + return e_val; +} + +err_t bli_check_consistent_object_precisions( obj_t*
a, obj_t* b ) +{ + err_t e_val; + num_t dt_a; + num_t dt_b; + + dt_a = bli_obj_dt( a ); + dt_b = bli_obj_dt( b ); + + e_val = bli_check_consistent_precisions( dt_a, dt_b ); + + return e_val; +} + // -- Dimension-related checks ------------------------------------------------- err_t bli_check_conformal_dims( obj_t* a, obj_t* b ) diff --git a/frame/base/bli_check.h b/frame/base/bli_check.h index bd5cd064f..dd76054e7 100644 --- a/frame/base/bli_check.h +++ b/frame/base/bli_check.h @@ -62,6 +62,8 @@ err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); +err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); +err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); diff --git a/frame/base/bli_error.c b/frame/base/bli_error.c index d78c48387..710e34028 100644 --- a/frame/base/bli_error.c +++ b/frame/base/bli_error.c @@ -90,6 +90,8 @@ void bli_error_init_msgs( void ) "Expected second datatype to be real projection of first." ); sprintf( bli_error_string_for_code(BLIS_EXPECTED_REAL_VALUED_OBJECT), "Expected real-valued object (ie: if complex, imaginary component equals zero)." ); + sprintf( bli_error_string_for_code(BLIS_INCONSISTENT_PRECISIONS), + "Expected consistent precisions (both single or both double)." ); sprintf( bli_error_string_for_code(BLIS_NONCONFORMAL_DIMENSIONS), "Encountered non-conformal dimensions between objects." ); diff --git a/frame/base/bli_proj.c b/frame/base/bli_proj.c new file mode 100644 index 000000000..4eb85e07b --- /dev/null +++ b/frame/base/bli_proj.c @@ -0,0 +1,87 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_projm + ( + obj_t* a, + obj_t* b + ) +{ + obj_t* a2; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_projm_check( a, b ); + + if ( ( bli_obj_is_real( a ) && bli_obj_is_real( b ) ) || + ( bli_obj_is_complex( a ) && bli_obj_is_complex( b ) ) ) + { + // If a and b are both real or both complex, we can simply use + // copym. 
+ bli_copym( a, b ); + } + else + { + // This branch handles the case where one operand is real and + // the other is complex. + + if ( bli_obj_is_real( a ) /* && bli_obj_is_complex( b ) */ ) + { + // If a is real and b is complex, we must obtain the real part + // of b so that we can copy a into the real part (after + // initializing all of b, including imaginary components, to + // zero). + + obj_t br; + + bli_obj_real_part( b, &br ); + + bli_setm( &BLIS_ZERO, b ); + bli_copym( a, &br ); + } + else // bli_obj_is_complex( a ) && bli_obj_is_real( b ) + { + // If a is complex and b is real, we can simply copy the + // real part of a into b. + + obj_t ar; + + bli_obj_real_part( a, &ar ); + + bli_copym( &ar, b ); + } + } +} diff --git a/frame/base/bli_proj.h b/frame/base/bli_proj.h new file mode 100644 index 000000000..7df26b023 --- /dev/null +++ b/frame/base/bli_proj.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_proj_check.h" + +void bli_projm + ( + obj_t* a, + obj_t* b + ); diff --git a/frame/base/check/bli_proj_check.c b/frame/base/check/bli_proj_check.c new file mode 100644 index 000000000..aca546102 --- /dev/null +++ b/frame/base/check/bli_proj_check.c @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_projm_check + ( + obj_t* a, + obj_t* b + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_floating_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_precisions( a, b ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_matrix_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_conformal_dims( a, b ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). + + e_val = bli_check_object_buffer( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( b ); + bli_check_error_code( e_val ); +} + diff --git a/frame/base/check/bli_proj_check.h b/frame/base/check/bli_proj_check.h new file mode 100644 index 000000000..7ce2274ad --- /dev/null +++ b/frame/base/check/bli_proj_check.h @@ -0,0 +1,39 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_projm_check + ( + obj_t* a, + obj_t* b + ); diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index cf5aa550a..aaa939ea0 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -909,14 +909,34 @@ static void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) } } -// Make a full alias (shallow copy) +// Make a full alias (shallow copy). 
static void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } -// Check if two objects are aliases of one another +// Alias only the real part. + +static void bli_obj_real_part( obj_t* c, obj_t* r ) +{ + bli_obj_alias_to( c, r ); + + // Change the datatype. + num_t dt_r = bli_obj_dt_proj_to_real( c ); + bli_obj_set_dt( dt_r, r ); + + // Update the element size. + siz_t es_c = bli_obj_elem_size( c ); + bli_obj_set_elem_size( es_c/2, r ); + + // Update the strides. + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); +} + +// Check if two objects are aliases of one another. static bool_t bli_obj_is_alias_of( obj_t* a, obj_t* b ) { @@ -941,7 +961,7 @@ static void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) bli_obj_apply_conj( conja, b ); } -// Initialize object with default properties (info field) +// Initialize object with default properties (info field). static void bli_obj_set_defaults( obj_t* obj ) { diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 09cd90772..ee61b5728 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -119,7 +119,7 @@ static num_t bli_dt_proj_to_real( num_t dt ) static num_t bli_dt_proj_to_complex( num_t dt ) { - return ( dt & BLIS_BITVAL_COMPLEX ); + return ( dt | BLIS_BITVAL_COMPLEX ); } diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index b1a1f55b6..a097ddfc8 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1177,6 +1177,7 @@ typedef enum BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), + BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), diff --git a/frame/include/blis.h b/frame/include/blis.h index 55ab9316a..3e82f40a7 100644 --- 
a/frame/include/blis.h +++ b/frame/include/blis.h @@ -122,6 +122,7 @@ extern "C" { #include "bli_cpuid.h" #include "bli_string.h" #include "bli_setgetij.h" +#include "bli_proj.h" // -- Level-0 operations -- From b5a641e968469805906eb2c971384d12ad1beac5 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 6 Jun 2018 19:05:37 -0500 Subject: [PATCH 18/37] Added char-to-dt and dt-to-char mapping functions. Details: - Defined additional functions in bli_param_map.c: bli_param_map_char_to_blis_dt() bli_param_map_blis_to_char_dt() which will map a char to its corresponding num_t, or vice versa. --- frame/base/bli_param_map.c | 26 ++++++++++++++++++++++++++ frame/base/bli_param_map.h | 2 ++ 2 files changed, 28 insertions(+) diff --git a/frame/base/bli_param_map.c b/frame/base/bli_param_map.c index b50f5010b..a2d90011e 100644 --- a/frame/base/bli_param_map.c +++ b/frame/base/bli_param_map.c @@ -210,6 +210,19 @@ void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ) } } +void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ) +{ + if ( dt == 's' ) *blis_dt = BLIS_FLOAT; + else if ( dt == 'd' ) *blis_dt = BLIS_DOUBLE; + else if ( dt == 'c' ) *blis_dt = BLIS_SCOMPLEX; + else if ( dt == 'z' ) *blis_dt = BLIS_DCOMPLEX; + else if ( dt == 'i' ) *blis_dt = BLIS_INT; + else + { + bli_check_error_code( BLIS_INVALID_DATATYPE ); + } +} + // --- BLIS to BLIS char mappings ---------------------------------------------- @@ -265,3 +278,16 @@ void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ) } } +void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ) +{ + if ( blis_dt == BLIS_FLOAT ) *dt = 's'; + else if ( blis_dt == BLIS_DOUBLE ) *dt = 'd'; + else if ( blis_dt == BLIS_SCOMPLEX ) *dt = 'c'; + else if ( blis_dt == BLIS_DCOMPLEX ) *dt = 'z'; + else if ( blis_dt == BLIS_INT ) *dt = 'i'; + else + { + bli_check_error_code( BLIS_INVALID_DATATYPE ); + } +} + diff --git a/frame/base/bli_param_map.h b/frame/base/bli_param_map.h index 
75738cd62..6cae9ee7b 100644 --- a/frame/base/bli_param_map.h +++ b/frame/base/bli_param_map.h @@ -57,6 +57,7 @@ void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); +void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- @@ -66,4 +67,5 @@ void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); +void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); From 513138b1a1ecebd015580423c779810cae5c67f2 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 7 Jun 2018 12:24:47 -0500 Subject: [PATCH 19/37] Defined/implemented bli_projv(). Details: - Added an implementation for bli_projv() to go along with the implementation of bli_projm() added in 0a4a27e. The only difference between the two is that bli_projv() may only be used on vectors, whereas bli_projm() is general-purpose. - Added a _check() function corresponding to bli_projv(). --- frame/base/bli_proj.c | 52 +++++++++++++++++++++++++++++-- frame/base/bli_proj.h | 6 ++++ frame/base/check/bli_proj_check.c | 39 +++++++++++++++++++++++ frame/base/check/bli_proj_check.h | 6 ++++ 4 files changed, 101 insertions(+), 2 deletions(-) diff --git a/frame/base/bli_proj.c b/frame/base/bli_proj.c index 4eb85e07b..5a09a2c51 100644 --- a/frame/base/bli_proj.c +++ b/frame/base/bli_proj.c @@ -40,8 +40,6 @@ void bli_projm obj_t* b ) { - obj_t* a2; - // Check parameters. 
if ( bli_error_checking_is_enabled() ) bli_projm_check( a, b ); @@ -85,3 +83,53 @@ void bli_projm } } } + +void bli_projv + ( + obj_t* x, + obj_t* y + ) +{ + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_projv_check( x, y ); + + if ( ( bli_obj_is_real( x ) && bli_obj_is_real( y ) ) || + ( bli_obj_is_complex( x ) && bli_obj_is_complex( y ) ) ) + { + // If x and y are both real or both complex, we can simply use + // copyv. + bli_copyv( x, y ); + } + else + { + // This branch handles the case where one operand is real and + // the other is complex. + + if ( bli_obj_is_real( x ) /* && bli_obj_is_complex( y ) */ ) + { + // If x is real and y is complex, we must obtain the real part + // of y so that we can copy x into the real part (after + // initializing all of y, including imaginary components, to + // zero). + + obj_t yr; + + bli_obj_real_part( y, &yr ); + + bli_setv( &BLIS_ZERO, y ); + bli_copyv( x, &yr ); + } + else // bli_obj_is_complex( x ) && bli_obj_is_real( y ) + { + // If x is complex and y is real, we can simply copy the + // real part of x into y. + + obj_t xr; + + bli_obj_real_part( x, &xr ); + + bli_copyv( &xr, y ); + } + } +} diff --git a/frame/base/bli_proj.h b/frame/base/bli_proj.h index 7df26b023..39e02f7be 100644 --- a/frame/base/bli_proj.h +++ b/frame/base/bli_proj.h @@ -39,3 +39,9 @@ void bli_projm obj_t* a, obj_t* b ); + +void bli_projv + ( + obj_t* x, + obj_t* y + ); diff --git a/frame/base/check/bli_proj_check.c b/frame/base/check/bli_proj_check.c index aca546102..f030cc497 100644 --- a/frame/base/check/bli_proj_check.c +++ b/frame/base/check/bli_proj_check.c @@ -73,3 +73,42 @@ void bli_projm_check bli_check_error_code( e_val ); } +void bli_projv_check + ( + obj_t* x, + obj_t* y + ) +{ + err_t e_val; + + // Check object datatypes. 
+ + e_val = bli_check_floating_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( y ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_precisions( x, y ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_vector_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_object( y ); + bli_check_error_code( e_val ); + + e_val = bli_check_equal_vector_lengths( x, y ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). + + e_val = bli_check_object_buffer( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( y ); + bli_check_error_code( e_val ); +} + diff --git a/frame/base/check/bli_proj_check.h b/frame/base/check/bli_proj_check.h index 7ce2274ad..d20aad9d6 100644 --- a/frame/base/check/bli_proj_check.h +++ b/frame/base/check/bli_proj_check.h @@ -37,3 +37,9 @@ void bli_projm_check obj_t* a, obj_t* b ); + +void bli_projv_check + ( + obj_t* x, + obj_t* y + ); From 55b6abdf7458e31df3ad01796d67c2332c776948 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 7 Jun 2018 14:08:12 -0500 Subject: [PATCH 20/37] Enforce consistent datatypes in most object APIs. Details: - Added logic to level-1v, -1d, -1f, -1m, -2, and -3 operations' _check() functions to ensure that all operands are of the same datatype. There are some exceptions that were left out, such as the _check() function for the various norm operations since they have a different idea of datatype consistency (ie: the norm object must be the real projection of the primary input vector/matrix object). 
--- frame/1/bli_l1v_check.c | 25 +++++++++++++++ frame/1d/bli_l1d_check.c | 10 ++++++ frame/1f/bli_l1f_check.c | 52 ++++++++++++++++++++++++++++++ frame/1m/bli_l1m_check.c | 10 ++++++ frame/2/bli_l2_check.c | 68 ++++++++++++++++++++++++++++++++++++++++ frame/3/bli_l3_check.c | 38 ++++++++++++++++++++++ 6 files changed, 203 insertions(+) diff --git a/frame/1/bli_l1v_check.c b/frame/1/bli_l1v_check.c index 54c856b45..7ca5e1291 100644 --- a/frame/1/bli_l1v_check.c +++ b/frame/1/bli_l1v_check.c @@ -203,6 +203,11 @@ void bli_l1v_xy_check e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( x, y ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_vector_object( x ); @@ -243,6 +248,11 @@ void bli_l1v_axy_check e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( x, y ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_scalar_object( alpha ); @@ -289,6 +299,11 @@ void bli_l1v_xby_check e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( x, y ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_scalar_object( beta ); @@ -339,6 +354,11 @@ void bli_l1v_axby_check e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( x, y ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_scalar_object( alpha ); @@ -402,6 +422,11 @@ void bli_l1v_dot_check e_val = bli_check_nonconstant_object( rho ); bli_check_error_code( e_val ); + // Check for consistent datatypes. 
+ + e_val = bli_check_consistent_object_datatypes( x, y ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_scalar_object( alpha ); diff --git a/frame/1d/bli_l1d_check.c b/frame/1d/bli_l1d_check.c index 3846d99ef..118908657 100644 --- a/frame/1d/bli_l1d_check.c +++ b/frame/1d/bli_l1d_check.c @@ -121,6 +121,11 @@ void bli_l1d_xy_check e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( x, y ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_matrix_object( x ); @@ -161,6 +166,11 @@ void bli_l1d_axy_check e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( x, y ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_scalar_object( alpha ); diff --git a/frame/1f/bli_l1f_check.c b/frame/1f/bli_l1f_check.c index a64b9c7db..4ba9dc034 100644 --- a/frame/1f/bli_l1f_check.c +++ b/frame/1f/bli_l1f_check.c @@ -66,6 +66,14 @@ void bli_axpy2v_check e_val = bli_check_floating_object( z ); bli_check_error_code( e_val ); + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( x, y ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( x, z ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_scalar_object( alphax ); @@ -132,6 +140,14 @@ void bli_axpyf_check e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); + // Check object dimensions. 
e_val = bli_check_scalar_object( alpha ); @@ -203,6 +219,17 @@ void bli_dotaxpyv_check e_val = bli_check_floating_object( z ); bli_check_error_code( e_val ); + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( x, xt ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( x, y ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( x, z ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_scalar_object( alpha ); @@ -299,6 +326,23 @@ void bli_dotxaxpyf_check e_val = bli_check_floating_object( z ); bli_check_error_code( e_val ); + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, at ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, w ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, z ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_scalar_object( alpha ); @@ -407,6 +451,14 @@ void bli_dotxf_check e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_scalar_object( alpha ); diff --git a/frame/1m/bli_l1m_check.c b/frame/1m/bli_l1m_check.c index d2ae6c5c4..ad8c409a7 100644 --- a/frame/1m/bli_l1m_check.c +++ b/frame/1m/bli_l1m_check.c @@ -106,6 +106,11 @@ void bli_l1m_xy_check e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); + // Check for consistent datatypes. 
+ + e_val = bli_check_consistent_object_datatypes( x, y ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_matrix_object( x ); @@ -146,6 +151,11 @@ void bli_l1m_axy_check e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( x, y ); + bli_check_error_code( e_val ); + // Check object dimensions. e_val = bli_check_scalar_object( alpha ); diff --git a/frame/2/bli_l2_check.c b/frame/2/bli_l2_check.c index 84dda521f..d8f66ff87 100644 --- a/frame/2/bli_l2_check.c +++ b/frame/2/bli_l2_check.c @@ -53,6 +53,14 @@ void bli_gemv_check e_val = bli_check_general_object( a ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); } @@ -80,6 +88,14 @@ void bli_hemv_check e_val = bli_check_hermitian_object( a ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); } @@ -107,6 +123,14 @@ void bli_symv_check e_val = bli_check_symmetric_object( a ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); } @@ -132,6 +156,11 @@ void bli_trmv_check e_val = bli_check_triangular_object( a ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. 
+ + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); } @@ -157,6 +186,11 @@ void bli_trsv_check e_val = bli_check_triangular_object( a ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); } @@ -178,6 +212,14 @@ void bli_ger_check e_val = bli_check_general_object( a ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); } @@ -203,6 +245,11 @@ void bli_her_check e_val = bli_check_hermitian_object( a ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); } @@ -229,6 +276,14 @@ void bli_her2_check e_val = bli_check_hermitian_object( a ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); } @@ -254,6 +309,11 @@ void bli_syr_check e_val = bli_check_symmetric_object( a ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); } @@ -280,6 +340,14 @@ void bli_syr2_check e_val = bli_check_symmetric_object( a ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. 
+ + e_val = bli_check_consistent_object_datatypes( a, x ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( a, y ); + bli_check_error_code( e_val ); } diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 3dfd9bbf4..1a14ba93f 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -294,6 +294,14 @@ void bli_gemm_basic_check e_val = bli_check_level3_dims( a, b, c ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( c, a ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, b ); + bli_check_error_code( e_val ); } void bli_hemm_basic_check @@ -330,6 +338,14 @@ void bli_hemm_basic_check e_val = bli_check_square_object( a ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( c, a ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, b ); + bli_check_error_code( e_val ); } void bli_herk_basic_check @@ -365,6 +381,14 @@ void bli_herk_basic_check e_val = bli_check_general_object( ah ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. + + e_val = bli_check_consistent_object_datatypes( c, a ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, ah ); + bli_check_error_code( e_val ); } void bli_her2k_basic_check @@ -412,6 +436,20 @@ void bli_her2k_basic_check e_val = bli_check_general_object( ah ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. 
+ + e_val = bli_check_consistent_object_datatypes( c, a ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, ah ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, b ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, bh ); + bli_check_error_code( e_val ); } void bli_l3_basic_check From b65d0b841b7e4357bc2cf743bbb03384a3ab0bfa Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 7 Jun 2018 14:38:41 -0500 Subject: [PATCH 21/37] Fixed bug in bli_dt_proj_to_complex(). Details: - Fixed a bug identical to the one fixed in 0a4a27e, except this time in the bli_obj_param_defs.h header file. It looks like the only consumers of this static function were in bli_l0_oapi.c, and so this may not have been manifesting (yet). --- frame/include/bli_obj_macro_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index aaa939ea0..7a415dedb 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -108,7 +108,7 @@ static num_t bli_obj_dt_proj_to_real( obj_t* obj ) static num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { - return ( bli_obj_dt( obj ) & BLIS_BITVAL_COMPLEX ); + return ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } static num_t bli_obj_target_dt( obj_t* obj ) From 65fae95074d239354737355bbe6f202d4f8b2871 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 7 Jun 2018 17:41:09 -0500 Subject: [PATCH 22/37] Implemented bli_setrm, _setim, _setrv, _setiv. Details: - Defined new wrappers to setm/setv operations in frame/base/bli_setri.c that will target only the real or only the imaginary parts of a matrix/vector object. - Updated bli_obj_real_part() so that the complex-specific portions of the function are not executed if the object is real. - Defined bli_obj_imag_part(). 
- Caveat: If bli_obj_imag_part() is called on a real object, it does nothing, leaving the destination object untouched. The caller must take care to only call the function on complex objects. - Reordered some of the static functions in bli_obj_macro_defs.h related to aliasing. --- frame/base/bli_setri.c | 162 +++++++++++++++++++++++++++++ frame/base/bli_setri.h | 62 +++++++++++ frame/include/bli_obj_macro_defs.h | 137 +++++++++++++++--------- frame/include/blis.h | 1 + 4 files changed, 310 insertions(+), 52 deletions(-) create mode 100644 frame/base/bli_setri.c create mode 100644 frame/base/bli_setri.h diff --git a/frame/base/bli_setri.c b/frame/base/bli_setri.c new file mode 100644 index 000000000..054ea3d9b --- /dev/null +++ b/frame/base/bli_setri.c @@ -0,0 +1,162 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// -- setr --------------------------------------------------------------------- + +void bli_setrm + ( + obj_t* alpha, + obj_t* b + ) +{ + obj_t alpha_real; + obj_t br; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_setm_check( alpha, b ); + + // Initialize a local scalar, alpha_real, using the real projection + // of the datatype of b. + bli_obj_scalar_init_detached( bli_obj_dt_proj_to_real( b ), + &alpha_real ); + + // Copy/typecast alpha to alpha_real. This discards the imaginary + // part of alpha (if it is complex). + bli_copysc( alpha, &alpha_real ); + + // Acquire an alias to the real part of b. + bli_obj_real_part( b, &br ); + + // Use setm to set the real part of b to alpha_real. + bli_setm( &alpha_real, &br ); +} + +void bli_setrv + ( + obj_t* alpha, + obj_t* x + ) +{ + obj_t alpha_real; + obj_t xr; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_setv_check( alpha, x ); + + // Initialize a local scalar, alpha_real, using the real projection + // of the datatype of x. + bli_obj_scalar_init_detached( bli_obj_dt_proj_to_real( x ), + &alpha_real ); + + // Copy/typecast alpha to alpha_real. This discards the imaginary + // part of alpha (if it is complex). + bli_copysc( alpha, &alpha_real ); + + // Acquire an alias to the real part of x. + bli_obj_real_part( x, &xr ); + + // Use setv to set the real part of x to alpha_real. 
+ bli_setv( &alpha_real, &xr ); +} + +// -- seti --------------------------------------------------------------------- + +void bli_setim + ( + obj_t* alpha, + obj_t* b + ) +{ + obj_t alpha_real; + obj_t bi; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_setm_check( alpha, b ); + + // If the object is real, return early. + if ( bli_obj_is_real( b ) ) return; + + // Initialize a local scalar, alpha_real, using the real projection + // of the datatype of b. + bli_obj_scalar_init_detached( bli_obj_dt_proj_to_real( b ), + &alpha_real ); + + // Copy/typecast alpha to alpha_real. This discards the imaginary + // part of alpha (if it is complex). + bli_copysc( alpha, &alpha_real ); + + // Acquire an alias to the imaginary part of b. + bli_obj_imag_part( b, &bi ); + + // Use setm to set the imaginary part of b to alpha_real. + bli_setm( &alpha_real, &bi ); +} + +void bli_setiv + ( + obj_t* alpha, + obj_t* x + ) +{ + obj_t alpha_real; + obj_t xi; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_setv_check( alpha, x ); + + // If the object is real, return early. + if ( bli_obj_is_real( x ) ) return; + + // Initialize a local scalar, alpha_real, using the real projection + // of the datatype of x. + bli_obj_scalar_init_detached( bli_obj_dt_proj_to_real( x ), + &alpha_real ); + + // Copy/typecast alpha to alpha_real. This discards the imaginary + // part of alpha (if it is complex). + bli_copysc( alpha, &alpha_real ); + + // Acquire an alias to the imaginary part of x. + bli_obj_imag_part( x, &xi ); + + // Use setv to set the imaginary part of x to alpha_real. + bli_setv( &alpha_real, &xi ); +} + diff --git a/frame/base/bli_setri.h b/frame/base/bli_setri.h new file mode 100644 index 000000000..a08eeaad1 --- /dev/null +++ b/frame/base/bli_setri.h @@ -0,0 +1,62 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// -- setr --------------------------------------------------------------------- + +void bli_setrm + ( + obj_t* alpha, + obj_t* b + ); + +void bli_setrv + ( + obj_t* alpha, + obj_t* x + ); + +// -- seti --------------------------------------------------------------------- + +void bli_setim + ( + obj_t* alpha, + obj_t* b + ); + +void bli_setiv + ( + obj_t* alpha, + obj_t* x + ); + diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index aaa939ea0..869a70b48 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -909,58 +909,6 @@ static void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) } } -// Make a full alias (shallow copy). - -static void bli_obj_alias_to( obj_t* a, obj_t* b ) -{ - bli_obj_init_full_shallow_copy_of( a, b ); -} - -// Alias only the real part. - -static void bli_obj_real_part( obj_t* c, obj_t* r ) -{ - bli_obj_alias_to( c, r ); - - // Change the datatype. - num_t dt_r = bli_obj_dt_proj_to_real( c ); - bli_obj_set_dt( dt_r, r ); - - // Update the element size. - siz_t es_c = bli_obj_elem_size( c ); - bli_obj_set_elem_size( es_c/2, r ); - - // Update the strides. - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); -} - -// Check if two objects are aliases of one another. - -static bool_t bli_obj_is_alias_of( obj_t* a, obj_t* b ) -{ - return ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); -} - - -// Create an alias with a trans value applied. -// (Note: trans may include a conj component.) - -static void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) -{ - bli_obj_alias_to( a, b ); - bli_obj_apply_trans( trans, b ); -} - -// Create an alias with a conj value applied. - -static void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) -{ - bli_obj_alias_to( a, b ); - bli_obj_apply_conj( conja, b ); -} - // Initialize object with default properties (info field). 
static void bli_obj_set_defaults( obj_t* obj ) @@ -1041,6 +989,91 @@ static void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) ); } +// Make a full alias (shallow copy). + +static void bli_obj_alias_to( obj_t* a, obj_t* b ) +{ + bli_obj_init_full_shallow_copy_of( a, b ); +} + +// Check if two objects are aliases of one another. + +static bool_t bli_obj_is_alias_of( obj_t* a, obj_t* b ) +{ + return ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); +} + + +// Create an alias with a trans value applied. +// (Note: trans may include a conj component.) + +static void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) +{ + bli_obj_alias_to( a, b ); + bli_obj_apply_trans( trans, b ); +} + +// Create an alias with a conj value applied. + +static void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) +{ + bli_obj_alias_to( a, b ); + bli_obj_apply_conj( conja, b ); +} + +// Alias only the real part. + +static void bli_obj_real_part( obj_t* c, obj_t* r ) +{ + bli_obj_alias_to( c, r ); + + if ( bli_obj_is_complex( c ) ) + { + // Change the datatype. + num_t dt_r = bli_obj_dt_proj_to_real( c ); + bli_obj_set_dt( dt_r, r ); + + // Update the element size. + siz_t es_c = bli_obj_elem_size( c ); + bli_obj_set_elem_size( es_c/2, r ); + + // Update the strides. + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); + + // Buffer is left unchanged. + } +} + +// Alias only the imaginary part. + +static void bli_obj_imag_part( obj_t* c, obj_t* i ) +{ + if ( bli_obj_is_complex( c ) ) + { + bli_obj_alias_to( c, i ); + + // Change the datatype. + num_t dt_r = bli_obj_dt_proj_to_real( c ); + bli_obj_set_dt( dt_r, i ); + + // Update the element size. + siz_t es_c = bli_obj_elem_size( c ); + bli_obj_set_elem_size( es_c/2, i ); + + // Update the strides. + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); + + // Update the buffer. 
+ inc_t is_c = bli_obj_imag_stride( c ); + char* p = ( char* )bli_obj_buffer_at_off( c ); + bli_obj_set_buffer( p + is_c * es_c/2, i ); + } +} + // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is diff --git a/frame/include/blis.h b/frame/include/blis.h index 3e82f40a7..25f6f53af 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -123,6 +123,7 @@ extern "C" { #include "bli_string.h" #include "bli_setgetij.h" #include "bli_proj.h" +#include "bli_setri.h" // -- Level-0 operations -- From 262a62e3482c5caa947a89cabb562b5887555bd6 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 8 Jun 2018 12:10:54 -0500 Subject: [PATCH 23/37] Fixed undefined ref in steamroller/excavator configs. Details: - Fixed erroneous calls to bli_cntx_init_piledriver_ref() in bli_cntx_init_steamroller() and bli_cntx_init_excavator(), which should have been to their respectively-named bli_cntx_init_*() functions instead. Thanks to qnerd for bringing these bugs to our attention. --- config/excavator/bli_cntx_init_excavator.c | 2 +- config/steamroller/bli_cntx_init_steamroller.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/excavator/bli_cntx_init_excavator.c b/config/excavator/bli_cntx_init_excavator.c index 065154d27..56d04ef4e 100644 --- a/config/excavator/bli_cntx_init_excavator.c +++ b/config/excavator/bli_cntx_init_excavator.c @@ -39,7 +39,7 @@ void bli_cntx_init_excavator( cntx_t* cntx ) blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. 
- bli_cntx_init_piledriver_ref( cntx ); + bli_cntx_init_excavator_ref( cntx ); // ------------------------------------------------------------------------- diff --git a/config/steamroller/bli_cntx_init_steamroller.c b/config/steamroller/bli_cntx_init_steamroller.c index b1409e4fc..1b6566c5c 100644 --- a/config/steamroller/bli_cntx_init_steamroller.c +++ b/config/steamroller/bli_cntx_init_steamroller.c @@ -39,7 +39,7 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. - bli_cntx_init_piledriver_ref( cntx ); + bli_cntx_init_steamroller_ref( cntx ); // ------------------------------------------------------------------------- From f1908d39767baef56077def69126d96f805ee27e Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 8 Jun 2018 14:22:22 -0500 Subject: [PATCH 24/37] Fixed broken input.operations.fast. Details: - Removed three input lines from input.operations.fast (labeled "test sequential micro-kernel") that I intended to remove in bd02c4e. These lines prevented 'make check' (and 'make checkblis-fast') from completing correctly. Note: This bug was fixed in 3df39b3, but that commit has not yet been merged into master, hence this redundant commit. Thanks to Robert van de Geijn for reporting this issue. --- testsuite/input.operations.fast | 3 --- 1 file changed, 3 deletions(-) diff --git a/testsuite/input.operations.fast b/testsuite/input.operations.fast index d86de6ecc..d2a44276e 100644 --- a/testsuite/input.operations.fast +++ b/testsuite/input.operations.fast @@ -260,15 +260,12 @@ # --- Level-3 micro-kernels ------------------------------------------------ 1 # gemm -1 # test sequential micro-kernel -1 # dimensions: k 1 # trsm -1 # test sequential micro-kernel ? # parameters: uploa 1 # gemmtrsm -1 # test sequential micro-kernel -1 # dimensions: k ? # parameters: uploa From 043d0cd37ef4a27b1901eeb89d40083cfb2a57ba Mon Sep 17 00:00:00 2001 From: "Field G. 
Van Zee" Date: Sat, 9 Jun 2018 13:46:49 -0500 Subject: [PATCH 25/37] Implemented bli_acquire_mpart(), added example code. Details: - Implemented bli_acquire_mpart(), a general-purpose submatrix view function that will alias an obj_t to be a submatrix "view" of an existing obj_t. - Renumbered examples in examples/oapi and inserted a new example file, 03obj_view.c, which shows how to use bli_acquire_mpart() to obtain submatrix views of existing objects, which can then be used to indirectly modify the parent object. --- examples/oapi/{0obj_basic.c => 00obj_basic.c} | 0 .../oapi/{1obj_attach.c => 01obj_attach.c} | 0 examples/oapi/{2obj_ij.c => 02obj_ij.c} | 0 examples/oapi/03obj_view.c | 272 ++++++++++++++++++ examples/oapi/{3level0.c => 04level0.c} | 0 examples/oapi/{4level1v.c => 05level1v.c} | 0 examples/oapi/{5level1m.c => 06level1m.c} | 0 .../{6level1m_diag.c => 07level1m_diag.c} | 0 examples/oapi/{7level2.c => 08level2.c} | 0 examples/oapi/{8level3.c => 09level3.c} | 0 examples/oapi/{9util.c => 10util.c} | 0 examples/oapi/Makefile | 21 +- examples/oapi/README | 2 +- frame/base/bli_part.c | 43 +++ frame/base/bli_part.h | 10 + 15 files changed, 337 insertions(+), 11 deletions(-) rename examples/oapi/{0obj_basic.c => 00obj_basic.c} (100%) rename examples/oapi/{1obj_attach.c => 01obj_attach.c} (100%) rename examples/oapi/{2obj_ij.c => 02obj_ij.c} (100%) create mode 100644 examples/oapi/03obj_view.c rename examples/oapi/{3level0.c => 04level0.c} (100%) rename examples/oapi/{4level1v.c => 05level1v.c} (100%) rename examples/oapi/{5level1m.c => 06level1m.c} (100%) rename examples/oapi/{6level1m_diag.c => 07level1m_diag.c} (100%) rename examples/oapi/{7level2.c => 08level2.c} (100%) rename examples/oapi/{8level3.c => 09level3.c} (100%) rename examples/oapi/{9util.c => 10util.c} (100%) diff --git a/examples/oapi/0obj_basic.c b/examples/oapi/00obj_basic.c similarity index 100% rename from examples/oapi/0obj_basic.c rename to examples/oapi/00obj_basic.c diff --git 
a/examples/oapi/1obj_attach.c b/examples/oapi/01obj_attach.c similarity index 100% rename from examples/oapi/1obj_attach.c rename to examples/oapi/01obj_attach.c diff --git a/examples/oapi/2obj_ij.c b/examples/oapi/02obj_ij.c similarity index 100% rename from examples/oapi/2obj_ij.c rename to examples/oapi/02obj_ij.c diff --git a/examples/oapi/03obj_view.c b/examples/oapi/03obj_view.c new file mode 100644 index 000000000..a3dd4b247 --- /dev/null +++ b/examples/oapi/03obj_view.c @@ -0,0 +1,272 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include <stdio.h> +#include <stdlib.h> +#include "blis.h" + +void init_dmatrix_by_rows( dim_t m, dim_t n, double* a, inc_t rs, inc_t cs ); +void init_dmatrix_by_cols( dim_t m, dim_t n, double* a, inc_t rs, inc_t cs ); +void init_dobj_by_cols( obj_t* a ); +void init_zobj_by_cols( obj_t* a ); + +int main( int argc, char** argv ) +{ + obj_t a1, a2; + obj_t v1, v2, v3, v4, v5; + num_t dt; + dim_t m, n; + inc_t rs, cs; + dim_t i, j; + dim_t mv, nv; + + // + // This file demonstrates creating submatrix views into existing matrices. + // + + // + // Example 1: Create an object and then create a submatrix view. + // + + printf( "\n#\n# -- Example 1 --\n#\n\n" ); + + // We'll use these parameters for the following examples. + dt = BLIS_DOUBLE; + m = 6; n = 7; rs = 1; cs = m; + + // Create an object a1 using bli_obj_create(). + bli_obj_create( dt, m, n, rs, cs, &a1 ); + + // Initialize a1 to contain known values. + init_dobj_by_cols( &a1 ); + + bli_printm( "matrix 'a1' (initial state)", &a1, "%5.1f", "" ); + + // Acquire a 4x3 submatrix view into a1 at (i,j) offsets (1,2). + i = 1; j = 2; mv = 4; nv = 3; + bli_acquire_mpart( i, j, mv, nv, &a1, &v1 ); + + bli_printm( "4x3 submatrix 'v1' at offsets (1,2)", &v1, "%5.1f", "" ); + + // NOTE: Submatrix views should never be passed to bli_obj_free(). It + // will not cause an immediate error, but it is bad practice. Instead, + // you should only release the objects that were created directly via + // bli_obj_create(). 
In the above example, that means only object a1 + // would be passed to bli_obj_free(). + + // + // Example 2: Modify the contents of a submatrix view. + // + + printf( "\n#\n# -- Example 2 --\n#\n\n" ); + + // Modify the first three elements of the first column. + bli_setijm( -3.0, 0.0, 0, 0, &v1 ); + bli_setijm( -4.0, 0.0, 1, 0, &v1 ); + bli_setijm( -5.0, 0.0, 2, 0, &v1 ); + + // Modify the first three elements of the second column. + bli_setijm( -6.0, 0.0, 0, 1, &v1 ); + bli_setijm( -7.0, 0.0, 1, 1, &v1 ); + bli_setijm( -8.0, 0.0, 2, 1, &v1 ); + + // Print the matrix again so we can see the updated elements. + bli_printm( "submatrix view 'v1' (modified state)", &v1, "%5.1f", "" ); + bli_printm( "matrix 'a1' (indirectly modified due to changes to 'v1')", &a1, "%5.1f", "" ); + + // + // Example 3: Create a submatrix view that is "too big". + // + + printf( "\n#\n# -- Example 3 --\n#\n\n" ); + + // bli_acquire_mpart() will safely truncate your requested submatrix + // view dimensions (or even the offsets) if they extend beyond the + // bounds of the parent object. + + bli_printm( "matrix 'a1' (current state)", &a1, "%5.1f", "" ); + + // Acquire a 4x3 submatrix view into a1 at offsets (4,2). Notice how + // the requested view contains four rows, but the view is created with + // only two rows because the starting m offset of 4 leaves only two rows + // left in the parent matrix. + bli_acquire_mpart( 4, 2, 4, 3, &a1, &v2 ); + + bli_printm( "4x3 submatrix 'v2' at offsets (4,2) -- two rows truncated for safety", &v2, "%5.1f", "" ); + + // + // Example 4: Create a bufferless object, attach an external buffer, and + // then create a submatrix view. + // + + printf( "\n#\n# -- Example 4 --\n#\n\n" ); + + // Create an object with known elements using the same approach as the + // previous example file. 
+ double* p1 = malloc( m * n * sizeof( double ) ); + init_dmatrix_by_cols( m, n, p1, rs, cs ); + bli_obj_create_with_attached_buffer( dt, m, n, p1, rs, cs, &a2 ); + + bli_printm( "matrix 'a2' (initial state)", &a2, "%5.1f", "" ); + + // Acquire a 3x4 submatrix view at offset (2,3). + bli_acquire_mpart( 2, 3, 3, 4, &a2, &v3 ); + + bli_printm( "3x4 submatrix view 'v3' at offsets (2,3)", &v3, "%5.1f", "" ); + + // + // Example 5: Use a submatrix view to set a region of a larger matrix to + // zero. + // + + printf( "\n#\n# -- Example 5 --\n#\n\n" ); + + bli_printm( "3x4 submatrix view 'v3' at offsets (2,3)", &v3, "%5.1f", "" ); + + bli_setm( &BLIS_ZERO, &v3 ); + + bli_printm( "3x4 submatrix view 'v3' (zeroed out)", &v3, "%5.1f", "" ); + + bli_printm( "matrix 'a2' (modified state)", &a2, "%5.1f", "" ); + + // + // Example 6: Obtain a submatrix view into a submatrix view. + // + + printf( "\n#\n# -- Example 6 --\n#\n\n" ); + + bli_acquire_mpart( 1, 1, 5, 6, &a2, &v4 ); + + bli_printm( "5x6 submatrix view 'v4' at offsets (1,1) of 'a2'", &v4, "%5.1f", "" ); + + bli_acquire_mpart( 1, 0, 4, 5, &v4, &v5 ); + + bli_printm( "4x5 submatrix view 'v5' at offsets (1,0) of 'v4'", &v5, "%5.1f", "" ); + + + // Free the memory arrays we allocated. + free( p1 ); + + // Free the objects we created. + bli_obj_free( &a1 ); + + return 0; +} + +// ----------------------------------------------------------------------------- + +void init_dmatrix_by_rows( dim_t m, dim_t n, double* a, inc_t rs, inc_t cs ) +{ + dim_t i, j; + + double alpha = 0.0; + + // Step through a matrix by rows, assigning each element a unique + // value, starting at 0. 
+ for ( i = 0; i < m; ++i ) + { + for ( j = 0; j < n; ++j ) + { + double* a_ij = a + i*rs + j*cs; + + *a_ij = alpha; + + alpha += 1.0; + } + } +} + +void init_dmatrix_by_cols( dim_t m, dim_t n, double* a, inc_t rs, inc_t cs ) +{ + dim_t i, j; + + double alpha = 0.0; + + // Step through a matrix by columns, assigning each element a unique + // value, starting at 0. + for ( j = 0; j < n; ++j ) + { + for ( i = 0; i < m; ++i ) + { + double* a_ij = a + i*rs + j*cs; + + *a_ij = alpha; + + alpha += 1.0; + } + } +} + +void init_dobj_by_cols( obj_t* a ) +{ + dim_t m = bli_obj_length( a ); + dim_t n = bli_obj_width( a ); + dim_t i, j; + + double alpha = 0.0; + + // Step through a matrix by columns, assigning each element a unique + // value, starting at 0. + for ( j = 0; j < n; ++j ) + { + for ( i = 0; i < m; ++i ) + { + bli_setijm( alpha, 0.0, i, j, a ); + + alpha += 1.0; + } + } +} + +void init_zobj_by_cols( obj_t* a ) +{ + dim_t m = bli_obj_length( a ); + dim_t n = bli_obj_width( a ); + dim_t i, j; + + double alpha = 0.0; + + // Step through a matrix by columns, assigning each real and imaginary + // element a unique value, starting at 0. 
+ for ( j = 0; j < n; ++j ) + { + for ( i = 0; i < m; ++i ) + { + bli_setijm( alpha, alpha + 1.0, i, j, a ); + + alpha += 2.0; + } + } +} + diff --git a/examples/oapi/3level0.c b/examples/oapi/04level0.c similarity index 100% rename from examples/oapi/3level0.c rename to examples/oapi/04level0.c diff --git a/examples/oapi/4level1v.c b/examples/oapi/05level1v.c similarity index 100% rename from examples/oapi/4level1v.c rename to examples/oapi/05level1v.c diff --git a/examples/oapi/5level1m.c b/examples/oapi/06level1m.c similarity index 100% rename from examples/oapi/5level1m.c rename to examples/oapi/06level1m.c diff --git a/examples/oapi/6level1m_diag.c b/examples/oapi/07level1m_diag.c similarity index 100% rename from examples/oapi/6level1m_diag.c rename to examples/oapi/07level1m_diag.c diff --git a/examples/oapi/7level2.c b/examples/oapi/08level2.c similarity index 100% rename from examples/oapi/7level2.c rename to examples/oapi/08level2.c diff --git a/examples/oapi/8level3.c b/examples/oapi/09level3.c similarity index 100% rename from examples/oapi/8level3.c rename to examples/oapi/09level3.c diff --git a/examples/oapi/9util.c b/examples/oapi/10util.c similarity index 100% rename from examples/oapi/9util.c rename to examples/oapi/10util.c diff --git a/examples/oapi/Makefile b/examples/oapi/Makefile index 905ef6727..08964e479 100644 --- a/examples/oapi/Makefile +++ b/examples/oapi/Makefile @@ -105,16 +105,17 @@ CFLAGS += -I$(TEST_SRC_PATH) LIBBLIS_LINK := $(BUILD_PATH)/$(LIBBLIS_LINK) # Binary executable name. 
-TEST_BINS := 0obj_basic.x \ - 1obj_attach.x \ - 2obj_ij.x \ - 3level0.x \ - 4level1v.x \ - 5level1m.x \ - 6level1m_diag.x \ - 7level2.x \ - 8level3.x \ - 9util.x +TEST_BINS := 00obj_basic.x \ + 01obj_attach.x \ + 02obj_ij.x \ + 03obj_view.x \ + 04level0.x \ + 05level1v.x \ + 06level1m.x \ + 07level1m_diag.x \ + 08level2.x \ + 09level3.x \ + 10util.x diff --git a/examples/oapi/README b/examples/oapi/README index 28cc6d84e..adf7ded9d 100644 --- a/examples/oapi/README +++ b/examples/oapi/README @@ -6,7 +6,7 @@ This directory contains several files, each containing various pieces of example code that demonstrate core functionality of the object API in BLIS. These example files should be thought of collectively like a tutorial, and therefore it is recommended to start from the beginning (the file that -starts in '0'). +starts in '00'). You can build all of the examples by simply running 'make' from this directory. (You can also run 'make clean'.) The makefile assumes that diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index 47fa4fdfd..d05eabb79 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -38,6 +38,49 @@ // -- Matrix partitioning ------------------------------------------------------ +void bli_acquire_mpart + ( + dim_t i, + dim_t j, + dim_t bm, + dim_t bn, + obj_t* parent, + obj_t* child + ) +{ + // Query the dimensions of the parent object. + const dim_t m_par = bli_obj_length( parent ); + const dim_t n_par = bli_obj_width( parent ); + + // If either i or j is already beyond what exists of the parent matrix, + // slide them back to the outer dimensions. (What will happen in this + // scenario is that bm and bn and/or will be reduced to zero so that the + // child matrix does not refer to anything beyond the bounds of the + // parent. (Note: This is a safety measure and generally should never + // be needed if the caller is passing in sane arguments.) 
+ if ( i > m_par ) i = m_par; + if ( j > n_par ) j = n_par; + + // If either bm or bn spills out over the edge of the parent matrix, + // reduce them so that the child matrix fits within the bounds of the + // parent. (Note: This is a safety measure and generally should never + // be needed if the caller is passing in sane arguments, though this + // code is somewhat more likely to be needed than the code above.) + if ( bm > m_par - i ) bm = m_par - i; + if ( bn > n_par - j ) bn = n_par - j; + + // Alias the parent object's contents into the child object. + bli_obj_alias_to( parent, child ); + + // Set the offsets and dimensions of the child object. Note that we + // increment, rather than overwrite, the offsets of the child object + // in case the parent object already had non-zero offsets (usually + // because the parent was itself a child a larger grandparent object). + bli_obj_inc_offs( i, j, child ); + bli_obj_set_dims( bm, bn, child ); +} + + void bli_acquire_mpart_mdim ( dir_t direct, diff --git a/frame/base/bli_part.h b/frame/base/bli_part.h index fd24f1d82..284a87ffa 100644 --- a/frame/base/bli_part.h +++ b/frame/base/bli_part.h @@ -36,6 +36,16 @@ // -- Matrix partitioning ------------------------------------------------------ +void bli_acquire_mpart + ( + dim_t i, + dim_t j, + dim_t m, + dim_t n, + obj_t* obj, + obj_t* sub_obj + ); + #undef GENPROT #define GENPROT( opname ) \ \ From 712de9b371a8727682352a2f52cd4880de905f0b Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 9 Jun 2018 14:36:30 -0500 Subject: [PATCH 26/37] Added missing semicolon in 03obj_view.c Details: - Thanks to Tony Skjellum for pointing out this typo due to a last-minute change to the source prior to committing. 
--- examples/oapi/03obj_view.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/oapi/03obj_view.c b/examples/oapi/03obj_view.c index a3dd4b247..68c9c922f 100644 --- a/examples/oapi/03obj_view.c +++ b/examples/oapi/03obj_view.c @@ -69,7 +69,7 @@ int main( int argc, char** argv ) bli_obj_create( dt, m, n, rs, cs, &a1 ); // Initialize a1 to contain known values. - init_dobj_by_cols( &a1 ) + init_dobj_by_cols( &a1 ); bli_printm( "matrix 'a1' (initial state)", &a1, "%5.1f", "" ); From 2610fff0b07bdb345cb2e334ef6bea0c63c8cead Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 11 Jun 2018 12:32:54 -0500 Subject: [PATCH 27/37] Renamed 1m packm kernels from _1e to _1er. Details: - Renamed the reference packm kernels used by 1m. Previously, they used a _1e suffix, which was confusing since they packed to both 1e and 1r schemas. This was likely an artifact of the time when there were separate kernels for each schema before I decided to combine them into a single function (per datatype and panel dimension), and the 1e functions were the ones to inherit the 1r functionality. The kernels have now been renamed to use a _1er suffix. 
--- ref_kernels/1m/bli_packm_cxk_1er_ref.c | 18 +++++++++--------- ref_kernels/bli_cntx_ref.c | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c index 3c526506d..693fd3c47 100644 --- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c @@ -189,7 +189,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } -INSERT_GENTFUNCCO_BASIC2( packm_2xk_1e, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC2( packm_2xk_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) @@ -364,7 +364,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } -INSERT_GENTFUNCCO_BASIC2( packm_4xk_1e, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC2( packm_4xk_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) @@ -555,7 +555,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } -INSERT_GENTFUNCCO_BASIC2( packm_6xk_1e, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC2( packm_6xk_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) @@ -762,7 +762,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } -INSERT_GENTFUNCCO_BASIC2( packm_8xk_1e, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC2( packm_8xk_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) @@ -985,7 +985,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } -INSERT_GENTFUNCCO_BASIC2( packm_10xk_1e, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC2( packm_10xk_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) @@ -1224,7 +1224,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } -INSERT_GENTFUNCCO_BASIC2( packm_12xk_1e, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC2( packm_12xk_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) @@ -1479,7 +1479,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } -INSERT_GENTFUNCCO_BASIC2( packm_14xk_1e, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC2( packm_14xk_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) @@ -1750,7 +1750,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } 
-INSERT_GENTFUNCCO_BASIC2( packm_16xk_1e, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC2( packm_16xk_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) @@ -2133,5 +2133,5 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } -INSERT_GENTFUNCCO_BASIC2( packm_30xk_1e, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC2( packm_30xk_1er, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index cc5828f4d..81df4f9c4 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -221,23 +221,23 @@ #define packm_30xk_rih_ker_name GENARNAME(packm_30xk_rih) #undef packm_2xk_1er_ker_name -#define packm_2xk_1er_ker_name GENARNAME(packm_2xk_1e) +#define packm_2xk_1er_ker_name GENARNAME(packm_2xk_1er) #undef packm_4xk_1er_ker_name -#define packm_4xk_1er_ker_name GENARNAME(packm_4xk_1e) +#define packm_4xk_1er_ker_name GENARNAME(packm_4xk_1er) #undef packm_6xk_1er_ker_name -#define packm_6xk_1er_ker_name GENARNAME(packm_6xk_1e) +#define packm_6xk_1er_ker_name GENARNAME(packm_6xk_1er) #undef packm_8xk_1er_ker_name -#define packm_8xk_1er_ker_name GENARNAME(packm_8xk_1e) +#define packm_8xk_1er_ker_name GENARNAME(packm_8xk_1er) #undef packm_10xk_1er_ker_name -#define packm_10xk_1er_ker_name GENARNAME(packm_10xk_1e) +#define packm_10xk_1er_ker_name GENARNAME(packm_10xk_1er) #undef packm_12xk_1er_ker_name -#define packm_12xk_1er_ker_name GENARNAME(packm_12xk_1e) +#define packm_12xk_1er_ker_name GENARNAME(packm_12xk_1er) #undef packm_14xk_1er_ker_name -#define packm_14xk_1er_ker_name GENARNAME(packm_14xk_1e) +#define packm_14xk_1er_ker_name GENARNAME(packm_14xk_1er) #undef packm_16xk_1er_ker_name -#define packm_16xk_1er_ker_name GENARNAME(packm_16xk_1e) +#define packm_16xk_1er_ker_name GENARNAME(packm_16xk_1er) #undef packm_30xk_1er_ker_name -#define packm_30xk_1er_ker_name GENARNAME(packm_30xk_1e) +#define packm_30xk_1er_ker_name GENARNAME(packm_30xk_1er) // Include the level-1m kernel API template. 
#include "bli_l1m_ker.h" From 87db5c048e0c7f37351fda486abaf7d19fc5821c Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 12 Jun 2018 19:38:37 -0500 Subject: [PATCH 28/37] Changed usage of virtual microkernel slots in cntx. Details: - Changed the way virtual microkernels are handled in the context. Previously, there were query routines such as bli_cntx_get_l3_ukr_dt() which returned the native ukernel for a datatype if the method was equal to BLIS_NAT, or the virtual ukernel for that datatype if the method was some other value. Going forward, the context native and virtual ukernel slots will both be initialized to native ukernel function pointers for native execution, and for non-native execution the virtual ukernel pointer will be something else. This allows us to always query the virtual ukernel slot (from within, say, the macrokernel) without needing any logic in the query routine to decide which function pointer (native or virtual) to return. (Essentially, the logic has been shifted to init-time instead of compute-time.) This scheme will also allow generalized virtual ukernels as a way to insert extra logic in between the macrokernel and the native microkernel. - Initialize native contexts (in bli_cntx_ref.c) with native ukernel function addresses stored to the virtual ukernel slots pursuant to the above policy change. - Renamed all static functions that were native/virtual-ambiguous, such as bli_cntx_get_l3_ukr_dt() or bli_cntx_l3_ukr_prefers_cols_dt() pursuant to the above policy change. Those routines now use the substring "get_l3_vir_ukr" in their name instead of "get_l3_ukr". All of these functions were static functions defined in bli_cntx.h, and most uses were in level-3 front-ends and macrokernels. - Deprecated anti_pref bool_t in context, along with related functions such as bli_cntx_l3_ukr_eff_dislikes_storage_of(), now that 1m's panel-block execution is disabled.
--- frame/3/bli_l3_ukr_tapi.c | 6 +- frame/3/gemm/bli_gemm_front.c | 2 +- frame/3/gemm/bli_gemm_ker_var2.c | 4 +- frame/3/gemm/ind/bli_gemm4mb_ker_var2.c | 4 +- frame/3/gemm/ind/old/bli_gemm3m2_ker_var2.c | 4 +- frame/3/hemm/bli_hemm_front.c | 2 +- frame/3/her2k/bli_her2k_front.c | 2 +- frame/3/herk/bli_herk_front.c | 2 +- frame/3/herk/bli_herk_l_ker_var2.c | 4 +- frame/3/herk/bli_herk_u_ker_var2.c | 4 +- frame/3/symm/bli_symm_front.c | 2 +- frame/3/syr2k/bli_syr2k_front.c | 2 +- frame/3/syrk/bli_syrk_front.c | 2 +- frame/3/trmm/bli_trmm_front.c | 2 +- frame/3/trmm/bli_trmm_ll_ker_var2.c | 4 +- frame/3/trmm/bli_trmm_lu_ker_var2.c | 4 +- frame/3/trmm/bli_trmm_rl_ker_var2.c | 4 +- frame/3/trmm/bli_trmm_ru_ker_var2.c | 4 +- frame/3/trmm3/bli_trmm3_front.c | 2 +- frame/3/trsm/bli_trsm_ll_ker_var2.c | 6 +- frame/3/trsm/bli_trsm_lu_ker_var2.c | 6 +- frame/3/trsm/bli_trsm_rl_ker_var2.c | 6 +- frame/3/trsm/bli_trsm_ru_ker_var2.c | 6 +- frame/base/bli_cntx.c | 11 ++- frame/base/bli_cntx.h | 95 ++++----------------- frame/base/bli_gks.c | 2 +- frame/include/bli_type_defs.h | 2 - ref_kernels/3/bli_gemmtrsm_ref.c | 4 +- ref_kernels/bli_cntx_ref.c | 20 ++--- ref_kernels/ind/bli_gemm1m_ref.c | 2 +- ref_kernels/ind/bli_gemmtrsm1m_ref.c | 2 +- sandbox/ref99/blx_gemm_front.c | 2 +- sandbox/ref99/vars/blx_gemm_ker_var2.c | 4 +- 33 files changed, 86 insertions(+), 142 deletions(-) diff --git a/frame/3/bli_l3_ukr_tapi.c b/frame/3/bli_l3_ukr_tapi.c index 1836e2f6a..44f557029 100644 --- a/frame/3/bli_l3_ukr_tapi.c +++ b/frame/3/bli_l3_ukr_tapi.c @@ -55,7 +55,7 @@ void PASTEMAC(ch,opname) \ \ /* Query the context for the function address of the current datatype's micro-kernel. */ \ - PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_ukr_dt( dt, kerid, cntx ); \ + PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the typed function for the given datatype. 
*/ \ f( \ @@ -91,7 +91,7 @@ void PASTEMAC(ch,opname) \ \ /* Query the context for the function address of the current datatype's micro-kernel. */ \ - PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_ukr_dt( dt, kerid, cntx ); \ + PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the typed function for the given datatype. */ \ f( \ @@ -129,7 +129,7 @@ void PASTEMAC(ch,opname) \ \ /* Query the context for the function address of the current datatype's micro-kernel. */ \ - PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_ukr_dt( dt, kerid, cntx ); \ + PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the typed function for the given datatype. */ \ f( \ diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 8aae5b476..841c4a1c2 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -77,7 +77,7 @@ void bli_gemm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_ukr_eff_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 2e05deaad..db74118bc 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. 
Note that the strides of this temporary buffer are set so that they match the storage of the @@ -192,7 +192,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c index 7cb809904..8d927e295 100644 --- a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c +++ b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c @@ -163,13 +163,13 @@ void PASTEMAC(ch,varname) \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemm/ind/old/bli_gemm3m2_ker_var2.c b/frame/3/gemm/ind/old/bli_gemm3m2_ker_var2.c index d711fd946..4045fa74c 100644 --- a/frame/3/gemm/ind/old/bli_gemm3m2_ker_var2.c +++ b/frame/3/gemm/ind/old/bli_gemm3m2_ker_var2.c @@ -163,13 +163,13 @@ void PASTEMAC(ch,varname) \ /* Query the context for the micro-kernel address and cast it to its function pointer type. 
*/ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index b12424d63..f53fb888c 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -72,7 +72,7 @@ void bli_hemm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_toggle_conj( &a_local ); diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index 15ee65fad..e8eadc8e2 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -92,7 +92,7 @@ void bli_her2k_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( &a_local, &bh_local ); bli_obj_swap( &b_local, &ah_local ); diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index f6e5b55a3..50ea17b8f 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -77,7 +77,7 @@ void bli_herk_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_toggle_conj( &a_local ); bli_obj_toggle_conj( &ah_local ); diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index 3ee105140..ebc3be486 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -168,7 +168,7 @@ void PASTEMAC(ch,varname) \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -177,7 +177,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index b58f600e5..3d74a0543 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -168,7 +168,7 @@ void PASTEMAC(ch,varname) \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -177,7 +177,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 84263bc9d..ba646ce92 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -72,7 +72,7 @@ void bli_symm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &b_local ); diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 769ca56a0..35231980d 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -81,7 +81,7 @@ void bli_syr2k_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_induce_trans( &c_local ); } diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index 7a66ad68a..819214dfe 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -74,7 +74,7 @@ void bli_syrk_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_induce_trans( &c_local ); } diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 935972442..d6c692126 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -105,7 +105,7 @@ void bli_trmm_front // NOTE: We disable the optimization for 1x1 matrices since the concept // of row- vs. column storage breaks down. 
if ( !bli_obj_is_1x1( &c_local ) ) - if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 8d5c4d0f0..854b9ce5f 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -160,7 +160,7 @@ void PASTEMAC(ch,varname) \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -169,7 +169,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index e54d7d582..9f9036129 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -160,7 +160,7 @@ void PASTEMAC(ch,varname) \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. 
Note that the strides of this temporary buffer are set so that they match the storage of the @@ -169,7 +169,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 919eab1a3..75d2a346e 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -160,7 +160,7 @@ void PASTEMAC(ch,varname) \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -169,7 +169,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index bc4907772..203432b13 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -160,7 +160,7 @@ void PASTEMAC(ch,varname) \ /* Query the context for the micro-kernel address and cast it to its function pointer type. 
*/ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -169,7 +169,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 0f772f0fb..c5e561a0d 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -104,7 +104,7 @@ void bli_trmm3_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index bf7f647de..693a79006 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -162,9 +162,9 @@ void PASTEMAC(ch,varname) \ \ /* Cast the micro-kernel address to its function pointer type. 
*/ \ PASTECH(ch,gemmtrsm_ukr_ft) \ - gemmtrsm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -173,7 +173,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 4b3c4c4b3..0daa91639 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -162,9 +162,9 @@ void PASTEMAC(ch,varname) \ \ /* Cast the micro-kernel address to its function pointer type. */ \ PASTECH(ch,gemmtrsm_ukr_ft) \ - gemmtrsm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. 
Note that the strides of this temporary buffer are set so that they match the storage of the @@ -173,7 +173,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 7f4b93bd3..820142f27 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -167,9 +167,9 @@ void PASTEMAC(ch,varname) \ is transposed so that all kernel instances are of the "left" variety (since those are the only trsm ukernels that exist). */ \ PASTECH(ch,gemmtrsm_ukr_ft) \ - gemmtrsm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index d91b4b0d0..9d9e3a040 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -167,9 +167,9 @@ void PASTEMAC(ch,varname) \ is transposed so that all kernel instances are of the "left" variety (since those are the only trsm ukernels that exist). */ \ PASTECH(ch,gemmtrsm_ukr_ft) \ - gemmtrsm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ + gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 8e2d2c391..662e4585b 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -544,8 +544,10 @@ void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... 
) // -- End variable argument section -- // Query the context for the addresses of: + // - the l3 virtual ukernel func_t array // - the l3 native ukernel func_t array // - the l3 native ukernel preferences array + func_t* cntx_l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* cntx_l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); @@ -565,11 +567,18 @@ void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ) // Index into the func_t and mbool_t for the current kernel id // being processed. + func_t* vukrs = &cntx_l3_vir_ukrs[ ukr_id ]; func_t* ukrs = &cntx_l3_nat_ukrs[ ukr_id ]; mbool_t* prefs = &cntx_l3_nat_ukrs_prefs[ ukr_id ]; // Store the ukernel function pointer and preference values into - // the context. + // the context. Notice that we redundantly store the native + // ukernel address in both the native and virtual ukernel slots + // in the context. This is standard practice when creating a + // native context. (Induced method contexts will overwrite the + // virtual function pointer with the address of the appropriate + // virtual ukernel.) 
+ bli_func_set_dt( ukr_fp, ukr_dt, vukrs ); bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); bli_mbool_set_dt( ukr_pref, ukr_dt, prefs ); } diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 4aaec97c4..14963ba67 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -60,8 +60,6 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; - bool_t anti_pref; - dim_t* thrloop; membrk_t* membrk; @@ -126,10 +124,6 @@ static pack_t bli_cntx_schema_c_panel( cntx_t* cntx ) { return cntx->schema_c_panel; } -static bool_t bli_cntx_anti_pref( cntx_t* cntx ) -{ - return cntx->anti_pref; -} static dim_t* bli_cntx_thrloop( cntx_t* cntx ) { return cntx->thrloop; @@ -166,10 +160,6 @@ static void bli_cntx_set_schema_ab_blockpanel( pack_t sa, pack_t sb, cntx_t* cnt bli_cntx_set_schema_a_block( sa, cntx ); bli_cntx_set_schema_b_panel( sb, cntx ); } -static void bli_cntx_set_anti_pref( bool_t anti_pref, cntx_t* cntx ) -{ - cntx->anti_pref = anti_pref; -} static void bli_cntx_set_membrk( membrk_t* membrk, cntx_t* cntx ) { cntx->membrk = membrk; @@ -234,27 +224,6 @@ static dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) // ----------------------------------------------------------------------------- -static func_t* bli_cntx_get_l3_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) -{ - func_t* funcs; - - if ( bli_cntx_method( (cntx) ) != BLIS_NAT ) - funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); - else - funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); - - func_t* func = &funcs[ ukr_id ]; - - return func; -} - -static void* bli_cntx_get_l3_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l3_ukrs( ukr_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); @@ -487,55 +456,43 @@ static bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_i return !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, 
cntx ); } -static bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); - - // If the anti-preference is set, negate the result. - if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; - - return r_val; -} - -static bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx ); - - // If the anti-preference is set, negate the result. - if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; - - return r_val; -} - // ----------------------------------------------------------------------------- -static bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) +static bool_t bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. + // NOTE: This projection to real domain becomes unnecessary if you + // set the exec_dt for 1m to the real projection of the storage + // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } -static bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) +static bool_t bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. + // NOTE: This projection to real domain becomes unnecessary if you + // set the exec_dt for 1m to the real projection of the storage + // datatype. 
if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } -static bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) +static bool_t bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { - const num_t dt = bli_obj_dt( obj ); + // Note that we use the execution datatype, which may differ from the + // storage datatype of C (though this would happen in very few situations). + const num_t dt = bli_obj_exec_dt( obj ); const bool_t ukr_prefers_rows - = bli_cntx_l3_ukr_prefers_rows_dt( dt, ukr_id, cntx ); + = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool_t ukr_prefers_cols - = bli_cntx_l3_ukr_prefers_cols_dt( dt, ukr_id, cntx ); + = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool_t r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; @@ -544,29 +501,9 @@ static bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cn return r_val; } -static bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) +static bool_t bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { - return !bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx ); -} - -static bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx ); - - // If the anti-preference is set, negate the result. - if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; - - return r_val; -} - -static bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); - - // If the anti-preference is set, negate the result. 
- if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; - - return r_val; + return !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 884d655b4..02b20fb32 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -584,7 +584,7 @@ char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ) // then query the ukernel function pointer for the given datatype from // that context. cntx_t* cntx = bli_gks_query_ind_cntx( method, dt ); - void* fp = bli_cntx_get_l3_ukr_dt( dt, ukr, cntx ); + void* fp = bli_cntx_get_l3_vir_ukr_dt( dt, ukr, cntx ); // Check whether the ukernel function pointer is NULL for the given // datatype. If it is NULL, return the string for not applicable. diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index a097ddfc8..a844e1971 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1128,8 +1128,6 @@ typedef struct cntx_s pack_t schema_b_panel; pack_t schema_c_panel; - bool_t anti_pref; - dim_t thrloop[ BLIS_NUM_LOOPS ]; membrk_t* membrk; diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c index 3657a2092..9ce0ead42 100644 --- a/ref_kernels/3/bli_gemmtrsm_ref.c +++ b/ref_kernels/3/bli_gemmtrsm_ref.c @@ -60,9 +60,9 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* minus_one = PASTEMAC(ch,m1); \ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ PASTECH(ch,trsm_ukr_ft) \ - trsm_ukr = bli_cntx_get_l3_ukr_dt( dt, trsmkerid, cntx ); \ + trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \ \ /* lower: b11 = alpha * b11 - a10 * b01; */ \ /* upper: b11 = alpha * b11 - a12 * b21; */ \ diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 81df4f9c4..f8b72fc15 100644 --- 
a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -363,11 +363,11 @@ void GENBARNAME(cntx_init) funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); - gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm1m_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm1m_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm1m_u_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ], trsm1m_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ], trsm1m_u_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name ); + gen_func_init( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name ); + gen_func_init( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name ); // -- Set level-3 native micro-kernels and preferences --------------------- @@ -467,7 +467,7 @@ void GENBARNAME(cntx_init) bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx ); bli_cntx_set_schema_c_panel( BLIS_NOT_PACKED, cntx ); - bli_cntx_set_anti_pref( FALSE, cntx ); + //bli_cntx_set_anti_pref( FALSE, cntx ); bli_cntx_set_thrloop( 1, 1, 1, 1, 1, cntx ); @@ -726,7 +726,7 @@ void GENBAINAME(cntx_init) // Initialize the blocksizes according to the micro-kernel preference as // well as the algorithm. - if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) { // This branch is used for algorithms 1m_c_bp, 1m_r_pb. @@ -754,7 +754,7 @@ void GENBAINAME(cntx_init) cntx ); } - else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) + else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) { // This branch is used for algorithms 1m_r_bp, 1m_c_pb. 
@@ -811,7 +811,7 @@ void GENBAINAME(cntx_init) } else if ( method == BLIS_1M ) { - const bool_t is_pb = FALSE; + //const bool_t is_pb = FALSE; // Set the anti-preference field to TRUE when executing a panel-block // algorithm, and FALSE otherwise. This will cause higher-level generic @@ -819,7 +819,7 @@ void GENBAINAME(cntx_init) // the micro-kernel output preference so that the two will come back into // agreement in the panel-block macro-kernel (which implemented in terms // of the block-panel macro-kernel with some induced transpositions). - bli_cntx_set_anti_pref( is_pb, cntx ); + //bli_cntx_set_anti_pref( is_pb, cntx ); } else // if ( method == BLIS_NAT ) { diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c index 53b51f303..e8cd01175 100644 --- a/ref_kernels/ind/bli_gemm1m_ref.c +++ b/ref_kernels/ind/bli_gemm1m_ref.c @@ -54,7 +54,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ PASTECH(chr,gemm_ukr_ft) \ rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const bool_t row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c index 5782d79aa..1a88f7eec 100644 --- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c +++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c @@ -59,7 +59,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTECH(ch,trsm_ukr_ft) \ ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ diff --git 
a/sandbox/ref99/blx_gemm_front.c b/sandbox/ref99/blx_gemm_front.c index 13615109b..ba5863ad5 100644 --- a/sandbox/ref99/blx_gemm_front.c +++ b/sandbox/ref99/blx_gemm_front.c @@ -87,7 +87,7 @@ void blx_gemm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_ukr_eff_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/sandbox/ref99/vars/blx_gemm_ker_var2.c b/sandbox/ref99/vars/blx_gemm_ker_var2.c index 1942f987c..6a291c8c7 100644 --- a/sandbox/ref99/vars/blx_gemm_ker_var2.c +++ b/sandbox/ref99/vars/blx_gemm_ker_var2.c @@ -171,7 +171,7 @@ void PASTECH2(blx_,ch,varname) \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -180,7 +180,7 @@ void PASTECH2(blx_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ From f88c2e7a539e383297e846e6d4647058dd3db128 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 13 Jun 2018 18:27:46 -0500 Subject: [PATCH 29/37] Defined static function bli_blksz_scale_def_max(). 
Details: - Added a new static function to bli_blksz.h that scales both the default (regular) blocksize as well as the maximum blocksize in the blksz_t object. Reminder: maximum blocksizes have different meanings in different contexts. For register blocksizes, they refer to the packing register blocksizes (PACKMR or PACKNR) while for cache blocksizes, they refer to the maximum blocksize to use during the final iteration of a loop. --- frame/base/bli_blksz.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index 5be816775..31c24d93b 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -172,6 +172,18 @@ static void bli_blksz_scale_max bli_blksz_set_max( ( val * num ) / den, dt, b ); } +static void bli_blksz_scale_def_max + ( + dim_t num, + dim_t den, + num_t dt, + blksz_t* b + ) +{ + bli_blksz_scale_def( num, den, dt, b ); + bli_blksz_scale_max( num, den, dt, b ); +} + // ----------------------------------------------------------------------------- blksz_t* bli_blksz_create_ed From 1b5d0424d2c7e5eac33e02359c12917ef280949f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 13 Jun 2018 18:41:32 -0500 Subject: [PATCH 30/37] Prototype column-preferential zen gemm ukernels. Details: - Added prototypes to bli_kernels_zen.h for each of the four gemm microkernels that prefer outputting to column storage. 
--- kernels/zen/bli_kernels_zen.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index 2c829127f..6c3fdefb2 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -84,6 +84,12 @@ GEMM_UKR_PROT( double, d, gemm_zen_asm_6x8 ) GEMM_UKR_PROT( scomplex, c, gemm_zen_asm_3x8 ) GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_3x4 ) +// gemm (asm d8x6) +GEMM_UKR_PROT( float, s, gemm_zen_asm_16x6 ) +GEMM_UKR_PROT( double, d, gemm_zen_asm_8x6 ) +GEMM_UKR_PROT( scomplex, c, gemm_zen_asm_8x3 ) +GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_4x3 ) + // gemmtrsm_l (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_zen_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_zen_asm_6x8 ) From 22594e8e9ab55f5bc0e69d96a23e128502849999 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 14 Jun 2018 17:35:23 -0500 Subject: [PATCH 31/37] Updated sandbox/ref99 according to f97a86f. Details: - Applied changes to ref99 sandbox analogous to those applied to framework code in f97a86f. This involves setting the pack schemas of A and B objects temporarily to communicate those desired schemas to the control tree creation function in blx_gemm_cntl.c. This allows us to (henceforth) query the schemas from the control tree rather than the context.
--- sandbox/ref99/blx_gemm_front.c | 21 +++++++++++++++++++++ sandbox/ref99/cntl/blx_gemm_cntl.c | 14 +++++++++----- sandbox/ref99/cntl/blx_gemm_cntl.h | 8 ++++++-- sandbox/ref99/cntl/blx_l3_cntl_if.c | 17 ++++++++++++++++- 4 files changed, 52 insertions(+), 8 deletions(-) diff --git a/sandbox/ref99/blx_gemm_front.c b/sandbox/ref99/blx_gemm_front.c index ba5863ad5..c2ac1ccf7 100644 --- a/sandbox/ref99/blx_gemm_front.c +++ b/sandbox/ref99/blx_gemm_front.c @@ -96,6 +96,27 @@ void blx_gemm_front bli_obj_induce_trans( &c_local ); } + { + // A sort of hack for communicating the desired pack schemas for A and + // B to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, + // particularly in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } + } + // Record the threading for each level within the context.
bli_cntx_set_thrloop_from_env ( diff --git a/sandbox/ref99/cntl/blx_gemm_cntl.c b/sandbox/ref99/cntl/blx_gemm_cntl.c index 169161d54..4f499e614 100644 --- a/sandbox/ref99/cntl/blx_gemm_cntl.c +++ b/sandbox/ref99/cntl/blx_gemm_cntl.c @@ -37,17 +37,21 @@ cntl_t* blx_gemm_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ) { - return blx_gemmbp_cntl_create( family ); + return blx_gemmbp_cntl_create( family, schema_a, schema_b ); } // ----------------------------------------------------------------------------- cntl_t* blx_gemmbp_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ) { void* macro_kernel_p = blx_gemm_ker_var2; @@ -79,7 +83,7 @@ cntl_t* blx_gemmbp_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, + schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, gemm_cntl_bp_bu ); @@ -103,7 +107,7 @@ cntl_t* blx_gemmbp_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? 
- BLIS_PACKED_COL_PANELS, + schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, gemm_cntl_op_bp ); diff --git a/sandbox/ref99/cntl/blx_gemm_cntl.h b/sandbox/ref99/cntl/blx_gemm_cntl.h index 637ead73e..59d7589a4 100644 --- a/sandbox/ref99/cntl/blx_gemm_cntl.h +++ b/sandbox/ref99/cntl/blx_gemm_cntl.h @@ -34,14 +34,18 @@ cntl_t* blx_gemm_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ); // ----------------------------------------------------------------------------- cntl_t* blx_gemmbp_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ); // ----------------------------------------------------------------------------- diff --git a/sandbox/ref99/cntl/blx_l3_cntl_if.c b/sandbox/ref99/cntl/blx_l3_cntl_if.c index 264bfb930..2eddb4360 100644 --- a/sandbox/ref99/cntl/blx_l3_cntl_if.c +++ b/sandbox/ref99/cntl/blx_l3_cntl_if.c @@ -46,11 +46,26 @@ void blx_l3_cntl_create_if cntl_t** cntl_use ) { + // This is part of a hack to support mixed domain in bli_gemm_front(). + // Sometimes we need to specify a non-standard schema for A and B, and + // we decided to transmit them via the schema field in the obj_t's + // rather than pass them in as function parameters. Once the values + // have been read, we immediately reset them back to their expected + // values for unpacked objects. Notice that we do this even if the + // caller passed in a custom control tree; that's because we still need + // to reset the pack schema of a and b, which were modified by the + // operation's _front() function. + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); + // If the control tree pointer is NULL, we construct a default // tree as a function of the operation family. 
if ( cntl_orig == NULL ) { - *cntl_use = blx_gemm_cntl_create( family ); + *cntl_use = blx_gemm_cntl_create( family, schema_a, schema_b ); } else { From ed20392c500940bfc0947795c1ff7c8c24f8e26f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 15 Jun 2018 16:31:22 -0500 Subject: [PATCH 32/37] Added get/set static funcs for exec dt/dom/prec. Details: - Added functions to bli_obj_macro_defs.h to get and set the execution domain and execution precision bits in the obj_t. - Added/rearranged a few functions in bli_obj_macro_defs.h. - Renamed some macros in bli_type_defs.h: EXECUTION -> EXEC. --- frame/include/bli_obj_macro_defs.h | 49 ++++++++++++++++++++++-------- frame/include/bli_type_defs.h | 12 +++++--- 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 18ac0757a..c5b19d578 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -81,6 +81,21 @@ static objbits_t bli_obj_domain( obj_t* obj ) return ( obj->info & BLIS_DOMAIN_BIT ); } +static objbits_t bli_obj_prec( obj_t* obj ) +{ + return ( obj->info & BLIS_PRECISION_BIT ); +} + +static bool_t bli_obj_is_single_prec( obj_t* obj ) +{ + return ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); +} + +static bool_t bli_obj_is_double_prec( obj_t* obj ) +{ + return ( bli_obj_prec( obj ) == BLIS_BITVAL_DOUBLE_PREC ); +} + static bool_t bli_obj_is_real( obj_t* obj ) { return ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL ); @@ -91,16 +106,6 @@ static bool_t bli_obj_is_complex( obj_t* obj ) return ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX ); } -static objbits_t bli_obj_prec( obj_t* obj ) -{ - return ( obj->info & BLIS_PRECISION_BIT ); -} - -static bool_t bli_obj_is_double_prec( obj_t* obj ) -{ - return ( bli_obj_prec( obj ) == BLIS_BITVAL_DOUBLE_PREC ); -} - static num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); @@ -118,7 +123,17 @@ 
static num_t bli_obj_target_dt( obj_t* obj ) static num_t bli_obj_exec_dt( obj_t* obj ) { - return ( ( obj->info & BLIS_EXECUTION_DT_BITS ) >> BLIS_EXECUTION_DT_SHIFT ); + return ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); +} + +static dom_t bli_obj_exec_domain( obj_t* obj ) +{ + return ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); +} + +static prec_t bli_obj_exec_prec( obj_t* obj ) +{ + return ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } static trans_t bli_obj_conjtrans_status( obj_t* obj ) @@ -328,7 +343,17 @@ static void bli_obj_set_target_dt( num_t dt, obj_t* obj ) static void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { - obj->info = ( obj->info & ~BLIS_EXECUTION_DT_BITS ) | ( dt << BLIS_EXECUTION_DT_SHIFT ); + obj->info = ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ); +} + +static void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) +{ + obj->info = ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DOMAIN_SHIFT ); +} + +static void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) +{ + obj->info = ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_PREC_SHIFT ); } static void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index a844e1971..fe0d84a51 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -210,11 +210,11 @@ typedef dcomplex f77_dcomplex; 12 ~ 10 Target numerical datatype - 10: domain (0 == real, 1 == complex) - 11: precision (0 == single, 1 == double) - - 12: unused + - 12: used to encode integer, constant types 15 ~ 13 Execution numerical datatype - 13: domain (0 == real, 1 == complex) - 14: precision (0 == single, 1 == double) - - 15: unused + - 15: used to encode integer, constant types 22 ~ 16 Packed type/status - 0 0000 00: not packed - 1 0000 00: packed (unspecified; by rows, columns, or vector) @@ -271,7 +271,9 @@ typedef dcomplex f77_dcomplex; 
#define BLIS_UNIT_DIAG_SHIFT 8 #define BLIS_INVERT_DIAG_SHIFT 9 #define BLIS_TARGET_DT_SHIFT 10 -#define BLIS_EXECUTION_DT_SHIFT 13 +#define BLIS_EXEC_DT_SHIFT 13 +#define BLIS_EXEC_DOMAIN_SHIFT 13 +#define BLIS_EXEC_PREC_SHIFT 14 #define BLIS_PACK_SCHEMA_SHIFT 16 #define BLIS_PACK_RC_SHIFT 16 #define BLIS_PACK_PANEL_SHIFT 17 @@ -299,7 +301,9 @@ typedef dcomplex f77_dcomplex; #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) -#define BLIS_EXECUTION_DT_BITS ( 0x7 << BLIS_EXECUTION_DT_SHIFT ) +#define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) +#define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) +#define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) From e88a5b8da8c26caebd2b0fb73b30836fb5417c9c Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 18 Jun 2018 15:56:26 -0500 Subject: [PATCH 33/37] Implemented castm, castv operations. Details: - Implemented castm and castv operations, which behave like copym and copyv except where the obj_t operands can be of different datatypes. These new operations, however, unlike copym/copyv, do not build upon existing level-1v kernels. - Reorganized projm, projv into a 'proj' subdirectory of frame/base (to match the newly added frame/base/cast directory). - Added new macros to bli_gentfunc_macro_defs.h, _gentprot_macro_defs.h that insert GENTFUNC2/GENTPROT2 macros for all non-homogeneous datatype combinations. Previously, one had to invoke two additional macros--one which mixed domains only and another that included all remaining cases--in order to get full type combination coverage. 
- Defined a new static function, bli_set_dims_incs_2m(), to aid in the setting of various variables in the implementations of bli_??castm(). This static function joins others like it in bli_param_macro_defs.h. - Comment update to bli_copysc.h. --- frame/0/copysc/bli_copysc.h | 2 +- frame/base/cast/bli_castm.c | 267 ++++++++++++++++++ frame/base/cast/bli_castm.h | 73 +++++ frame/base/cast/bli_castv.c | 211 ++++++++++++++ frame/base/cast/bli_castv.h | 72 +++++ frame/base/cast/old/bli_cast_check.c | 118 ++++++++ .../{bli_proj.h => cast/old/bli_cast_check.h} | 6 +- frame/base/{bli_proj.c => proj/bli_projm.c} | 66 ++--- frame/base/proj/bli_projm.h | 46 +++ frame/base/proj/bli_projv.c | 127 +++++++++ frame/base/proj/bli_projv.h | 46 +++ .../base/{check => proj/old}/bli_proj_check.c | 0 .../base/{check => proj/old}/bli_proj_check.h | 0 frame/include/bli_gentfunc_macro_defs.h | 45 +++ frame/include/bli_gentprot_macro_defs.h | 44 +++ frame/include/bli_param_macro_defs.h | 37 ++- frame/include/blis.h | 6 +- 17 files changed, 1122 insertions(+), 44 deletions(-) create mode 100644 frame/base/cast/bli_castm.c create mode 100644 frame/base/cast/bli_castm.h create mode 100644 frame/base/cast/bli_castv.c create mode 100644 frame/base/cast/bli_castv.h create mode 100644 frame/base/cast/old/bli_cast_check.c rename frame/base/{bli_proj.h => cast/old/bli_cast_check.h} (96%) rename frame/base/{bli_proj.c => proj/bli_projm.c} (72%) create mode 100644 frame/base/proj/bli_projm.h create mode 100644 frame/base/proj/bli_projv.c create mode 100644 frame/base/proj/bli_projv.h rename frame/base/{check => proj/old}/bli_proj_check.c (100%) rename frame/base/{check => proj/old}/bli_proj_check.h (100%) diff --git a/frame/0/copysc/bli_copysc.h b/frame/0/copysc/bli_copysc.h index dbddb5079..8022f4889 100644 --- a/frame/0/copysc/bli_copysc.h +++ b/frame/0/copysc/bli_copysc.h @@ -49,7 +49,7 @@ GENFRONT( copysc ) // -// Define BLAS-like interfaces with heterogeneous-typed operands. 
+// Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 diff --git a/frame/base/cast/bli_castm.c b/frame/base/cast/bli_castm.c new file mode 100644 index 000000000..84d4c8ca6 --- /dev/null +++ b/frame/base/cast/bli_castm.c @@ -0,0 +1,267 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +// NOTE: This is one of the few functions in BLIS that is defined +// with heterogeneous type support. This is done so that we have +// an operation that can be used to typecast (copy-cast) a matrix +// of one datatype to a matrix of another datatype. + +typedef void (*FUNCPTR_T) + ( + trans_t transa, + dim_t m, + dim_t n, + void* restrict a, inc_t rs_a, inc_t cs_a, + void* restrict b, inc_t rs_b, inc_t cs_b + ); + +static FUNCPTR_T GENARRAY2_ALL(ftypes,castm); + +// +// Define object-based interface. +// + +void bli_castm + ( + obj_t* a, + obj_t* b + ) +{ + num_t dt_a = bli_obj_dt( a ); + num_t dt_b = bli_obj_dt( b ); + + trans_t transa = bli_obj_conjtrans_status( a ); + + dim_t m = bli_obj_length( b ); + dim_t n = bli_obj_width( b ); + + void* buf_a = bli_obj_buffer_at_off( a ); + inc_t rs_a = bli_obj_row_stride( a ); + inc_t cs_a = bli_obj_col_stride( a ); + + void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b = bli_obj_row_stride( b ); + inc_t cs_b = bli_obj_col_stride( b ); + + FUNCPTR_T f; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_castm_check( a, b ); + +#if 0 + if ( bli_obj_dt( a ) == bli_obj_dt( b ) ) + { + // If a and b share the same datatype, we can simply use copym. + bli_copym( a, b ); + return; + } +#endif + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_a][dt_b]; + + // Invoke the void pointer-based function. + f + ( + transa, + m, + n, + buf_a, rs_a, cs_a, + buf_b, rs_b, cs_b + ); +} + +// ----------------------------------------------------------------------------- + +// +// Define BLAS-like interfaces with typed operands.
+// + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_a, ctype_b, cha, chb, opname ) \ +\ +void PASTEMAC2(cha,chb,opname) \ + ( \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict b, inc_t rs_b, inc_t cs_b \ + ) \ +{ \ + ctype_a* restrict a_cast = a; \ + ctype_b* restrict b_cast = b; \ + conj_t conja; \ + dim_t n_iter; \ + dim_t n_elem; \ + inc_t lda, inca; \ + inc_t ldb, incb; \ + dim_t j, i; \ +\ + /* Set various loop parameters. */ \ + bli_set_dims_incs_2m \ + ( \ + transa, \ + m, n, rs_a, cs_a, rs_b, cs_b, \ + &n_elem, &n_iter, &inca, &lda, &incb, &ldb \ + ); \ +\ + /* Extract the conjugation component from the transa parameter. */ \ + conja = bli_extract_conj( transa ); \ +\ + if ( bli_is_conj( conja ) ) \ + { \ + if ( inca == 1 && incb == 1 ) \ + { \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ +\ + for ( i = 0; i < n_elem; ++i ) \ + { \ + PASTEMAC2(cha,chb,copyjs)( a1[i], b1[i] ); \ + } \ + } \ + } \ + else \ + { \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ +\ + for ( i = 0; i < n_elem; ++i ) \ + { \ + PASTEMAC2(cha,chb,copyjs)( *a1, *b1 ); \ +\ + a1 += inca; \ + b1 += incb; \ + } \ + } \ + } \ + } \ + else \ + { \ + if ( inca == 1 && incb == 1 ) \ + { \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ +\ + for ( i = 0; i < n_elem; ++i ) \ + { \ + PASTEMAC2(cha,chb,copys)( a1[i], b1[i] ); \ + } \ + } \ + } \ + else \ + { \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ +\ + for ( i = 0; i < n_elem; ++i ) \ + { \ + PASTEMAC2(cha,chb,copys)( *a1, *b1 ); \ +\ + a1 += 
inca; \ + b1 += incb; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC2_BASIC0( castm ) +INSERT_GENTFUNC2_MIXDP0( castm ) + +// ----------------------------------------------------------------------------- + +// +// Define object-based _check() function. +// + +void bli_castm_check + ( + obj_t* a, + obj_t* b + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_floating_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( b ); + bli_check_error_code( e_val ); + + // Check structure. + // NOTE: We enforce general structure for now in order to simplify the + // implementation. + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_matrix_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_conformal_dims( a, b ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). + + e_val = bli_check_object_buffer( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( b ); + bli_check_error_code( e_val ); +} + diff --git a/frame/base/cast/bli_castm.h b/frame/base/cast/bli_castm.h new file mode 100644 index 000000000..353f25f33 --- /dev/null +++ b/frame/base/cast/bli_castm.h @@ -0,0 +1,73 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// +// Prototype object-based interface. +// + +void bli_castm + ( + obj_t* a, + obj_t* b + ); + +// +// Prototype BLAS-like interfaces with heterogeneous-typed operands. +// + +#undef GENTPROT2 +#define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ +\ +void PASTEMAC2(cha,chb,opname) \ + ( \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b \ + ); + +INSERT_GENTPROT2_BASIC0( castm ) +INSERT_GENTPROT2_MIXDP0( castm ) + +// +// Prototype object-based _check() function. 
+// + +void bli_castm_check + ( + obj_t* a, + obj_t* b + ); + diff --git a/frame/base/cast/bli_castv.c b/frame/base/cast/bli_castv.c new file mode 100644 index 000000000..e6af84f3b --- /dev/null +++ b/frame/base/cast/bli_castv.c @@ -0,0 +1,211 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +// NOTE: This is one of the few functions in BLIS that is defined +// with heterogeneous type support. This is done so that we have +// an operation that can be used to typecast (copy-cast) a vector +// of one datatype to a vector of another datatype. + +typedef void (*FUNCPTR_T) + ( + conj_t conjx, + dim_t n, + void* restrict x, inc_t inc_x, + void* restrict y, inc_t inc_y + ); + +static FUNCPTR_T GENARRAY2_ALL(ftypes,castv); + +// +// Define object-based interface. +// + +void bli_castv + ( + obj_t* x, + obj_t* y + ) +{ + num_t dt_x = bli_obj_dt( x ); + num_t dt_y = bli_obj_dt( y ); + + conj_t conjx = bli_obj_conj_status( x ); + + dim_t n = bli_obj_vector_dim( x ); + + void* buf_x = bli_obj_buffer_at_off( x ); + inc_t inc_x = bli_obj_vector_inc( x ); + + void* buf_y = bli_obj_buffer_at_off( y ); + inc_t inc_y = bli_obj_vector_inc( y ); + + FUNCPTR_T f; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_castv_check( x, y ); + +#if 0 + if ( bli_obj_dt( x ) == bli_obj_dt( y ) ) + { + // If x and y share the same datatype, we can simply use copyv. + bli_copyv( x, y ); + return; + } +#endif + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_x][dt_y]; + + // Invoke the void pointer-based function. + f + ( + conjx, + n, + buf_x, inc_x, + buf_y, inc_y + ); +} + +// ----------------------------------------------------------------------------- + +// +// Define BLAS-like interfaces with typed operands.
+// + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \ +\ +void PASTEMAC2(chx,chy,opname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + void* restrict x, inc_t incx, \ + void* restrict y, inc_t incy \ + ) \ +{ \ + ctype_x* restrict x1 = x; \ + ctype_y* restrict y1 = y; \ + dim_t i; \ +\ + if ( bli_is_conj( conjx ) ) \ + { \ + if ( incx == 1 && incy == 1 ) \ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + PASTEMAC2(chx,chy,copyjs)( x1[i], y1[i] ); \ + } \ + } \ + else \ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + PASTEMAC2(chx,chy,copyjs)( *x1, *y1 ); \ +\ + x1 += incx; \ + y1 += incy; \ + } \ + } \ + } \ + else \ + { \ + if ( incx == 1 && incy == 1 ) \ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + PASTEMAC2(chx,chy,copys)( x1[i], y1[i] ); \ + } \ + } \ + else \ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + PASTEMAC2(chx,chy,copys)( *x1, *y1 ); \ +\ + x1 += incx; \ + y1 += incy; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC2_BASIC0( castv ) +INSERT_GENTFUNC2_MIXDP0( castv ) + +// ----------------------------------------------------------------------------- + +// +// Define object-based _check() function. +// + +void bli_castv_check + ( + obj_t* x, + obj_t* y + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_floating_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( y ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_vector_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_object( y ); + bli_check_error_code( e_val ); + + e_val = bli_check_equal_vector_lengths( x, y ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). 
+ + e_val = bli_check_object_buffer( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( y ); + bli_check_error_code( e_val ); +} + diff --git a/frame/base/cast/bli_castv.h b/frame/base/cast/bli_castv.h new file mode 100644 index 000000000..1e1175184 --- /dev/null +++ b/frame/base/cast/bli_castv.h @@ -0,0 +1,72 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// +// Prototype object-based interface. +// + +void bli_castv + ( + obj_t* x, + obj_t* y + ); + +// +// Prototype BLAS-like interfaces with heterogeneous-typed operands. +// + +#undef GENTPROT2 +#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ +\ +void PASTEMAC2(chx,chy,opname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + void* x, inc_t incx, \ + void* y, inc_t incy \ + ); + +INSERT_GENTPROT2_BASIC0( castv ) +INSERT_GENTPROT2_MIXDP0( castv ) + +// +// Prototype object-based _check() function. +// + +void bli_castv_check + ( + obj_t* x, + obj_t* y + ); + diff --git a/frame/base/cast/old/bli_cast_check.c b/frame/base/cast/old/bli_cast_check.c new file mode 100644 index 000000000..3d2ea0b6f --- /dev/null +++ b/frame/base/cast/old/bli_cast_check.c @@ -0,0 +1,118 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_castm_check + ( + obj_t* a, + obj_t* b + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_floating_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( b ); + bli_check_error_code( e_val ); + + // Check structure. + // NOTE: We enforce general structure for now in order to simplify the + // implementation. + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_matrix_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_conformal_dims( a, b ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). + + e_val = bli_check_object_buffer( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( b ); + bli_check_error_code( e_val ); +} + +void bli_castv_check + ( + obj_t* x, + obj_t* y + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_floating_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( y ); + bli_check_error_code( e_val ); + + // Check object dimensions.
+ + e_val = bli_check_vector_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_object( y ); + bli_check_error_code( e_val ); + + e_val = bli_check_equal_vector_lengths( x, y ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). + + e_val = bli_check_object_buffer( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( y ); + bli_check_error_code( e_val ); +} + diff --git a/frame/base/bli_proj.h b/frame/base/cast/old/bli_cast_check.h similarity index 96% rename from frame/base/bli_proj.h rename to frame/base/cast/old/bli_cast_check.h index 39e02f7be..eb3356b8e 100644 --- a/frame/base/bli_proj.h +++ b/frame/base/cast/old/bli_cast_check.h @@ -32,15 +32,13 @@ */ -#include "bli_proj_check.h" - -void bli_projm +void bli_castm_check ( obj_t* a, obj_t* b ); -void bli_projv +void bli_castv_check ( obj_t* x, obj_t* y diff --git a/frame/base/bli_proj.c b/frame/base/proj/bli_projm.c similarity index 72% rename from frame/base/bli_proj.c rename to frame/base/proj/bli_projm.c index 5a09a2c51..e208a79f3 100644 --- a/frame/base/bli_proj.c +++ b/frame/base/proj/bli_projm.c @@ -84,52 +84,44 @@ void bli_projm } } -void bli_projv +// ----------------------------------------------------------------------------- + +void bli_projm_check ( - obj_t* x, - obj_t* y + obj_t* a, + obj_t* b ) { - // Check parameters. - if ( bli_error_checking_is_enabled() ) - bli_projv_check( x, y ); + err_t e_val; - if ( ( bli_obj_is_real( x ) && bli_obj_is_real( y ) ) || - ( bli_obj_is_complex( x ) && bli_obj_is_complex( y ) ) ) - { - // If x and y are both real or both complex, we can simply use - // copyv. - bli_copyv( x, y ); - } - else - { - // This branch handles the case where one operand is real and - // the other is complex. + // Check object datatypes. 
- if ( bli_obj_is_real( x ) /* && bli_obj_is_complex( y ) */ ) - { - // If x is real and y is complex, we must obtain the real part - // of y so that we can copy x into the real part (after - // initializing all of y, including imaginary components, to - // zero). + e_val = bli_check_floating_object( a ); + bli_check_error_code( e_val ); - obj_t yr; + e_val = bli_check_floating_object( b ); + bli_check_error_code( e_val ); - bli_obj_real_part( y, &yr ); + e_val = bli_check_consistent_object_precisions( a, b ); + bli_check_error_code( e_val ); - bli_setv( &BLIS_ZERO, y ); - bli_copyv( x, &yr ); - } - else // bli_obj_is_complex( x ) && bli_obj_is_real( y ) - { - // If x is complex and y is real, we can simply copy the - // real part of x into y. + // Check object dimensions. - obj_t xr; + e_val = bli_check_matrix_object( a ); + bli_check_error_code( e_val ); - bli_obj_real_part( x, &xr ); + e_val = bli_check_matrix_object( b ); + bli_check_error_code( e_val ); - bli_copyv( &xr, y ); - } - } + e_val = bli_check_conformal_dims( a, b ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). + + e_val = bli_check_object_buffer( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( b ); + bli_check_error_code( e_val ); } + diff --git a/frame/base/proj/bli_projm.h b/frame/base/proj/bli_projm.h new file mode 100644 index 000000000..154b67ed2 --- /dev/null +++ b/frame/base/proj/bli_projm.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_projm + ( + obj_t* a, + obj_t* b + ); + +void bli_projm_check + ( + obj_t* a, + obj_t* b + ); + diff --git a/frame/base/proj/bli_projv.c b/frame/base/proj/bli_projv.c new file mode 100644 index 000000000..fcad4d890 --- /dev/null +++ b/frame/base/proj/bli_projv.c @@ -0,0 +1,127 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_projv + ( + obj_t* x, + obj_t* y + ) +{ + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_projv_check( x, y ); + + if ( ( bli_obj_is_real( x ) && bli_obj_is_real( y ) ) || + ( bli_obj_is_complex( x ) && bli_obj_is_complex( y ) ) ) + { + // If x and y are both real or both complex, we can simply use + // copyv. + bli_copyv( x, y ); + } + else + { + // This branch handles the case where one operand is real and + // the other is complex. + + if ( bli_obj_is_real( x ) /* && bli_obj_is_complex( y ) */ ) + { + // If x is real and y is complex, we must obtain the real part + // of y so that we can copy x into the real part (after + // initializing all of y, including imaginary components, to + // zero). 
+ + obj_t yr; + + bli_obj_real_part( y, &yr ); + + bli_setv( &BLIS_ZERO, y ); + bli_copyv( x, &yr ); + } + else // bli_obj_is_complex( x ) && bli_obj_is_real( y ) + { + // If x is complex and y is real, we can simply copy the + // real part of x into y. + + obj_t xr; + + bli_obj_real_part( x, &xr ); + + bli_copyv( &xr, y ); + } + } +} + +// ----------------------------------------------------------------------------- + +void bli_projv_check + ( + obj_t* x, + obj_t* y + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_floating_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( y ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_precisions( x, y ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_vector_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_object( y ); + bli_check_error_code( e_val ); + + e_val = bli_check_equal_vector_lengths( x, y ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). + + e_val = bli_check_object_buffer( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( y ); + bli_check_error_code( e_val ); +} + diff --git a/frame/base/proj/bli_projv.h b/frame/base/proj/bli_projv.h new file mode 100644 index 000000000..8b504a685 --- /dev/null +++ b/frame/base/proj/bli_projv.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +void bli_projv + ( + obj_t* x, + obj_t* y + ); + +void bli_projv_check + ( + obj_t* x, + obj_t* y + ); + diff --git a/frame/base/check/bli_proj_check.c b/frame/base/proj/old/bli_proj_check.c similarity index 100% rename from frame/base/check/bli_proj_check.c rename to frame/base/proj/old/bli_proj_check.c diff --git a/frame/base/check/bli_proj_check.h b/frame/base/proj/old/bli_proj_check.h similarity index 100% rename from frame/base/check/bli_proj_check.h rename to frame/base/proj/old/bli_proj_check.h diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h index 6d980b1fa..a52904cc4 100644 --- a/frame/include/bli_gentfunc_macro_defs.h +++ b/frame/include/bli_gentfunc_macro_defs.h @@ -402,6 +402,51 @@ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) \ +// -- Mixed domain/precision (all) two-operand macro -- + +// -- (no auxiliary arguments) -- + +#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \ +\ +GENTFUNC2( float, double, s, d, tfuncname ) \ +GENTFUNC2( float, scomplex, s, c, tfuncname ) \ +GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ +\ +GENTFUNC2( double, float, d, s, tfuncname ) \ +GENTFUNC2( double, scomplex, d, c, tfuncname ) \ +GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ +\ +GENTFUNC2( scomplex, float, c, s, tfuncname ) \ +GENTFUNC2( scomplex, double, c, d, tfuncname ) \ +GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ +\ +GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ +GENTFUNC2( dcomplex, double, z, d, tfuncname ) \ +GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) + + +// -- (one auxiliary argument) -- + +#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \ +\ +GENTFUNC2( float, double, s, d, tfuncname, varname ) \ +GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ +GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ +\ +GENTFUNC2( double, float, d, s, tfuncname, varname ) \ +GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ +GENTFUNC2( double, dcomplex, d, z, 
tfuncname, varname ) \ +\ +GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ +GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ +GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ +\ +GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ +GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \ +GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) + + + // -- Basic two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- diff --git a/frame/include/bli_gentprot_macro_defs.h b/frame/include/bli_gentprot_macro_defs.h index 5d685cc56..e0ba84ff9 100644 --- a/frame/include/bli_gentprot_macro_defs.h +++ b/frame/include/bli_gentprot_macro_defs.h @@ -395,6 +395,50 @@ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) \ +// -- Mixed domain/precision (all) two-operand macro -- + +// -- (no auxiliary arguments) -- + +#define INSERT_GENTPROT2_MIXDP0( funcname ) \ +\ +GENTPROT2( float, double, s, d, funcname ) \ +GENTPROT2( float, scomplex, s, c, funcname ) \ +GENTPROT2( float, dcomplex, s, z, funcname ) \ +\ +GENTPROT2( double, float, d, s, funcname ) \ +GENTPROT2( double, scomplex, d, c, funcname ) \ +GENTPROT2( double, dcomplex, d, z, funcname ) \ +\ +GENTPROT2( scomplex, float, c, s, funcname ) \ +GENTPROT2( scomplex, double, c, d, funcname ) \ +GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ +\ +GENTPROT2( dcomplex, float, z, s, funcname ) \ +GENTPROT2( dcomplex, double, z, d, funcname ) \ +GENTPROT2( dcomplex, scomplex, z, c, funcname ) + +// -- (one auxiliary argument) -- + +#define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \ +\ +GENTPROT2( float, double, s, d, tfuncname, varname ) \ +GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ +GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ +\ +GENTPROT2( double, float, d, s, tfuncname, varname ) \ +GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ +GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ +\ +GENTPROT2( scomplex, 
float, c, s, tfuncname, varname ) \ +GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ +GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ +\ +GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ +GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \ +GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) + + + // -- Basic two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index ee61b5728..5584f8ed0 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -990,6 +990,41 @@ void bli_set_dims_incs_uplo_1m_noswap } } +// Set dimensions and increments for TWO matrix arguments. + +static +void bli_set_dims_incs_2m + ( + trans_t transa, + dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, + inc_t rs_b, inc_t cs_b, + dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, + inc_t* incb, inc_t* ldb + ) +{ + { + *n_iter = n; + *n_elem = m; + *inca = rs_a; + *lda = cs_a; + *incb = rs_b; + *ldb = cs_b; + + if ( bli_does_trans( transa ) ) + { + bli_swap_incs( inca, lda ); + } + + if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && + bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) + { + bli_swap_dims( n_iter, n_elem ); + bli_swap_incs( inca, lda ); + bli_swap_incs( incb, ldb ); + } + } +} + // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. 
@@ -1033,7 +1068,7 @@ void bli_set_dims_incs_uplo_2m if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; - n_iter_max_ = n; + n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; diff --git a/frame/include/blis.h b/frame/include/blis.h index 25f6f53af..8e1803f4b 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -122,9 +122,13 @@ extern "C" { #include "bli_cpuid.h" #include "bli_string.h" #include "bli_setgetij.h" -#include "bli_proj.h" #include "bli_setri.h" +#include "bli_castm.h" +#include "bli_castv.h" +#include "bli_projm.h" +#include "bli_projv.h" + // -- Level-0 operations -- From f317c2e31bfc329cb6bb4e06005e45b9c8a9d6a7 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 19 Jun 2018 12:21:23 -0500 Subject: [PATCH 34/37] Added get/set static funcs for exec dt/dom/prec. Details: - Added functions to bli_obj_macro_defs.h to get and set the target domain and target precision bits in the obj_t, and also added the appropriate support in bli_type_defs.h. 
--- frame/include/bli_obj_macro_defs.h | 20 ++++++++++++++++++++ frame/include/bli_type_defs.h | 4 ++++ 2 files changed, 24 insertions(+) diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index c5b19d578..60176d98c 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -121,6 +121,16 @@ static num_t bli_obj_target_dt( obj_t* obj ) return ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } +static dom_t bli_obj_target_domain( obj_t* obj ) +{ + return ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); +} + +static prec_t bli_obj_target_prec( obj_t* obj ) +{ + return ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); +} + static num_t bli_obj_exec_dt( obj_t* obj ) { return ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); @@ -341,6 +351,16 @@ static void bli_obj_set_target_dt( num_t dt, obj_t* obj ) obj->info = ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ); } +static void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) +{ + obj->info = ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DOMAIN_SHIFT ); +} + +static void bli_obj_set_target_prec( prec_t dt, obj_t* obj ) +{ + obj->info = ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_PREC_SHIFT ); +} + static void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ); diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index fe0d84a51..2d400518d 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -271,6 +271,8 @@ typedef dcomplex f77_dcomplex; #define BLIS_UNIT_DIAG_SHIFT 8 #define BLIS_INVERT_DIAG_SHIFT 9 #define BLIS_TARGET_DT_SHIFT 10 +#define BLIS_TARGET_DOMAIN_SHIFT 10 +#define BLIS_TARGET_PREC_SHIFT 11 #define BLIS_EXEC_DT_SHIFT 13 #define BLIS_EXEC_DOMAIN_SHIFT 13 #define BLIS_EXEC_PREC_SHIFT 14 @@ -301,6 +303,8 @@ 
typedef dcomplex f77_dcomplex; #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) +#define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) +#define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) From d4a22702c7a90273dc14f271db465c2e11e5b87e Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 19 Jun 2018 14:54:57 -0500 Subject: [PATCH 35/37] Set up haswell config for optional col-pref ukrs. Details: - Added two presently-disabled cpp blocks in bli_cntx_init_haswell.c to easily allow one to switch to a set of column-preferential gemm microkernels (in the haswell subconfiguration). The second column- preferring block sets the register blocksizes to their appropriate values. However, cache blocksizes are left unchanged, and therefore are likely suboptimal. This should be addressed later. 
--- config/haswell/bli_cntx_init_haswell.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index 2823277a9..6d794430c 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -49,10 +49,17 @@ void bli_cntx_init_haswell( cntx_t* cntx ) ( 8, // gemm +#if 1 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_zen_asm_6x16, TRUE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen_asm_6x8, TRUE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_zen_asm_3x8, TRUE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen_asm_3x4, TRUE, +#else + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_zen_asm_16x6, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen_asm_8x6, FALSE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_zen_asm_8x3, FALSE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_zen_asm_4x3, FALSE, +#endif // gemmtrsm_l BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_zen_asm_6x16, TRUE, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_zen_asm_6x8, TRUE, @@ -108,8 +115,13 @@ void bli_cntx_init_haswell( cntx_t* cntx ) // Initialize level-3 blocksize objects with architecture-specific values. // s d c z +#if 1 bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); +#else + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 6, 6, 3, 3 ); +#endif bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); From 5f7fbb7115b1bf532c169dfd9adef84c41a95031 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 19 Jun 2018 15:38:55 -0500 Subject: [PATCH 36/37] Static funcs for projecting dt to single/double. Details: - Added static functions for projecting a datatype to single precision or double precision, both for obj_t's storage datatypes and standalone datatypes. 
--- frame/include/bli_obj_macro_defs.h | 10 ++++++++++ frame/include/bli_param_macro_defs.h | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 60176d98c..bf1259a57 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -96,6 +96,16 @@ static bool_t bli_obj_is_double_prec( obj_t* obj ) return ( bli_obj_prec( obj ) == BLIS_BITVAL_DOUBLE_PREC ); } +static num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) +{ + return ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); +} + +static num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) +{ + return ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); +} + static bool_t bli_obj_is_real( obj_t* obj ) { return ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL ); diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 5584f8ed0..e8343d804 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -122,6 +122,16 @@ static num_t bli_dt_proj_to_complex( num_t dt ) return ( dt | BLIS_BITVAL_COMPLEX ); } +static num_t bli_dt_proj_to_single_prec( num_t dt ) +{ + return ( dt & ~BLIS_BITVAL_SINGLE_PREC ); +} + +static num_t bli_dt_proj_to_double_prec( num_t dt ) +{ + return ( dt | BLIS_BITVAL_DOUBLE_PREC ); +} + // trans From 17928b1c9941aa58aef1f122c793e2b14e705267 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 19 Jun 2018 17:59:03 -0500 Subject: [PATCH 37/37] Added static funcs bli_dt_domain(), bli_dt_prec(). Details: - Added definitions of static functions bli_dt_domain()/bli_dt_prec(), which extract a dom_t domain or prec_t precision value, respectively, from a num_t datatype. - Changed the return types of bli_obj_domain() and bli_obj_prec() from objbits_t to dom_t and prec_t. (Not sure why they were ever set to return objbits_t.) 
--- frame/include/bli_obj_macro_defs.h | 4 ++-- frame/include/bli_param_macro_defs.h | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index bf1259a57..a09fdfaae 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -76,12 +76,12 @@ static bool_t bli_obj_is_const( obj_t* obj ) return ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } -static objbits_t bli_obj_domain( obj_t* obj ) +static dom_t bli_obj_domain( obj_t* obj ) { return ( obj->info & BLIS_DOMAIN_BIT ); } -static objbits_t bli_obj_prec( obj_t* obj ) +static prec_t bli_obj_prec( obj_t* obj ) { return ( obj->info & BLIS_PRECISION_BIT ); } diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index e8343d804..b49f17c6a 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -112,6 +112,16 @@ static bool_t bli_is_double_prec( num_t dt ) bli_is_dcomplex( dt ) ); } +static dom_t bli_dt_domain( num_t dt ) +{ + return ( dt & BLIS_DOMAIN_BIT ); +} + +static prec_t bli_dt_prec( num_t dt ) +{ + return ( dt & BLIS_PRECISION_BIT ); +} + static num_t bli_dt_proj_to_real( num_t dt ) { return ( dt & ~BLIS_BITVAL_COMPLEX );