Rewrote reference kernels to use #pragma omp simd.

Details:
- Rewrote level-1v, -1f, and -3 reference kernels in terms of simplified
  indexing annotated by the #pragma omp simd directive, which a compiler
  can use to vectorize certain constant-bounded loops. (The new kernels
  actually use _Pragma("omp simd") since the kernels are defined via
  templatizing macros.) Modest speedup was observed in most cases using
  gcc 5.4.0, which may improve with newer versions. Thanks to Devin
  Matthews for suggesting this via issues #286 and #259.
- Updated default blocksizes defined in ref_kernels/bli_cntx_ref.c to
  be 4x16, 4x8, 4x8, and 4x4 for single, double, scomplex and dcomplex,
  respectively, with a default row preference for the gemm ukernel. Also
  updated axpyf, dotxf, and dotxaxpyf fusing factors to 8, 6, and 4,
  respectively, for all datatypes.
- Modified configure to verify that -fopenmp-simd is a valid compiler
  option (via a new detect/omp_simd/omp_simd_detect.c file).
- Added a new header in which prefetch macros are defined according to
  which compiler is detected (via macros such as __GNUC__). These
  prefetch macros are not yet employed anywhere, though.
- Updated the year in copyrights of template license headers in
  build/templates and removed AMD as a default copyright holder.
This commit is contained in:
Field G. Van Zee
2019-01-24 17:23:18 -06:00
parent 63de2b0090
commit bdd46f9ee8
36 changed files with 1769 additions and 400 deletions

View File

@@ -112,6 +112,9 @@ DEBUG_TYPE := @debug_type@
# The requested threading model.
THREADING_MODEL := @threading_model@
# Whether the compiler supports "#pragma omp simd" via the -fopenmp-simd option.
PRAGMA_OMP_SIMD := @pragma_omp_simd@
# The install libdir, includedir, and shareddir values from configure tell
# us where to install the libraries, header files, and public makefile
# fragments, respectively. Notice that we support the use of DESTDIR so that

View File

@@ -0,0 +1,35 @@
#include <stdio.h>
#include <string.h>
/* Number of elements in each of the two test vectors. */
#define ARRAY_LEN 4096

/* File-scope operands of the scaled-add loop in main(). */
double x[ ARRAY_LEN ];
double y[ ARRAY_LEN ];

/*
   Small probe program containing a loop annotated with "#pragma omp simd".
   It fills x with ones and y with zeros, performs y := y + alpha * x under
   the pragma, and then returns 0 unconditionally. Nothing is printed and
   the command-line arguments are not inspected.
*/
int main( int argc, char **argv )
{
	const double alpha = 2.1;

	/* Give both vectors well-defined contents before they are read. */
	for ( int j = 0; j < ARRAY_LEN; ++j )
	{
		x[ j ] = 1.0;
		y[ j ] = 0.0;
	}

	/* The construct being probed: a simple loop preceded by the
	   directive form of the pragma. */
	#pragma omp simd
	for ( int k = 0; k < ARRAY_LEN; ++k )
	{
		y[ k ] = y[ k ] + alpha * x[ k ];
	}

	/* Disabled variant that spells the same annotation with the _Pragma
	   operator instead of the #pragma directive. */
#if 0
	_Pragma( "omp simd" )
	for ( int i = 0; i < ARRAY_LEN; ++i )
	{
		x[ i ] += alpha * y[ i ];
	}
#endif

	return 0;
}

View File

@@ -4,8 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2017, Advanced Micro Devices, Inc.
Copyright (C) 2019, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,8 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2017, Advanced Micro Devices, Inc.
Copyright (C) 2019, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,8 +4,7 @@
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2017, Advanced Micro Devices, Inc.
# Copyright (C) 2019, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are

View File

@@ -124,6 +124,7 @@ get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
$(call load-var-for,CRVECFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(COMPSIMDFLAGS) \
-DBLIS_CNAME=$(1) \
$(BUILD_FLAGS) \
)
@@ -635,7 +636,6 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CXXLANGFLAGS,$(c))
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPPROCFLAGS,$(c))))
# --- Threading flags ---
ifeq ($(CC_VENDOR),gcc)
@@ -680,6 +680,14 @@ LDFLAGS += $(LIBPTHREAD)
endif
endif
# --- #pragma omp simd flags (used for reference kernels only) ---
ifeq ($(PRAGMA_OMP_SIMD),yes)
COMPSIMDFLAGS := -fopenmp-simd
else
COMPSIMDFLAGS :=
endif
#
@@ -948,7 +956,8 @@ BLIS_CONFIG_H := ./bli_config.h
VERS_DEF := -DBLIS_VERSION_STRING=\"$(VERSION)\"
# Define a C preprocessor flag that is *only* defined when BLIS is being
# compiled.
# compiled. (In other words, an application that #includes blis.h will not
# get this cpp macro.)
BUILD_FLAGS := -DBLIS_IS_BUILDING_LIBRARY

39
configure vendored
View File

@@ -1081,6 +1081,36 @@ has_libmemkind()
echo "${rval}"
}
has_pragma_omp_simd()
{
	# Echoes 'yes' if ${found_cc} successfully compiles the omp-simd probe
	# source with -fopenmp-simd, and 'no' otherwise. Reads the globals
	# ${found_cc} and ${dist_path}; leaves no files behind.
	local main_c main_c_filepath binname rval

	# Name of the scratch executable produced by the probe build.
	binname="omp_simd-detect.x"

	# Locate the probe source file somewhere beneath the build directory.
	main_c="omp_simd_detect.c"
	main_c_filepath=$(find ${dist_path}/build -name "${main_c}")

	# Try to build the probe, discarding compiler diagnostics. The exit
	# status of the compiler alone determines the value of rval.
	if ${found_cc} -std=c99 -O3 -march=native -fopenmp-simd \
	               -o ${binname} ${main_c_filepath} 2> /dev/null
	then
		rval='yes'
	else
		rval='no'
	fi

	# Clean up the scratch executable (a no-op if the build failed).
	rm -f ./${binname}

	echo "${rval}"
}
echoerr()
{
printf "${script_name}: error: %s\n" "$*" #>&2;
@@ -2423,6 +2453,9 @@ main()
# --without-memkind.
has_memkind=$(has_libmemkind)
# Try to determine whether the chosen compiler supports #pragma omp simd.
pragma_omp_simd=$(has_pragma_omp_simd)
# -- Prepare variables for substitution into template files ----------------
@@ -2633,6 +2666,11 @@ main()
enable_memkind="no"
enable_memkind_01=0
fi
if [ "x${pragma_omp_simd}" = "xyes" ]; then
echo "${script_name}: compiler appears to support #pragma omp simd."
else
echo "${script_name}: compiler appears to not support #pragma omp simd."
fi
if [ "x${enable_blas}" = "xyes" ]; then
echo "${script_name}: the BLAS compatibility layer is enabled."
enable_blas_01=1
@@ -2842,6 +2880,7 @@ main()
| sed -e "s/@enable_blas@/${enable_blas}/g" \
| sed -e "s/@enable_cblas@/${enable_cblas}/g" \
| sed -e "s/@enable_memkind@/${enable_memkind}/g" \
| sed -e "s/@pragma_omp_simd@/${pragma_omp_simd}/g" \
| sed -e "s/@sandbox@/${sandbox}/g" \
> "${config_mk_out_path}"

View File

@@ -49,6 +49,10 @@
#define BLIS_CNAME_INFIX PASTECH(_,BLIS_CNAME)
#endif
// Combine the CNAME and _ref for convenience to the code that defines
// reference kernels.
//#define BLIS_CNAME_REF_SUFFIX PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX)
// -- Prototype-generating macro definitions -----------------------------------
// Prototype-generating macro for bli_cntx_init_<arch>*() functions.

View File

@@ -0,0 +1,59 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_BUILTIN_MACRO_DEFS_H
#define BLIS_BUILTIN_MACRO_DEFS_H

/*
   bli_prefetch( addr, rw, loc )

   Compiler-dependent prefetch hint. Under gcc the three arguments are
   forwarded unchanged to __builtin_prefetch(); for icc, clang, and any
   compiler not recognized below the macro expands to nothing.

   The expansion deliberately carries no trailing semicolon, so the macro
   behaves like an ordinary expression statement on every compiler:

     if ( cond ) bli_prefetch( p, 0, 3 ); else ...

   NOTE(review): per the GCC documentation rw and loc must be compile-time
   constants (rw: 0 = read, 1 = write; loc: 0..3 temporal locality) --
   confirm before using other values.
*/
#if defined(__ICC) || defined(__INTEL_COMPILER)

  // icc
  #define bli_prefetch( addr, rw, loc )

#elif defined(__clang__)

  // clang
  #define bli_prefetch( addr, rw, loc )

#elif defined(__GNUC__)

  // gcc
  #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc )

#else

  // Unrecognized compiler: make the hint a no-op rather than leaving the
  // macro undefined.
  #define bli_prefetch( addr, rw, loc )

#endif

#endif

View File

@@ -128,6 +128,15 @@
#define PASTEMAC3_(ch1,ch2,ch3,op) bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op) PASTEMAC3_(ch1,ch2,ch3,op)
#define PASTEMAC4_(ch1,ch2,ch3,ch4,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op) PASTEMAC4_(ch1,ch2,ch3,ch4,op)
#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op) PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)
#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op) PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)
#define PASTEBLACHK_(op) bla_ ## op ## _check
#define PASTEBLACHK(op) PASTEBLACHK_(op)
@@ -163,6 +172,7 @@
#include "bli_scalar_macro_defs.h"
#include "bli_error_macro_defs.h"
#include "bli_blas_macro_defs.h"
#include "bli_builtin_macro_defs.h"
#include "bli_oapi_macro_defs.h"
#include "bli_tapi_macro_defs.h"

View File

@@ -45,36 +45,46 @@
#include <errno.h>
#include <ctype.h>
// Determine the compiler (hopefully) and define conveniently named macros
// accordingly.
#if defined(__ICC) || defined(__INTEL_COMPILER)
#define BLIS_ICC
#elif defined(__clang__)
#define BLIS_CLANG
#elif defined(__GNUC__)
#define BLIS_GCC
#endif
// Determine if we are on a 64-bit or 32-bit architecture.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
defined(_ARCH_PPC64)
#define BLIS_ARCH_64
#define BLIS_ARCH_64
#else
#define BLIS_ARCH_32
#define BLIS_ARCH_32
#endif
// Determine the target operating system.
#if defined(_WIN32) || defined(__CYGWIN__)
#define BLIS_OS_WINDOWS 1
#define BLIS_OS_WINDOWS 1
#elif defined(__gnu_hurd__)
#define BLIS_OS_GNU 1
#define BLIS_OS_GNU 1
#elif defined(__APPLE__) || defined(__MACH__)
#define BLIS_OS_OSX 1
#define BLIS_OS_OSX 1
#elif defined(__ANDROID__)
#define BLIS_OS_ANDROID 1
#define BLIS_OS_ANDROID 1
#elif defined(__linux__)
#define BLIS_OS_LINUX 1
#define BLIS_OS_LINUX 1
#elif defined(__bgq__)
#define BLIS_OS_BGQ 1
#define BLIS_OS_BGQ 1
#elif defined(__bg__)
#define BLIS_OS_BGP 1
#define BLIS_OS_BGP 1
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
defined(__bsdi__) || defined(__DragonFly__)
#define BLIS_OS_BSD 1
#define BLIS_OS_BSD 1
#elif defined(EMSCRIPTEN)
#define BLIS_OS_EMSCRIPTEN
#define BLIS_OS_EMSCRIPTEN
#else
#error "Cannot determine operating system"
#error "Cannot determine operating system"
#endif
// A few changes that may be necessary in Windows environments.
@@ -86,11 +96,11 @@
#include <windows.h>
#if !defined(__clang__) && !defined(__GNUC__)
// Undefine attribute specifiers in Windows.
#define __attribute__(x)
// Undefine attribute specifiers in Windows.
#define __attribute__(x)
// Undefine restrict.
#define restrict
// Undefine restrict.
#define restrict
#endif
#endif

View File

@@ -46,27 +46,24 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
ctype* restrict psi1; \
dim_t i; \
\
if ( bli_zero_dim1( n ) ) return; \
\
chi1 = x; \
psi1 = y; \
ctype* restrict chi1 = x; \
ctype* restrict psi1 = y; \
\
if ( bli_is_conj( conjx ) ) \
{ \
if (incx == 1 && incy == 1) \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,addjs)( chi1[i], psi1[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,addjs)( *chi1, *psi1 ); \
\
@@ -77,16 +74,17 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
else \
{ \
if (incx == 1 && incy == 1) \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,adds)( chi1[i], psi1[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,adds)( *chi1, *psi1 ); \
\

View File

@@ -57,7 +57,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
ctype_r abs_chi1; \
ctype_r abs_chi1_max; \
dim_t i_max_l; \
dim_t i; \
\
/* If the vector length is zero, return early. This directly emulates
the behavior of netlib BLAS's i?amax() routines. */ \
@@ -79,7 +78,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
ctype* chi1 = x; \
\
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
/* Get the real and imaginary components of chi1. */ \
PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
@@ -109,7 +108,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
ctype* chi1 = x + (i )*incx; \
\

View File

@@ -48,17 +48,14 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
ctype* restrict psi1; \
dim_t i; \
\
if ( bli_zero_dim1( n ) ) return; \
\
if ( PASTEMAC(ch,eq0)( *alpha ) ) \
{ \
/* If alpha is zero and beta is zero, set to zero. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* If alpha is zero and beta is zero, set to zero. */ \
\
ctype* zero = PASTEMAC(ch,0); \
\
/* Query the context for the kernel function pointer. */ \
@@ -75,14 +72,15 @@ void PASTEMAC3(ch,opname,arch,suf) \
); \
return; \
} \
/* If alpha is zero and beta is one, return. */ \
else if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
/* If alpha is zero and beta is one, return. */ \
return; \
} \
/* If alpha is zero, scale by beta. */ \
else \
{ \
/* If alpha is zero, scale by beta. */ \
\
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,scalv_ker_ft) scalv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCALV_KER, cntx ); \
@@ -101,9 +99,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
else if ( PASTEMAC(ch,eq1)( *alpha ) ) \
{ \
/* If alpha is one and beta is zero, copy. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* If alpha is one and beta is zero, use copyv. */ \
\
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
@@ -118,9 +117,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
); \
return; \
} \
/* If alpha is one and beta is one, add. */ \
else if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
/* If alpha is one and beta is one, use addv. */ \
\
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \
@@ -135,9 +135,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
); \
return; \
} \
/* If alpha is one, call xpby. */ \
else \
{ \
/* If alpha is one and beta is something else, use xpbyv. */ \
\
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,xpbyv_ker_ft) xpbyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_XPBYV_KER, cntx ); \
@@ -156,9 +157,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
else \
{ \
/* If beta is zero, call scal2. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* If alpha is something else and beta is zero, use scal2v. */ \
\
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,scal2v_ker_ft) scal2v_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCAL2V_KER, cntx ); \
@@ -174,9 +176,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
); \
return; \
} \
/* If beta is one, call axpy. */ \
else if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
/* If alpha is something else and beta is one, use axpyv. */ \
\
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,axpyv_ker_ft) axpyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
@@ -192,29 +195,28 @@ void PASTEMAC3(ch,opname,arch,suf) \
); \
return; \
} \
\
} \
\
chi1 = x; \
psi1 = y; \
/* If execution reaches here, alpha and beta are both non-zero/non-unit. */ \
\
if ( bli_is_conj( conjx ) ) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpbyjs)( *alpha, chi1[i], *beta, psi1[i] ); \
PASTEMAC(ch,axpbyjs)( *alpha, x[i], *beta, y[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpbyjs)( *alpha, *chi1, *beta, *psi1 ); \
PASTEMAC(ch,axpbyjs)( *alpha, *x, *beta, *y ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \
@@ -222,19 +224,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpbys)( *alpha, chi1[i], *beta, psi1[i] ); \
PASTEMAC(ch,axpbys)( *alpha, x[i], *beta, y[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpbys)( *alpha, *chi1, *beta, *psi1 ); \
PASTEMAC(ch,axpbys)( *alpha, *x, *beta, *y ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \

View File

@@ -34,6 +34,7 @@
#include "blis.h"
#if 0
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
@@ -47,10 +48,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
ctype* restrict psi1; \
dim_t i; \
\
if ( bli_zero_dim1( n ) ) return; \
\
/* If alpha is zero, return. */ \
@@ -74,21 +71,23 @@ void PASTEMAC3(ch,opname,arch,suf) \
return; \
} \
\
chi1 = x; \
psi1 = y; \
ctype* restrict chi1 = x; \
ctype* restrict psi1 = y; \
\
if ( bli_is_conj( conjx ) ) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpyjs)( *alpha, chi1[i], psi1[i] ); \
/*PASTEMAC(ch,axpyjs)( *alpha, chi1[i], psi1[i] );*/ \
psi1[i] = fma( *alpha, chi1[i], psi1[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpyjs)( *alpha, *chi1, *psi1 ); \
\
@@ -101,14 +100,16 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpys)( *alpha, chi1[i], psi1[i] ); \
/*PASTEMAC(ch,axpys)( *alpha, chi1[i], psi1[i] );*/ \
psi1[i] = fma( *alpha, chi1[i], psi1[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpys)( *alpha, *chi1, *psi1 ); \
\
@@ -119,5 +120,90 @@ void PASTEMAC3(ch,opname,arch,suf) \
} \
}
//INSERT_GENTFUNC_BASIC2( axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
GENTFUNC( float, s, axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
GENTFUNC( double, d, axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
#endif
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
/* Reference axpyv kernel: y := y + alpha * conjx(x) over n elements. */ \
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ) \
{ \
	/* An empty vector or a zero alpha leaves y untouched. */ \
	if ( bli_zero_dim1( n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
	/* A unit alpha needs no scaling, so hand the work to the addv */ \
	/* kernel registered in the context. */ \
	if ( PASTEMAC(ch,eq1)( *alpha ) ) \
	{ \
		const num_t dt = PASTEMAC(ch,type); \
		PASTECH(ch,addv_ker_ft) addv_fp = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \
\
		addv_fp( conjx, n, x, incx, y, incy, cntx ); \
		return; \
	} \
\
	if ( incx == 1 && incy == 1 ) \
	{ \
		/* Unit strides: plain indexing, annotated for vectorization. */ \
		if ( bli_is_conj( conjx ) ) \
		{ \
			_Pragma( "omp simd" ) \
			for ( dim_t j = 0; j < n; ++j ) \
			{ \
				PASTEMAC(ch,axpyjs)( *alpha, x[j], y[j] ); \
			} \
		} \
		else \
		{ \
			_Pragma( "omp simd" ) \
			for ( dim_t j = 0; j < n; ++j ) \
			{ \
				PASTEMAC(ch,axpys)( *alpha, x[j], y[j] ); \
			} \
		} \
	} \
	else if ( bli_is_conj( conjx ) ) \
	{ \
		/* General strides: advance both pointers after every element. */ \
		for ( dim_t j = 0; j < n; ++j, x += incx, y += incy ) \
		{ \
			PASTEMAC(ch,axpyjs)( *alpha, *x, *y ); \
		} \
	} \
	else \
	{ \
		/* General strides, no conjugation of x. */ \
		for ( dim_t j = 0; j < n; ++j, x += incx, y += incy ) \
		{ \
			PASTEMAC(ch,axpys)( *alpha, *x, *y ); \
		} \
	} \
}
INSERT_GENTFUNC_BASIC2( axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )

View File

@@ -46,32 +46,26 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
ctype* restrict psi1; \
dim_t i; \
\
if ( bli_zero_dim1( n ) ) return; \
\
chi1 = x; \
psi1 = y; \
\
if ( bli_is_conj( conjx ) ) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copyjs)( chi1[i], psi1[i] ); \
PASTEMAC(ch,copyjs)( x[i], y[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copyjs)( *chi1, *psi1 ); \
PASTEMAC(ch,copyjs)( *x, *y ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \
@@ -79,19 +73,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copys)( chi1[i], psi1[i] ); \
PASTEMAC(ch,copys)( x[i], y[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copys)( *chi1, *psi1 ); \
PASTEMAC(ch,copys)( *x, *y ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \

View File

@@ -48,11 +48,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
ctype* restrict psi1; \
ctype dotxy; \
dim_t i; \
conj_t conjx_use; \
ctype dotxy; \
\
if ( bli_zero_dim1( n ) ) \
{ \
@@ -62,10 +58,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
\
PASTEMAC(ch,set0s)( dotxy ); \
\
chi1 = x; \
psi1 = y; \
\
conjx_use = conjx; \
conj_t conjx_use = conjx; \
\
/* If y must be conjugated, we do so indirectly by first toggling the
effective conjugation of x and then conjugating the resulting dot
@@ -77,19 +70,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,dotjs)( chi1[i], psi1[i], dotxy ); \
PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,dotjs)( *chi1, *psi1, dotxy ); \
PASTEMAC(ch,dotjs)( *x, *y, dotxy ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \
@@ -97,19 +91,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,dots)( chi1[i], psi1[i], dotxy ); \
PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,dots)( *chi1, *psi1, dotxy ); \
PASTEMAC(ch,dots)( *x, *y, dotxy ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \

View File

@@ -50,11 +50,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
ctype* restrict psi1; \
ctype dotxy; \
dim_t i; \
conj_t conjx_use; \
ctype dotxy; \
\
/* If beta is zero, clear rho. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
@@ -70,14 +66,11 @@ void PASTEMAC3(ch,opname,arch,suf) \
if ( bli_zero_dim1( n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
PASTEMAC(ch,set0s)( dotxy ); \
\
chi1 = x; \
psi1 = y; \
\
/* If y must be conjugated, we do so indirectly by first toggling the
effective conjugation of x and then conjugating the resulting dot
product. */ \
conjx_use = conjx; \
conj_t conjx_use = conjx; \
\
if ( bli_is_conj( conjy ) ) \
bli_toggle_conj( &conjx_use ); \
@@ -86,19 +79,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,dotjs)( chi1[i], psi1[i], dotxy ); \
PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,dotjs)( *chi1, *psi1, dotxy ); \
PASTEMAC(ch,dotjs)( *x, *y, dotxy ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \
@@ -106,19 +100,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,dots)( chi1[i], psi1[i], dotxy ); \
PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,dots)( *chi1, *psi1, dotxy ); \
PASTEMAC(ch,dots)( *x, *y, dotxy ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \

View File

@@ -44,27 +44,23 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
dim_t i; \
\
if ( bli_zero_dim1( n ) ) return; \
\
chi1 = x; \
\
if ( incx == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,inverts)( chi1[i] ); \
PASTEMAC(ch,inverts)( x[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,inverts)( *chi1 ); \
\
chi1 += incx; \
PASTEMAC(ch,inverts)( *x ); \
\
x += incx; \
} \
} \
}

View File

@@ -47,15 +47,12 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
ctype* restrict psi1; \
dim_t i; \
\
if ( bli_zero_dim1( n ) ) return; \
\
/* If alpha is zero, use setv. */ \
if ( PASTEMAC(ch,eq0)( *alpha ) ) \
{ \
/* If alpha is zero, use setv. */ \
\
ctype* zero = PASTEMAC(ch,0); \
\
/* Query the context for the kernel function pointer. */ \
@@ -72,9 +69,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
); \
return; \
} \
/* If alpha is one, use copyv. */ \
else if ( PASTEMAC(ch,eq1)( *alpha ) ) \
{ \
/* If alpha is one, use copyv. */ \
\
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
@@ -89,27 +87,25 @@ void PASTEMAC3(ch,opname,arch,suf) \
); \
return; \
} \
\
chi1 = x; \
psi1 = y; \
\
if ( bli_is_conj( conjx ) ) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,scal2js)( *alpha, chi1[i], psi1[i] ); \
PASTEMAC(ch,scal2js)( *alpha, x[i], y[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,scal2js)( *alpha, *chi1, *psi1 ); \
PASTEMAC(ch,scal2js)( *alpha, *x, *y ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \
@@ -117,19 +113,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, chi1[i], psi1[i] ); \
PASTEMAC(ch,scal2s)( *alpha, x[i], y[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, *chi1, *psi1 ); \
PASTEMAC(ch,scal2s)( *alpha, *x, *y ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \

View File

@@ -46,10 +46,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
ctype alpha_conj; \
dim_t i; \
\
if ( bli_zero_dim1( n ) ) return; \
\
/* If alpha is one, return. */ \
@@ -75,24 +71,25 @@ void PASTEMAC3(ch,opname,arch,suf) \
return; \
} \
\
PASTEMAC(ch,copycjs)( conjalpha, *alpha, alpha_conj ); \
ctype alpha_conj; \
\
chi1 = x; \
PASTEMAC(ch,copycjs)( conjalpha, *alpha, alpha_conj ); \
\
if ( incx == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,scals)( alpha_conj, chi1[i] ); \
PASTEMAC(ch,scals)( alpha_conj, x[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,scals)( alpha_conj, *chi1 ); \
PASTEMAC(ch,scals)( alpha_conj, *x ); \
\
chi1 += incx; \
x += incx; \
} \
} \
}

View File

@@ -46,51 +46,49 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
ctype alpha_conj; \
dim_t i; \
\
if ( bli_zero_dim1( n ) ) return; \
\
chi1 = x; \
\
if ( PASTEMAC(ch,eq0)( *alpha ) ) \
{ \
if ( incx == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,set0s)( chi1[i] ); \
PASTEMAC(ch,set0s)( x[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,set0s)( *chi1 ); \
PASTEMAC(ch,set0s)( *x ); \
\
chi1 += incx; \
x += incx; \
} \
} \
} \
else \
{ \
ctype alpha_conj; \
\
PASTEMAC(ch,copycjs)( conjalpha, *alpha, alpha_conj ); \
\
if ( incx == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copys)( alpha_conj, chi1[i] ); \
PASTEMAC(ch,copys)( alpha_conj, x[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copys)( alpha_conj, *chi1 ); \
PASTEMAC(ch,copys)( alpha_conj, *x ); \
\
chi1 += incx; \
x += incx; \
} \
} \
} \

View File

@@ -46,32 +46,26 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
ctype* restrict psi1; \
dim_t i; \
\
if ( bli_zero_dim1( n ) ) return; \
\
chi1 = x; \
psi1 = y; \
\
if ( bli_is_conj( conjx ) ) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,subjs)( chi1[i], psi1[i] ); \
PASTEMAC(ch,subjs)( x[i], y[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,subjs)( *chi1, *psi1 ); \
PASTEMAC(ch,subjs)( *x, *y ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \
@@ -79,19 +73,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,subs)( chi1[i], psi1[i] ); \
PASTEMAC(ch,subs)( x[i], y[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,subs)( *chi1, *psi1 ); \
PASTEMAC(ch,subs)( *x, *y ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \

View File

@@ -45,30 +45,24 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
ctype* restrict psi1; \
dim_t i; \
\
if ( bli_zero_dim1( n ) ) return; \
\
chi1 = x; \
psi1 = y; \
\
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,swaps)( chi1[i], psi1[i] ); \
PASTEMAC(ch,swaps)( x[i], y[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,swaps)( *chi1, *psi1 ); \
PASTEMAC(ch,swaps)( *x, *y ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
}

View File

@@ -47,10 +47,6 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* restrict chi1; \
ctype* restrict psi1; \
dim_t i; \
\
if ( bli_zero_dim1( n ) ) return; \
\
/* If beta is zero, use copyv. */ \
@@ -87,27 +83,25 @@ void PASTEMAC3(ch,opname,arch,suf) \
); \
return; \
} \
\
chi1 = x; \
psi1 = y; \
\
if ( bli_is_conj( conjx ) ) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,xpbyjs)( chi1[i], *beta, psi1[i] ); \
PASTEMAC(ch,xpbyjs)( x[i], *beta, y[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,xpbyjs)( *chi1, *beta, *psi1 ); \
PASTEMAC(ch,xpbyjs)( *x, *beta, *y ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \
@@ -115,19 +109,20 @@ void PASTEMAC3(ch,opname,arch,suf) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,xpbys)( chi1[i], *beta, psi1[i] ); \
PASTEMAC(ch,xpbys)( x[i], *beta, y[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,xpbys)( *chi1, *beta, *psi1 ); \
PASTEMAC(ch,xpbys)( *x, *beta, *y ); \
\
chi1 += incx; \
psi1 += incy; \
x += incx; \
y += incy; \
} \
} \
} \

View File

@@ -51,29 +51,87 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,axpyv_ker_ft) kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
if ( bli_zero_dim1( n ) ) return; \
\
kfp_av \
( \
conjx, \
n, \
alphax, \
x, incx, \
z, incz, \
cntx \
); \
if ( incz == 1 && incx == 1 && incy == 1 ) \
{ \
ctype chic, psic; \
\
kfp_av \
( \
conjy, \
n, \
alphay, \
y, incy, \
z, incz, \
cntx \
); \
if ( bli_is_noconj( conjx ) ) \
{ \
if ( bli_is_noconj( conjy ) ) \
{ \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpys)( *alphax, x[i], z[i] ); \
PASTEMAC(ch,axpys)( *alphay, y[i], z[i] ); \
} \
} \
else /* if ( bli_is_conj( conjy ) ) */ \
{ \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,axpys)( *alphax, x[i], z[i] ); \
PASTEMAC(ch,copyjs)( y[i], psic ); \
PASTEMAC(ch,axpys)( *alphay, psic, z[i] ); \
} \
} \
} \
else /* if ( bli_is_conj( conjx ) ) */ \
{ \
if ( bli_is_noconj( conjy ) ) \
{ \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copyjs)( x[i], chic ); \
PASTEMAC(ch,axpys)( *alphax, chic, z[i] ); \
PASTEMAC(ch,axpys)( *alphay, y[i], z[i] ); \
} \
} \
else /* if ( bli_is_conj( conjy ) ) */ \
{ \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < n; ++i ) \
{ \
PASTEMAC(ch,copyjs)( x[i], chic ); \
PASTEMAC(ch,axpys)( *alphax, chic, z[i] ); \
PASTEMAC(ch,copyjs)( y[i], psic ); \
PASTEMAC(ch,axpys)( *alphay, psic, z[i] ); \
} \
} \
} \
} \
else \
{ \
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,axpyv_ker_ft) kfp_av \
= \
bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
kfp_av \
( \
conjx, \
n, \
alphax, \
x, incx, \
z, incz, \
cntx \
); \
\
kfp_av \
( \
conjy, \
n, \
alphay, \
y, incy, \
z, incz, \
cntx \
); \
} \
}
INSERT_GENTFUNC_BASIC2( axpy2v, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )

View File

@@ -36,7 +36,7 @@
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
#define GENTFUNC( ctype, ch, opname, arch, suf, ff ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
( \
@@ -51,36 +51,81 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* a1; \
ctype* chi1; \
ctype* y1; \
ctype alpha_chi1; \
dim_t i; \
if ( bli_zero_dim1( m ) ) return; \
\
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,axpyv_ker_ft) kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
for ( i = 0; i < b_n; ++i ) \
if ( inca == 1 && incx == 1 && incy == 1 && b_n == ff ) \
{ \
a1 = a + (0 )*inca + (i )*lda; \
chi1 = x + (i )*incx; \
y1 = y + (0 )*incy; \
ctype ax[ ff ]; \
\
PASTEMAC(ch,copycjs)( conjx, *chi1, alpha_chi1 ); \
PASTEMAC(ch,scals)( *alpha, alpha_chi1 ); \
/* Scale x by alpha, storing to a temporary array ax. */ \
if ( bli_is_conj( conjx ) ) \
{ \
_Pragma( "omp simd" ) \
for ( dim_t j = 0; j < ff; ++j ) \
PASTEMAC(ch,scal2js)( *alpha, x[j], ax[j] ); \
} \
else \
{ \
_Pragma( "omp simd" ) \
for ( dim_t j = 0; j < ff; ++j ) \
PASTEMAC(ch,scal2s)( *alpha, x[j], ax[j] ); \
} \
\
kfp_av \
( \
conja, \
m, \
&alpha_chi1, \
a1, inca, \
y1, incy, \
cntx \
); \
/* Accumulate ff separate axpyv's into y. */ \
if ( bli_is_noconj( conja ) ) \
{ \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < m; ++i ) \
for ( dim_t j = 0; j < ff; ++j ) \
{ \
PASTEMAC(ch,axpys)( ax[j], a[i + j*lda], y[i] ); \
} \
} \
else \
{ \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < m; ++i ) \
for ( dim_t j = 0; j < ff; ++j ) \
{ \
PASTEMAC(ch,axpyjs)( ax[j], a[i + j*lda], y[i] ); \
} \
} \
} \
else \
{ \
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,axpyv_ker_ft) kfp_av \
= \
bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
for ( dim_t i = 0; i < b_n; ++i ) \
{ \
ctype* restrict a1 = a + (0 )*inca + (i )*lda; \
ctype* restrict chi1 = x + (i )*incx; \
ctype* restrict y1 = y + (0 )*incy; \
\
ctype alpha_chi1; \
\
PASTEMAC(ch,copycjs)( conjx, *chi1, alpha_chi1 ); \
PASTEMAC(ch,scals)( *alpha, alpha_chi1 ); \
\
kfp_av \
( \
conja, \
m, \
&alpha_chi1, \
a1, inca, \
y1, incy, \
cntx \
); \
} \
} \
}
INSERT_GENTFUNC_BASIC2( axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
//INSERT_GENTFUNC_BASIC2( axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
GENTFUNC( float, s, axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 8 )
GENTFUNC( double, d, axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 8 )
GENTFUNC( scomplex, c, axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 8 )
GENTFUNC( dcomplex, z, axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 8 )

View File

@@ -52,36 +52,112 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* one = PASTEMAC(ch,1); \
ctype* zero = PASTEMAC(ch,0); \
if ( bli_zero_dim1( m ) ) return; \
\
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,dotxv_ker_ft) kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
PASTECH(ch,axpyv_ker_ft) kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
if ( incz == 1 && incx == 1 && incy == 1 ) \
{ \
if ( bli_is_noconj( conjx ) ) \
{ \
conj_t conjxt_use = conjxt; \
ctype dotxy; \
\
kfp_dv \
( \
conjxt, \
conjy, \
m, \
one, \
x, incx, \
y, incy, \
zero, \
rho, \
cntx \
); \
PASTEMAC(ch,set0s)( dotxy ); \
\
kfp_av \
( \
conjx, \
m, \
alpha, \
x, incx, \
z, incz, \
cntx \
); \
if ( bli_is_conj( conjy ) ) \
bli_toggle_conj( &conjxt_use ); \
\
if ( bli_is_noconj( conjxt_use ) ) \
{ \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \
PASTEMAC(ch,axpys)( *alpha, x[i], z[i] ); \
} \
} \
else /* bli_is_conj( conjxt_use ) ) */ \
{ \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \
PASTEMAC(ch,axpys)( *alpha, x[i], z[i] ); \
} \
} \
\
if ( bli_is_conj( conjy ) ) \
PASTEMAC(ch,conjs)( dotxy ); \
\
PASTEMAC(ch,copys)( dotxy, *rho ); \
} \
else /* bli_is_conj( conjx ) ) */ \
{ \
conj_t conjxt_use = conjxt; \
ctype dotxy; \
\
PASTEMAC(ch,set0s)( dotxy ); \
\
if ( bli_is_conj( conjy ) ) \
bli_toggle_conj( &conjxt_use ); \
\
if ( bli_is_noconj( conjxt_use ) ) \
{ \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \
PASTEMAC(ch,axpyjs)( *alpha, x[i], z[i] ); \
} \
} \
else /* bli_is_conj( conjxt_use ) ) */ \
{ \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \
PASTEMAC(ch,axpyjs)( *alpha, x[i], z[i] ); \
} \
} \
\
if ( bli_is_conj( conjy ) ) \
PASTEMAC(ch,conjs)( dotxy ); \
\
PASTEMAC(ch,copys)( dotxy, *rho ); \
} \
} \
else \
{ \
\
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,dotv_ker_ft) kfp_dv \
= \
bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \
PASTECH(ch,axpyv_ker_ft) kfp_av \
= \
bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
kfp_dv \
( \
conjxt, \
conjy, \
m, \
x, incx, \
y, incy, \
rho, \
cntx \
); \
\
kfp_av \
( \
conjx, \
m, \
alpha, \
x, incx, \
z, incz, \
cntx \
); \
} \
}
INSERT_GENTFUNC_BASIC2( dotaxpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )

View File

@@ -36,7 +36,7 @@
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
#define GENTFUNC( ctype, ch, opname, arch, suf, ff ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
( \
@@ -60,38 +60,147 @@ void PASTEMAC3(ch,opname,arch,suf) \
/* y = beta * y + alpha * A^T w; */ \
/* z = z + alpha * A x; */ \
\
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,dotxf_ker_ft) kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
PASTECH(ch,axpyf_ker_ft) kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
if ( 1 && inca == 1 && incw == 1 && incx == 1 && \
incy == 1 && incz == 1 && b_n == ff ) \
{ \
ctype r[ ff ]; \
ctype ax[ ff ]; \
\
kfp_df \
( \
conjat, \
conjw, \
m, \
b_n, \
alpha, \
a, inca, lda, \
w, incw, \
beta, \
y, incy, \
cntx \
); \
/* If beta is zero, clear y. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( y[i] ); \
} \
else \
{ \
for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,scals)( *beta, y[i] ); \
} \
\
kfp_af \
( \
conja, \
conjx, \
m, \
b_n, \
alpha, \
a, inca, lda, \
x, incx, \
z, incz, \
cntx \
); \
/* If the vectors are empty or if alpha is zero, return early. */ \
if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
/* Initialize r vector to 0. */ \
for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( r[i] ); \
\
/* Scale x by alpha, storing to a temporary array ax. */ \
if ( bli_is_conj( conjx ) ) \
{ \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < ff; ++i ) \
PASTEMAC(ch,scal2js)( *alpha, x[i], ax[i] ); \
} \
else \
{ \
_Pragma( "omp simd" ) \
for ( dim_t i = 0; i < ff; ++i ) \
PASTEMAC(ch,scal2s)( *alpha, x[i], ax[i] ); \
} \
\
/* If a must be conjugated, we do so indirectly by first toggling the
effective conjugation of w and then conjugating the resulting dot
products. */ \
conj_t conjw_use = conjw; \
\
if ( bli_is_conj( conjat ) ) \
bli_toggle_conj( &conjw_use ); \
\
if ( bli_is_noconj( conjw_use ) ) \
{ \
if ( bli_is_noconj( conja ) ) \
{ \
_Pragma( "omp simd" ) \
for ( dim_t p = 0; p < m; ++p ) \
for ( dim_t i = 0; i < ff; ++i ) \
{ \
PASTEMAC(ch,axpys)( a[p + i*lda], w[p], r[i] ); \
PASTEMAC(ch,axpys)( ax[i], a[p + i*lda], z[p] ); \
} \
} \
else \
{ \
_Pragma( "omp simd" ) \
for ( dim_t p = 0; p < m; ++p ) \
for ( dim_t i = 0; i < ff; ++i ) \
{ \
PASTEMAC(ch,axpys)( a[p + i*lda], w[p], r[i] ); \
PASTEMAC(ch,axpyjs)( ax[i], a[p + i*lda], z[p] ); \
} \
} \
} \
else \
{ \
if ( bli_is_noconj( conja ) ) \
{ \
_Pragma( "omp simd" ) \
for ( dim_t p = 0; p < m; ++p ) \
for ( dim_t i = 0; i < ff; ++i ) \
{ \
PASTEMAC(ch,axpyjs)( a[p + i*lda], w[p], r[i] ); \
PASTEMAC(ch,axpys)( ax[i], a[p + i*lda], z[p] ); \
} \
} \
else \
{ \
_Pragma( "omp simd" ) \
for ( dim_t p = 0; p < m; ++p ) \
for ( dim_t i = 0; i < ff; ++i ) \
{ \
PASTEMAC(ch,axpyjs)( a[p + i*lda], w[p], r[i] ); \
PASTEMAC(ch,axpyjs)( ax[i], a[p + i*lda], z[p] ); \
} \
} \
} \
\
if ( bli_is_conj( conjat ) ) \
for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,conjs)( r[i] ); \
\
for ( dim_t i = 0; i < ff; ++i ) \
{ \
PASTEMAC(ch,axpys)( *alpha, r[i], y[i] ); \
} \
} \
else \
{ \
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,dotxf_ker_ft) kfp_df \
= \
bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
PASTECH(ch,axpyf_ker_ft) kfp_af \
= \
bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
\
kfp_df \
( \
conjat, \
conjw, \
m, \
b_n, \
alpha, \
a, inca, lda, \
w, incw, \
beta, \
y, incy, \
cntx \
); \
\
kfp_af \
( \
conja, \
conjx, \
m, \
b_n, \
alpha, \
a, inca, lda, \
x, incx, \
z, incz, \
cntx \
); \
} \
}
INSERT_GENTFUNC_BASIC2( dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
//INSERT_GENTFUNC_BASIC2( dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
GENTFUNC( float, s, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 )
GENTFUNC( double, d, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 )
GENTFUNC( scomplex, c, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 )
GENTFUNC( dcomplex, z, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 )

View File

@@ -36,7 +36,7 @@
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
#define GENTFUNC( ctype, ch, opname, arch, suf, ff ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
( \
@@ -52,35 +52,94 @@ void PASTEMAC3(ch,opname,arch,suf) \
cntx_t* restrict cntx \
) \
{ \
ctype* a1; \
ctype* x1; \
ctype* psi1; \
dim_t i; \
\
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,dotxv_ker_ft) kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
\
for ( i = 0; i < b_n; ++i ) \
if ( inca == 1 && incx == 1 && incy == 1 && b_n == ff ) \
{ \
a1 = a + (0 )*inca + (i )*lda; \
x1 = x + (0 )*incx; \
psi1 = y + (i )*incy; \
ctype r[ ff ]; \
\
kfp_dv \
( \
conjat, \
conjx, \
m, \
alpha, \
a1, inca, \
x1, incx, \
beta, \
psi1, \
cntx \
); \
/* If beta is zero, clear y. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( y[i] ); \
} \
else \
{ \
for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,scals)( *beta, y[i] ); \
} \
\
/* If the vectors are empty or if alpha is zero, return early. */ \
if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
/* Initialize r vector to 0. */ \
for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( r[i] ); \
\
/* If a must be conjugated, we do so indirectly by first toggling the
effective conjugation of x and then conjugating the resulting dot
products. */ \
conj_t conjx_use = conjx; \
\
if ( bli_is_conj( conjat ) ) \
bli_toggle_conj( &conjx_use ); \
\
if ( bli_is_noconj( conjx_use ) ) \
{ \
_Pragma( "omp simd" ) \
for ( dim_t p = 0; p < m; ++p ) \
for ( dim_t i = 0; i < ff; ++i ) \
{ \
PASTEMAC(ch,axpys)( a[p + i*lda], x[p], r[i] ); \
} \
} \
else \
{ \
_Pragma( "omp simd" ) \
for ( dim_t p = 0; p < m; ++p ) \
for ( dim_t i = 0; i < ff; ++i ) \
{ \
PASTEMAC(ch,axpyjs)( a[p + i*lda], x[p], r[i] ); \
} \
} \
\
if ( bli_is_conj( conjat ) ) \
for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,conjs)( r[i] ); \
\
for ( dim_t i = 0; i < ff; ++i ) \
{ \
PASTEMAC(ch,axpys)( *alpha, r[i], y[i] ); \
} \
} \
else \
{ \
/* Query the context for the kernel function pointer. */ \
const num_t dt = PASTEMAC(ch,type); \
PASTECH(ch,dotxv_ker_ft) kfp_dv \
= \
bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
\
for ( dim_t i = 0; i < b_n; ++i ) \
{ \
ctype* restrict a1 = a + (0 )*inca + (i )*lda; \
ctype* restrict x1 = x + (0 )*incx; \
ctype* restrict psi1 = y + (i )*incy; \
\
kfp_dv \
( \
conjat, \
conjx, \
m, \
alpha, \
a1, inca, \
x1, incx, \
beta, \
psi1, \
cntx \
); \
} \
} \
}
INSERT_GENTFUNC_BASIC2( dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
//INSERT_GENTFUNC_BASIC2( dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
GENTFUNC( float, s, dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 6 )
GENTFUNC( double, d, dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 6 )
GENTFUNC( scomplex, c, dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 6 )
GENTFUNC( dcomplex, z, dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 6 )

View File

@@ -34,6 +34,138 @@
#include "blis.h"
#if 1
// An implementation that attempts to facilitate emission of vectorized
// instructions via constant loop bounds + #pragma omp simd directives.
/* Template for the reference gemm micro-kernel with compile-time blocksizes.
   mr and nr are macro parameters, so every loop over the micro-tile has a
   constant trip count that the compiler may vectorize via "omp simd".
   The kernel accumulates k rank-1 updates of a and b into a local mr x nr
   buffer, scales that buffer by alpha, and then either copies it into c
   (when beta equals zero) or combines it with c through xpbys. */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf, mr, nr ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     ) \
{ \
	/* Each step along k advances a by mr elements and b by nr elements. */ \
	const inc_t a_step = mr; \
	const inc_t b_step = nr; \
\
	/* The accumulator tile is kept row-major (row stride nr, unit column
	   stride) in an aligned buffer on the stack. */ \
	const inc_t acc_rs = nr; \
	const inc_t acc_cs = 1; \
	ctype       acc[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype ) ] \
	            __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
\
	/* Start from an all-zero accumulator. */ \
	_Pragma( "omp simd" ) \
	for ( dim_t e = 0; e < mr * nr; ++e ) \
	{ \
		PASTEMAC(ch,set0s)( acc[ e ] ); \
	} \
\
	/* Accumulate k rank-1 updates, one per step through a and b. */ \
	for ( dim_t p = 0; p < k; ++p, a += a_step, b += b_step ) \
	{ \
		for ( dim_t i = 0; i < mr; ++i ) \
		{ \
			_Pragma( "omp simd" ) \
			for ( dim_t j = 0; j < nr; ++j ) \
			{ \
				PASTEMAC(ch,dots)( a[ i ], b[ j ], acc[ i*acc_rs + j*acc_cs ] ); \
			} \
		} \
	} \
\
	/* Apply alpha to every accumulated element. */ \
	_Pragma( "omp simd" ) \
	for ( dim_t e = 0; e < mr * nr; ++e ) \
	{ \
		PASTEMAC(ch,scals)( *alpha, acc[ e ] ); \
	} \
\
	/* Write the tile out. The traversal order follows the storage of c
	   (rows outermost when cs_c is 1, columns outermost otherwise), and
	   c is overwritten rather than read when beta is zero. */ \
	const int beta_is_zero = PASTEMAC(ch,eq0)( *beta ); \
	const int c_has_unit_cs = ( cs_c == 1 ); \
\
	if ( beta_is_zero ) \
	{ \
		if ( c_has_unit_cs ) \
		{ \
			for ( dim_t i = 0; i < mr; ++i ) \
			{ \
				for ( dim_t j = 0; j < nr; ++j ) \
				{ \
					PASTEMAC(ch,copys)( acc[ i*acc_rs + j*acc_cs ], \
					                    c[ i*rs_c + j*1 ] ); \
				} \
			} \
		} \
		else \
		{ \
			for ( dim_t j = 0; j < nr; ++j ) \
			{ \
				for ( dim_t i = 0; i < mr; ++i ) \
				{ \
					PASTEMAC(ch,copys)( acc[ i*acc_rs + j*acc_cs ], \
					                    c[ i*rs_c + j*cs_c ] ); \
				} \
			} \
		} \
	} \
	else \
	{ \
		if ( c_has_unit_cs ) \
		{ \
			for ( dim_t i = 0; i < mr; ++i ) \
			{ \
				for ( dim_t j = 0; j < nr; ++j ) \
				{ \
					PASTEMAC(ch,xpbys)( acc[ i*acc_rs + j*acc_cs ], \
					                    *beta, \
					                    c[ i*rs_c + j*1 ] ); \
				} \
			} \
		} \
		else \
		{ \
			for ( dim_t j = 0; j < nr; ++j ) \
			{ \
				for ( dim_t i = 0; i < mr; ++i ) \
				{ \
					PASTEMAC(ch,xpbys)( acc[ i*acc_rs + j*acc_cs ], \
					                    *beta, \
					                    c[ i*rs_c + j*cs_c ] ); \
				} \
			} \
		} \
	} \
}
//INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
GENTFUNC( float, s, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 16 )
GENTFUNC( double, d, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 )
GENTFUNC( scomplex, c, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 )
GENTFUNC( dcomplex, z, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 )
#else
// An implementation that uses variable loop bounds (queried from the context)
// and makes no use of #pragma omp simd.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
@@ -134,3 +266,5 @@ void PASTEMAC3(ch,opname,arch,suf) \
INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
#endif

View File

@@ -34,6 +34,137 @@
#include "blis.h"
#if 1
// An implementation that attempts to facilitate emission of vectorized
// instructions via constant loop bounds + #pragma omp simd directives.
/* Template for the reference lower-triangular trsm micro-kernel with
   compile-time blocksizes mr x nr.
   a is read with unit row stride and column stride mr; its diagonal entries
   hold the INVERSE of the true diagonal, so the "divide" is a multiply.
   b is read and overwritten with row stride nr and unit column stride.
   Each solved element is also stored to c using (rs_c, cs_c).
   data and cntx are not referenced by this implementation. */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf, mr, nr ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     ) \
{ \
	const inc_t rs_a = 1; \
	const inc_t cs_a = mr; \
\
	const inc_t rs_b = nr; \
	const inc_t cs_b = 1; \
\
	/* Row i of b depends on rows 0..i-1 of b, which earlier iterations of
	   this loop have just overwritten. The row loop therefore carries a
	   dependence and must run in order; it must NOT be marked omp simd. */ \
	for ( dim_t i = 0; i < mr; ++i ) \
	{ \
		/* b1 = b1 - a10t * B0; */ \
		/* b1 = b1 / alpha11; */ \
		/* Columns of b are independent of one another, so the simd
		   directive is applied to the column loop instead. */ \
		_Pragma( "omp simd" ) \
		for ( dim_t j = 0; j < nr; ++j ) \
		{ \
			ctype beta11c = b[i*rs_b + j*cs_b]; \
			ctype rho11; \
\
			/* beta11 = beta11 - a10t * b01; */ \
			PASTEMAC(ch,set0s)( rho11 ); \
			for ( dim_t l = 0; l < i; ++l ) \
			{ \
				PASTEMAC(ch,axpys)( a[i*rs_a + l*cs_a], \
				                    b[l*rs_b + j*cs_b], rho11 ); \
			} \
			PASTEMAC(ch,subs)( rho11, beta11c ); \
\
			/* beta11 = beta11 / alpha11; */ \
			/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
			   of alpha11, so we can multiply rather than divide. We store
			   the inverse of alpha11 intentionally to avoid expensive
			   division instructions within the micro-kernel. */ \
			PASTEMAC(ch,scals)( a[i*rs_a + i*cs_a], beta11c ); \
\
			/* Output final result to matrix c. */ \
			PASTEMAC(ch,copys)( beta11c, c[i*rs_c + j*cs_c] ); \
\
			/* Store the local value back to b11. */ \
			PASTEMAC(ch,copys)( beta11c, b[i*rs_b + j*cs_b] ); \
		} \
	} \
}
//INSERT_GENTFUNC_BASIC2( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
GENTFUNC( float, s, trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 16 )
GENTFUNC( double, d, trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 )
GENTFUNC( scomplex, c, trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 )
GENTFUNC( dcomplex, z, trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 )
/* Template for the reference upper-triangular trsm micro-kernel with
   compile-time blocksizes mr x nr.
   Rows are solved from the bottom (i = mr-1) up to the top (i = 0).
   a is read with unit row stride and column stride mr; its diagonal entries
   hold the INVERSE of the true diagonal, so the "divide" is a multiply.
   b is read and overwritten with row stride nr and unit column stride.
   Each solved element is also stored to c using (rs_c, cs_c).
   data and cntx are not referenced by this implementation. */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf, mr, nr ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     ) \
{ \
	const inc_t rs_a = 1; \
	const inc_t cs_a = mr; \
\
	const inc_t rs_b = nr; \
	const inc_t cs_b = 1; \
\
	/* Row i of b depends on rows i+1..mr-1 of b, which earlier iterations
	   of this loop have just overwritten. The row loop therefore carries a
	   dependence and must run in order; it must NOT be marked omp simd. */ \
	for ( dim_t iter = 0; iter < mr; ++iter ) \
	{ \
		dim_t i = mr - iter - 1; \
\
		/* b1 = b1 - a12t * B2; */ \
		/* b1 = b1 / alpha11; */ \
		/* Columns of b are independent of one another, so the simd
		   directive is applied to the column loop instead. */ \
		_Pragma( "omp simd" ) \
		for ( dim_t j = 0; j < nr; ++j ) \
		{ \
			ctype beta11c = b[i*rs_b + j*cs_b]; \
			ctype rho11; \
\
			/* beta11 = beta11 - a12t * b21; */ \
			PASTEMAC(ch,set0s)( rho11 ); \
			for ( dim_t l = 0; l < iter; ++l ) \
			{ \
				PASTEMAC(ch,axpys)( a[i*rs_a + (i+1+l)*cs_a], \
				                    b[(i+1+l)*rs_b + j*cs_b], rho11 ); \
			} \
			PASTEMAC(ch,subs)( rho11, beta11c ); \
\
			/* beta11 = beta11 / alpha11; */ \
			/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
			   of alpha11, so we can multiply rather than divide. We store
			   the inverse of alpha11 intentionally to avoid expensive
			   division instructions within the micro-kernel. */ \
			PASTEMAC(ch,scals)( a[i*rs_a + i*cs_a], beta11c ); \
\
			/* Output final result to matrix c. */ \
			PASTEMAC(ch,copys)( beta11c, c[i*rs_c + j*cs_c] ); \
\
			/* Store the local value back to b11. */ \
			PASTEMAC(ch,copys)( beta11c, b[i*rs_b + j*cs_b] ); \
		} \
	} \
}
//INSERT_GENTFUNC_BASIC2( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
GENTFUNC( float, s, trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 16 )
GENTFUNC( double, d, trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 )
GENTFUNC( scomplex, c, trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 )
GENTFUNC( dcomplex, z, trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 )
#else
// An implementation that uses variable loop bounds (queried from the context)
// and makes no use of #pragma omp simd.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
@@ -99,7 +230,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
\
/* beta11 = beta11 / alpha11; */ \
/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
of alpha11, so we can multiply rather than divide. We store
of alpha11, so we can multiply rather than divide. We store
the inverse of alpha11 intentionally to avoid expensive
division instructions within the micro-kernel. */ \
PASTEMAC(ch,scals)( *alpha11, beta11c ); \
@@ -181,7 +312,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
\
/* beta11 = beta11 / alpha11; */ \
/* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
of alpha11, so we can multiply rather than divide. We store
of alpha11, so we can multiply rather than divide. We store
the inverse of alpha11 intentionally to avoid expensive
division instructions within the micro-kernel. */ \
PASTEMAC(ch,scals)( *alpha11, beta11c ); \
@@ -197,3 +328,4 @@ void PASTEMAC3(ch,opname,arch,suf) \
INSERT_GENTFUNC_BASIC2( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
#endif

View File

@@ -0,0 +1,190 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Template for the "_simd" reference gemm micro-kernel with compile-time
   blocksizes mr x nr.
   The kernel accumulates k rank-1 updates of a and b into a local mr x nr
   buffer (row stride nr, unit column stride), scales that buffer by alpha,
   and then either copies it into c (when beta equals zero) or combines it
   with c through xpbys. a advances by mr elements and b by nr elements per
   step along k. data and cntx are not referenced by this implementation. */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf, mr, nr ) \
\
void PASTEMAC4(ch,opname,arch,_simd,suf) \
     ( \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     ) \
{ \
	ctype       ab[ BLIS_STACK_BUF_MAX_SIZE \
	                / sizeof( ctype ) ] \
	                __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const inc_t rs_ab = nr; \
	const inc_t cs_ab = 1; \
\
	const inc_t cs_a  = mr; \
	const inc_t rs_b  = nr; \
\
	/* Initialize the accumulator elements in ab to zero. */ \
	_Pragma( "omp simd" ) \
	for ( dim_t i = 0; i < mr * nr; ++i ) \
	{ \
		PASTEMAC(ch,set0s)( ab[ i ] ); \
	} \
\
	/* Perform a series of k rank-1 updates into ab. */ \
	for ( dim_t l = 0; l < k; ++l ) \
	{ \
		for ( dim_t i = 0; i < mr; ++i ) \
		{ \
			_Pragma( "omp simd" ) \
			for ( dim_t j = 0; j < nr; ++j ) \
			{ \
				PASTEMAC(ch,dots) \
				( \
				  a[ i ], \
				  b[ j ], \
				  ab[ i*rs_ab + j*cs_ab ] \
				); \
			} \
		} \
\
		a += cs_a; \
		b += rs_b; \
	} \
\
	/* Scale the result in ab by alpha. */ \
	_Pragma( "omp simd" ) \
	for ( dim_t i = 0; i < mr * nr; ++i ) \
	{ \
		PASTEMAC(ch,scals)( *alpha, ab[ i ] ); \
	} \
\
	/* Output/accumulate intermediate result ab based on the storage
	   of c and the value of beta. */ \
	if ( cs_c == 1 ) \
	{ \
		/* C is row-stored. */ \
\
		if ( PASTEMAC(ch,eq0)( *beta ) ) \
		{ \
			for ( dim_t i = 0; i < mr; ++i ) \
			for ( dim_t j = 0; j < nr; ++j ) \
			PASTEMAC(ch,copys) \
			( \
			  ab[ i*rs_ab + j*cs_ab ], \
			  c [ i*rs_c  + j*1     ]  \
			); \
		} \
		else \
		{ \
			for ( dim_t i = 0; i < mr; ++i ) \
			for ( dim_t j = 0; j < nr; ++j ) \
			PASTEMAC(ch,xpbys) \
			( \
			  ab[ i*rs_ab + j*cs_ab ], \
			  *beta, \
			  c [ i*rs_c  + j*1     ]  \
			); \
		} \
	} \
	else \
	{ \
		/* C is column-stored or general-stored. The column stride is not
		   1 on this path, so c must be indexed with cs_c; a unit column
		   offset here would address the wrong elements of c. */ \
\
		if ( PASTEMAC(ch,eq0)( *beta ) ) \
		{ \
			for ( dim_t j = 0; j < nr; ++j ) \
			for ( dim_t i = 0; i < mr; ++i ) \
			PASTEMAC(ch,copys) \
			( \
			  ab[ i*rs_ab + j*cs_ab ], \
			  c [ i*rs_c  + j*cs_c  ]  \
			); \
		} \
		else \
		{ \
			for ( dim_t j = 0; j < nr; ++j ) \
			for ( dim_t i = 0; i < mr; ++i ) \
			PASTEMAC(ch,xpbys) \
			( \
			  ab[ i*rs_ab + j*cs_ab ], \
			  *beta, \
			  c [ i*rs_c  + j*cs_c  ]  \
			); \
		} \
	} \
}
//INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
GENTFUNC( float, s, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 16 )
GENTFUNC( double, d, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 )
GENTFUNC( scomplex, c, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 )
GENTFUNC( dcomplex, z, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 )

View File

@@ -0,0 +1,348 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// -- dgemm --------------------------------------------------------------------
//
#undef CH
#define CH d
#undef CTYPE
#define CTYPE double
#undef ZERO
#define ZERO 0.0
#undef MR
#define MR 4
#undef NR
#define NR 8
//
// Reference gemm micro-kernel for a fixed MR x NR (4 x 8) micro-tile.
//
// Accumulates k rank-1 updates of a and b into a local MR x NR tile ab,
// scales ab by alpha, and then stores it to c:
//
//   beta == 0:  c := alpha * a * b
//   otherwise:  c := beta * c + alpha * a * b
//
// Parameters:
//   k      number of rank-1 updates to perform. The loop counts k down to
//          zero, so k is assumed to be non-negative.
//   alpha  pointer to the scalar applied to the product a * b. The scaling
//          pass is skipped entirely when *alpha equals one.
//   a      micro-panel of A; each iteration reads MR contiguous elements
//          and then advances by MR.
//   b      micro-panel of B; each iteration reads NR contiguous elements
//          and then advances by NR.
//   beta   pointer to the scalar applied to c.
//   c      MR x NR output tile; element (i,j) lives at c[ i*rs_c + j*cs_c ].
//   rs_c   row stride of c.
//   cs_c   column stride of c. cs_c == 1 selects the row-stored code path,
//          which indexes columns with unit stride.
//   data   auxiliary kernel information (not used by this kernel).
//   cntx   runtime context (not used by this kernel).
//
void PASTEMAC6(CH,gemm,BLIS_CNAME_REF_SUFFIX,_,MR,x,NR)
     (
       dim_t               k,
       CTYPE*     restrict alpha,
       CTYPE*     restrict a,
       CTYPE*     restrict b,
       CTYPE*     restrict beta,
       CTYPE*     restrict c, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	// The micro-panels are traversed with fixed strides: one column of a
	// (MR elements) and one row of b (NR elements) per iteration.
	const dim_t cs_a = MR;
	const dim_t rs_b = NR;

	// Accumulators for the MR x NR tile; abIJ holds element (I,J).
	CTYPE ab00 = ZERO, ab01 = ZERO, ab02 = ZERO, ab03 = ZERO;
	CTYPE ab10 = ZERO, ab11 = ZERO, ab12 = ZERO, ab13 = ZERO;
	CTYPE ab20 = ZERO, ab21 = ZERO, ab22 = ZERO, ab23 = ZERO;
	CTYPE ab30 = ZERO, ab31 = ZERO, ab32 = ZERO, ab33 = ZERO;

	CTYPE ab04 = ZERO, ab05 = ZERO, ab06 = ZERO, ab07 = ZERO;
	CTYPE ab14 = ZERO, ab15 = ZERO, ab16 = ZERO, ab17 = ZERO;
	CTYPE ab24 = ZERO, ab25 = ZERO, ab26 = ZERO, ab27 = ZERO;
	CTYPE ab34 = ZERO, ab35 = ZERO, ab36 = ZERO, ab37 = ZERO;

	// Perform a series of k rank-1 updates into ab.
	for ( ; k != 0; --k )
	{
		const CTYPE a0 = a[0];

		ab00 += a0*b[0]; ab01 += a0*b[1]; ab02 += a0*b[2]; ab03 += a0*b[3];
		ab04 += a0*b[4]; ab05 += a0*b[5]; ab06 += a0*b[6]; ab07 += a0*b[7];

		const CTYPE a1 = a[1];

		ab10 += a1*b[0]; ab11 += a1*b[1]; ab12 += a1*b[2]; ab13 += a1*b[3];
		ab14 += a1*b[4]; ab15 += a1*b[5]; ab16 += a1*b[6]; ab17 += a1*b[7];

		const CTYPE a2 = a[2];

		ab20 += a2*b[0]; ab21 += a2*b[1]; ab22 += a2*b[2]; ab23 += a2*b[3];
		ab24 += a2*b[4]; ab25 += a2*b[5]; ab26 += a2*b[6]; ab27 += a2*b[7];

		const CTYPE a3 = a[3];

		ab30 += a3*b[0]; ab31 += a3*b[1]; ab32 += a3*b[2]; ab33 += a3*b[3];
		ab34 += a3*b[4]; ab35 += a3*b[5]; ab36 += a3*b[6]; ab37 += a3*b[7];

		a += cs_a;
		b += rs_b;
	}

	// Scale each element of ab by alpha. Every one of the MR*NR accumulators
	// must be scaled exactly once (columns 0 through 7 of each row).
	if ( !PASTEMAC(CH,eq1)( *alpha ) )
	{
		const CTYPE alpha0 = *alpha;

		PASTEMAC(CH,scals)( alpha0, ab00 );
		PASTEMAC(CH,scals)( alpha0, ab01 );
		PASTEMAC(CH,scals)( alpha0, ab02 );
		PASTEMAC(CH,scals)( alpha0, ab03 );
		PASTEMAC(CH,scals)( alpha0, ab04 );
		PASTEMAC(CH,scals)( alpha0, ab05 );
		PASTEMAC(CH,scals)( alpha0, ab06 );
		PASTEMAC(CH,scals)( alpha0, ab07 );

		PASTEMAC(CH,scals)( alpha0, ab10 );
		PASTEMAC(CH,scals)( alpha0, ab11 );
		PASTEMAC(CH,scals)( alpha0, ab12 );
		PASTEMAC(CH,scals)( alpha0, ab13 );
		PASTEMAC(CH,scals)( alpha0, ab14 );
		PASTEMAC(CH,scals)( alpha0, ab15 );
		PASTEMAC(CH,scals)( alpha0, ab16 );
		PASTEMAC(CH,scals)( alpha0, ab17 );

		PASTEMAC(CH,scals)( alpha0, ab20 );
		PASTEMAC(CH,scals)( alpha0, ab21 );
		PASTEMAC(CH,scals)( alpha0, ab22 );
		PASTEMAC(CH,scals)( alpha0, ab23 );
		PASTEMAC(CH,scals)( alpha0, ab24 );
		PASTEMAC(CH,scals)( alpha0, ab25 );
		PASTEMAC(CH,scals)( alpha0, ab26 );
		PASTEMAC(CH,scals)( alpha0, ab27 );

		PASTEMAC(CH,scals)( alpha0, ab30 );
		PASTEMAC(CH,scals)( alpha0, ab31 );
		PASTEMAC(CH,scals)( alpha0, ab32 );
		PASTEMAC(CH,scals)( alpha0, ab33 );
		PASTEMAC(CH,scals)( alpha0, ab34 );
		PASTEMAC(CH,scals)( alpha0, ab35 );
		PASTEMAC(CH,scals)( alpha0, ab36 );
		PASTEMAC(CH,scals)( alpha0, ab37 );
	}

	// Output/accumulate intermediate result ab based on the storage
	// of c and the value of beta.
	if ( cs_c == 1 )
	{
		// C is row-stored.

		if ( PASTEMAC(CH,eq0)( *beta ) )
		{
			// beta == 0:
			//   c := ab

			PASTEMAC(CH,copys)( ab00, c[ 0*rs_c + 0 ] );
			PASTEMAC(CH,copys)( ab01, c[ 0*rs_c + 1 ] );
			PASTEMAC(CH,copys)( ab02, c[ 0*rs_c + 2 ] );
			PASTEMAC(CH,copys)( ab03, c[ 0*rs_c + 3 ] );
			PASTEMAC(CH,copys)( ab04, c[ 0*rs_c + 4 ] );
			PASTEMAC(CH,copys)( ab05, c[ 0*rs_c + 5 ] );
			PASTEMAC(CH,copys)( ab06, c[ 0*rs_c + 6 ] );
			PASTEMAC(CH,copys)( ab07, c[ 0*rs_c + 7 ] );

			PASTEMAC(CH,copys)( ab10, c[ 1*rs_c + 0 ] );
			PASTEMAC(CH,copys)( ab11, c[ 1*rs_c + 1 ] );
			PASTEMAC(CH,copys)( ab12, c[ 1*rs_c + 2 ] );
			PASTEMAC(CH,copys)( ab13, c[ 1*rs_c + 3 ] );
			PASTEMAC(CH,copys)( ab14, c[ 1*rs_c + 4 ] );
			PASTEMAC(CH,copys)( ab15, c[ 1*rs_c + 5 ] );
			PASTEMAC(CH,copys)( ab16, c[ 1*rs_c + 6 ] );
			PASTEMAC(CH,copys)( ab17, c[ 1*rs_c + 7 ] );

			PASTEMAC(CH,copys)( ab20, c[ 2*rs_c + 0 ] );
			PASTEMAC(CH,copys)( ab21, c[ 2*rs_c + 1 ] );
			PASTEMAC(CH,copys)( ab22, c[ 2*rs_c + 2 ] );
			PASTEMAC(CH,copys)( ab23, c[ 2*rs_c + 3 ] );
			PASTEMAC(CH,copys)( ab24, c[ 2*rs_c + 4 ] );
			PASTEMAC(CH,copys)( ab25, c[ 2*rs_c + 5 ] );
			PASTEMAC(CH,copys)( ab26, c[ 2*rs_c + 6 ] );
			PASTEMAC(CH,copys)( ab27, c[ 2*rs_c + 7 ] );

			PASTEMAC(CH,copys)( ab30, c[ 3*rs_c + 0 ] );
			PASTEMAC(CH,copys)( ab31, c[ 3*rs_c + 1 ] );
			PASTEMAC(CH,copys)( ab32, c[ 3*rs_c + 2 ] );
			PASTEMAC(CH,copys)( ab33, c[ 3*rs_c + 3 ] );
			PASTEMAC(CH,copys)( ab34, c[ 3*rs_c + 4 ] );
			PASTEMAC(CH,copys)( ab35, c[ 3*rs_c + 5 ] );
			PASTEMAC(CH,copys)( ab36, c[ 3*rs_c + 6 ] );
			PASTEMAC(CH,copys)( ab37, c[ 3*rs_c + 7 ] );
		}
		else
		{
			const CTYPE beta0 = *beta;

			// beta != 0:
			//   c := beta * c + ab

			PASTEMAC(CH,xpbys)( ab00, beta0, c[ 0*rs_c + 0 ] );
			PASTEMAC(CH,xpbys)( ab01, beta0, c[ 0*rs_c + 1 ] );
			PASTEMAC(CH,xpbys)( ab02, beta0, c[ 0*rs_c + 2 ] );
			PASTEMAC(CH,xpbys)( ab03, beta0, c[ 0*rs_c + 3 ] );
			PASTEMAC(CH,xpbys)( ab04, beta0, c[ 0*rs_c + 4 ] );
			PASTEMAC(CH,xpbys)( ab05, beta0, c[ 0*rs_c + 5 ] );
			PASTEMAC(CH,xpbys)( ab06, beta0, c[ 0*rs_c + 6 ] );
			PASTEMAC(CH,xpbys)( ab07, beta0, c[ 0*rs_c + 7 ] );

			PASTEMAC(CH,xpbys)( ab10, beta0, c[ 1*rs_c + 0 ] );
			PASTEMAC(CH,xpbys)( ab11, beta0, c[ 1*rs_c + 1 ] );
			PASTEMAC(CH,xpbys)( ab12, beta0, c[ 1*rs_c + 2 ] );
			PASTEMAC(CH,xpbys)( ab13, beta0, c[ 1*rs_c + 3 ] );
			PASTEMAC(CH,xpbys)( ab14, beta0, c[ 1*rs_c + 4 ] );
			PASTEMAC(CH,xpbys)( ab15, beta0, c[ 1*rs_c + 5 ] );
			PASTEMAC(CH,xpbys)( ab16, beta0, c[ 1*rs_c + 6 ] );
			PASTEMAC(CH,xpbys)( ab17, beta0, c[ 1*rs_c + 7 ] );

			PASTEMAC(CH,xpbys)( ab20, beta0, c[ 2*rs_c + 0 ] );
			PASTEMAC(CH,xpbys)( ab21, beta0, c[ 2*rs_c + 1 ] );
			PASTEMAC(CH,xpbys)( ab22, beta0, c[ 2*rs_c + 2 ] );
			PASTEMAC(CH,xpbys)( ab23, beta0, c[ 2*rs_c + 3 ] );
			PASTEMAC(CH,xpbys)( ab24, beta0, c[ 2*rs_c + 4 ] );
			PASTEMAC(CH,xpbys)( ab25, beta0, c[ 2*rs_c + 5 ] );
			PASTEMAC(CH,xpbys)( ab26, beta0, c[ 2*rs_c + 6 ] );
			PASTEMAC(CH,xpbys)( ab27, beta0, c[ 2*rs_c + 7 ] );

			PASTEMAC(CH,xpbys)( ab30, beta0, c[ 3*rs_c + 0 ] );
			PASTEMAC(CH,xpbys)( ab31, beta0, c[ 3*rs_c + 1 ] );
			PASTEMAC(CH,xpbys)( ab32, beta0, c[ 3*rs_c + 2 ] );
			PASTEMAC(CH,xpbys)( ab33, beta0, c[ 3*rs_c + 3 ] );
			PASTEMAC(CH,xpbys)( ab34, beta0, c[ 3*rs_c + 4 ] );
			PASTEMAC(CH,xpbys)( ab35, beta0, c[ 3*rs_c + 5 ] );
			PASTEMAC(CH,xpbys)( ab36, beta0, c[ 3*rs_c + 6 ] );
			PASTEMAC(CH,xpbys)( ab37, beta0, c[ 3*rs_c + 7 ] );
		}
	}
	else
	{
		// C is general-stored (or column-stored).

		if ( PASTEMAC(CH,eq0)( *beta ) )
		{
			// beta == 0:
			//   c := ab

			PASTEMAC(CH,copys)( ab00, c[ 0*rs_c + 0*cs_c ] );
			PASTEMAC(CH,copys)( ab01, c[ 0*rs_c + 1*cs_c ] );
			PASTEMAC(CH,copys)( ab02, c[ 0*rs_c + 2*cs_c ] );
			PASTEMAC(CH,copys)( ab03, c[ 0*rs_c + 3*cs_c ] );
			PASTEMAC(CH,copys)( ab04, c[ 0*rs_c + 4*cs_c ] );
			PASTEMAC(CH,copys)( ab05, c[ 0*rs_c + 5*cs_c ] );
			PASTEMAC(CH,copys)( ab06, c[ 0*rs_c + 6*cs_c ] );
			PASTEMAC(CH,copys)( ab07, c[ 0*rs_c + 7*cs_c ] );

			PASTEMAC(CH,copys)( ab10, c[ 1*rs_c + 0*cs_c ] );
			PASTEMAC(CH,copys)( ab11, c[ 1*rs_c + 1*cs_c ] );
			PASTEMAC(CH,copys)( ab12, c[ 1*rs_c + 2*cs_c ] );
			PASTEMAC(CH,copys)( ab13, c[ 1*rs_c + 3*cs_c ] );
			PASTEMAC(CH,copys)( ab14, c[ 1*rs_c + 4*cs_c ] );
			PASTEMAC(CH,copys)( ab15, c[ 1*rs_c + 5*cs_c ] );
			PASTEMAC(CH,copys)( ab16, c[ 1*rs_c + 6*cs_c ] );
			PASTEMAC(CH,copys)( ab17, c[ 1*rs_c + 7*cs_c ] );

			PASTEMAC(CH,copys)( ab20, c[ 2*rs_c + 0*cs_c ] );
			PASTEMAC(CH,copys)( ab21, c[ 2*rs_c + 1*cs_c ] );
			PASTEMAC(CH,copys)( ab22, c[ 2*rs_c + 2*cs_c ] );
			PASTEMAC(CH,copys)( ab23, c[ 2*rs_c + 3*cs_c ] );
			PASTEMAC(CH,copys)( ab24, c[ 2*rs_c + 4*cs_c ] );
			PASTEMAC(CH,copys)( ab25, c[ 2*rs_c + 5*cs_c ] );
			PASTEMAC(CH,copys)( ab26, c[ 2*rs_c + 6*cs_c ] );
			PASTEMAC(CH,copys)( ab27, c[ 2*rs_c + 7*cs_c ] );

			PASTEMAC(CH,copys)( ab30, c[ 3*rs_c + 0*cs_c ] );
			PASTEMAC(CH,copys)( ab31, c[ 3*rs_c + 1*cs_c ] );
			PASTEMAC(CH,copys)( ab32, c[ 3*rs_c + 2*cs_c ] );
			PASTEMAC(CH,copys)( ab33, c[ 3*rs_c + 3*cs_c ] );
			PASTEMAC(CH,copys)( ab34, c[ 3*rs_c + 4*cs_c ] );
			PASTEMAC(CH,copys)( ab35, c[ 3*rs_c + 5*cs_c ] );
			PASTEMAC(CH,copys)( ab36, c[ 3*rs_c + 6*cs_c ] );
			PASTEMAC(CH,copys)( ab37, c[ 3*rs_c + 7*cs_c ] );
		}
		else
		{
			const CTYPE beta0 = *beta;

			// beta != 0:
			//   c := beta * c + ab

			PASTEMAC(CH,xpbys)( ab00, beta0, c[ 0*rs_c + 0*cs_c ] );
			PASTEMAC(CH,xpbys)( ab01, beta0, c[ 0*rs_c + 1*cs_c ] );
			PASTEMAC(CH,xpbys)( ab02, beta0, c[ 0*rs_c + 2*cs_c ] );
			PASTEMAC(CH,xpbys)( ab03, beta0, c[ 0*rs_c + 3*cs_c ] );
			PASTEMAC(CH,xpbys)( ab04, beta0, c[ 0*rs_c + 4*cs_c ] );
			PASTEMAC(CH,xpbys)( ab05, beta0, c[ 0*rs_c + 5*cs_c ] );
			PASTEMAC(CH,xpbys)( ab06, beta0, c[ 0*rs_c + 6*cs_c ] );
			PASTEMAC(CH,xpbys)( ab07, beta0, c[ 0*rs_c + 7*cs_c ] );

			PASTEMAC(CH,xpbys)( ab10, beta0, c[ 1*rs_c + 0*cs_c ] );
			PASTEMAC(CH,xpbys)( ab11, beta0, c[ 1*rs_c + 1*cs_c ] );
			PASTEMAC(CH,xpbys)( ab12, beta0, c[ 1*rs_c + 2*cs_c ] );
			PASTEMAC(CH,xpbys)( ab13, beta0, c[ 1*rs_c + 3*cs_c ] );
			PASTEMAC(CH,xpbys)( ab14, beta0, c[ 1*rs_c + 4*cs_c ] );
			PASTEMAC(CH,xpbys)( ab15, beta0, c[ 1*rs_c + 5*cs_c ] );
			PASTEMAC(CH,xpbys)( ab16, beta0, c[ 1*rs_c + 6*cs_c ] );
			PASTEMAC(CH,xpbys)( ab17, beta0, c[ 1*rs_c + 7*cs_c ] );

			PASTEMAC(CH,xpbys)( ab20, beta0, c[ 2*rs_c + 0*cs_c ] );
			PASTEMAC(CH,xpbys)( ab21, beta0, c[ 2*rs_c + 1*cs_c ] );
			PASTEMAC(CH,xpbys)( ab22, beta0, c[ 2*rs_c + 2*cs_c ] );
			PASTEMAC(CH,xpbys)( ab23, beta0, c[ 2*rs_c + 3*cs_c ] );
			PASTEMAC(CH,xpbys)( ab24, beta0, c[ 2*rs_c + 4*cs_c ] );
			PASTEMAC(CH,xpbys)( ab25, beta0, c[ 2*rs_c + 5*cs_c ] );
			PASTEMAC(CH,xpbys)( ab26, beta0, c[ 2*rs_c + 6*cs_c ] );
			PASTEMAC(CH,xpbys)( ab27, beta0, c[ 2*rs_c + 7*cs_c ] );

			PASTEMAC(CH,xpbys)( ab30, beta0, c[ 3*rs_c + 0*cs_c ] );
			PASTEMAC(CH,xpbys)( ab31, beta0, c[ 3*rs_c + 1*cs_c ] );
			PASTEMAC(CH,xpbys)( ab32, beta0, c[ 3*rs_c + 2*cs_c ] );
			PASTEMAC(CH,xpbys)( ab33, beta0, c[ 3*rs_c + 3*cs_c ] );
			PASTEMAC(CH,xpbys)( ab34, beta0, c[ 3*rs_c + 4*cs_c ] );
			PASTEMAC(CH,xpbys)( ab35, beta0, c[ 3*rs_c + 5*cs_c ] );
			PASTEMAC(CH,xpbys)( ab36, beta0, c[ 3*rs_c + 6*cs_c ] );
			PASTEMAC(CH,xpbys)( ab37, beta0, c[ 3*rs_c + 7*cs_c ] );
		}
	}
}

View File

@@ -47,6 +47,8 @@
// -- Level-3 native micro-kernel prototype redefinitions ----------------------
// -- prototypes for completely generic level-3 microkernels --
#undef gemm_ukr_name
#define gemm_ukr_name GENARNAME(gemm)
#undef gemmtrsm_l_ukr_name
@@ -58,7 +60,8 @@
#undef trsm_u_ukr_name
#define trsm_u_ukr_name GENARNAME(trsm_u)
// Include the native micro-kernel API template.
// Instantiate prototypes for above functions via the native micro-kernel API
// template.
#include "bli_l3_ukr.h"
// -- Level-3 virtual micro-kernel prototype redefinitions ---------------------
@@ -117,7 +120,8 @@
#undef trsm1m_u_ukr_name
#define trsm1m_u_ukr_name GENARNAME(trsm1m_u)
// Include the virtual micro-kernel API template.
// Instantiate prototypes for above functions via the virtual micro-kernel API
// template.
#include "bli_l3_ind_ukr.h"
// -- Level-1m (packm/unpackm) kernel prototype redefinitions ------------------
@@ -230,7 +234,8 @@
#undef packm_16xk_1er_ker_name
#define packm_16xk_1er_ker_name GENARNAME(packm_16xk_1er)
// Include the level-1m kernel API template.
// Instantiate prototypes for above functions via the level-1m kernel API
// template.
#include "bli_l1m_ker.h"
// -- Level-1f kernel prototype redefinitions ----------------------------------
@@ -246,11 +251,14 @@
#undef dotxaxpyf_ker_name
#define dotxaxpyf_ker_name GENARNAME(dotxaxpyf)
// Include the level-1f kernel API template.
// Instantiate prototypes for above functions via the level-1f kernel API
// template.
#include "bli_l1f_ker.h"
// -- Level-1v kernel prototype redefinitions ----------------------------------
// -- prototypes for completely generic level-1v kernels --
#undef addv_ker_name
#define addv_ker_name GENARNAME(addv)
#undef amaxv_ker_name
@@ -280,10 +288,10 @@
#undef xpbyv_ker_name
#define xpbyv_ker_name GENARNAME(xpbyv)
// Include the level-1v kernel API template.
// Instantiate prototypes for above functions via the level-1v kernel API
// template.
#include "bli_l1v_ker.h"
// -- Macros to help concisely instantiate bli_func_init() ---------------------
#define gen_func_init_co( func_p, opname ) \
@@ -297,6 +305,7 @@
PASTEMAC(c,opname), PASTEMAC(z,opname) )
// -----------------------------------------------------------------------------
void GENBARNAME(cntx_init)
@@ -319,16 +328,16 @@ void GENBARNAME(cntx_init)
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_KR ], 1, 1, 1, 1 );
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 4, 4, 2 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 2, 2 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 512, 256, 256, 128 );
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 4, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 128, 64 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 );
bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 );
bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 4, 4, 2 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 4, 4, 2 );
bli_blksz_init_easy( &blkszs[ BLIS_XF ], 8, 4, 4, 2 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 6, 6, 6, 6 );
bli_blksz_init_easy( &blkszs[ BLIS_XF ], 4, 4, 4, 4 );
// Initialize the context with the default blocksize objects and their
// multiples.
@@ -372,7 +381,7 @@ void GENBARNAME(cntx_init)
gen_func_init( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name );
gen_func_init( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name );
bli_mbool_init( &mbools[ BLIS_GEMM_UKR ], FALSE, FALSE, FALSE, FALSE );
bli_mbool_init( &mbools[ BLIS_GEMM_UKR ], TRUE, TRUE, TRUE, TRUE );
bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE );
bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR ], FALSE, FALSE, FALSE, FALSE );
bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE );

View File

@@ -209,12 +209,12 @@ void libblis_test_dotaxpyv_experiment
}
else
{
bli_setsc( 0.0, -0.8, &alpha );
bli_setsc( 0.7, -0.1, &alpha );
}
// Randomize x and z, and save z.
libblis_test_vobj_randomize( params, FALSE, &x );
libblis_test_vobj_randomize( params, FALSE, &z );
libblis_test_vobj_randomize( params, TRUE, &x );
libblis_test_vobj_randomize( params, TRUE, &z );
bli_copyv( &z, &z_save );
// Create an alias to x for xt. (Note that it doesn't actually need to be