Merge branch 'amd' into rt

Details:
- Merged contributions made by AMD via 'amd' branch (see summary below).
  Special thanks to AMD for their contributions to date, especially with
  regard to intrinsic- and assembly-based kernels.
- Added column storage output cases to microkernels in
  bli_gemm_zen_asm_d6x8.c and bli_gemmtrsm_l_zen_asm_d6x8.c. Even with
  the extra cost of transposing the microtile in registers, this is
  much faster than using the general storage case when the underlying
  matrix is column-stored.
- Added s and d assembly-based zen gemmtrsm_u microkernel (including
  column storage optimization mentioned above).
- Updated zen sub-configuration to reflect presence of new native
  kernels.
- Temporarily reverted zen sub-configuration's level-3 cache blocksizes
  to smaller haswell values.
- Temporarily disabled small matrix handling for zen configuration
  family in config/zen/bli_family_zen.h.
- Updated zen CFLAGS according to changes in 1e4365b.
- Updated haswell microkernels such that:
  - only one vzeroupper instruction is called prior to returning
  - movapd/movupd are used in lieu of movaps/movups for double-real
    microkernels. (Note that single-real microkernels still use
    movaps/movups.)
- Added kernel prototypes to kernels/zen/bli_kernels_zen.h, which is
  now included via frame/include/bli_arch_config.h.
- Minor updates to bli_amaxv_ref.c (and to inlined "test" implementation
  in testsuite/src/test_amaxv.c).
- Added early return for alpha == 0 in bli_dotxv_ref.c.
- Integrated changes from f07b176, including a fix for undefined
  behavior when executing the 1m method under certain conditions.
- Updated config_registry; no longer need haswell kernels for zen
  sub-configuration.
- Tweaked marginal and pass thresholds for dotxf.
- Reformatted level-1v, -1f, and -3 amd kernels and inserted additional
  comments.
- Updated LICENSE file to explicitly mention that parts are copyright
  UT-Austin and AMD.
- Added AMD copyright to header templates in build/templates.

Summary of previous changes from the 'amd' branch:
- Added s and d assembly-based zen gemm microkernels (d6x8 and d8x6) and
  s and d assembly-based zen gemmtrsm_l microkernels (d6x8).
- Added s and d intrinsics-based zen kernels for amaxv, axpyv, dotv, dotxv,
  and scalv, with extra-unrolling variants for axpyv and scalv.
- Added a small matrix handler to bli_gemm_front(), with the handler
  implemented in kernels/zen/3/bli_gemm_small_matrix.c.
- Added additional logic to sumsqv that first attempts to compute the
  sum of the squares via dotv(). If there is a floating-point exception
  (FE_OVERFLOW), then the previous (numerically conservative) code is
  used; otherwise, the result of dotv() is square-rooted and stored as
  the result. This new implementation is only enabled when FE_OVERFLOW
  is #defined. If the macro is not #defined, then the previous
  implementation is used.
- Added axpyv and dotv standalone test drivers to test directory.
- Added zen support to old cpuid_x86.c driver in build/auto-detect/old.
- Added thread-local and __attribute__-related macros to bli_macro_defs.h.
This commit is contained in:
Field G. Van Zee
2018-02-21 17:43:32 -06:00
46 changed files with 17161 additions and 255 deletions

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2017, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -33,6 +34,7 @@
*/
#include "blis.h"
#include <fenv.h>
//
// Define BLAS-like interfaces with typed operands.
@@ -293,7 +295,129 @@ void PASTEMAC(ch,varname) \
PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
}
INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 )
// The all-datatype insertion macro is left commented out here. Only the
// complex variants (c, z) of normfv_unb_var1 are generated from the
// GENTFUNCR definition that ends just above; the real variants (s, d) are
// generated further below from a separately selected GENTFUNCR definition.
//INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 )
GENTFUNCR( scomplex, float, c, s, normfv_unb_var1, sumsqv_unb_var1 )
GENTFUNCR( dcomplex, double, z, d, normfv_unb_var1, sumsqv_unb_var1 )
// Retire this definition so that GENTFUNCR can be redefined afterwards.
#undef GENTFUNCR
#ifdef FE_OVERFLOW
/* Exception flags that disqualify the dotv-based fast path in the
   definition below. FE_UNDERFLOW is included whenever the platform
   provides it: squaring very small elements can underflow (possibly all
   the way to zero), which would otherwise produce a silently inaccurate
   norm instead of falling back to the scaled summation. */
#ifdef FE_UNDERFLOW
#define BLIS_NORMFV_FE_MASK ( FE_OVERFLOW | FE_INVALID | FE_UNDERFLOW )
#else
#define BLIS_NORMFV_FE_MASK ( FE_OVERFLOW | FE_INVALID )
#endif

/* normfv_unb_var1: definition selected when <fenv.h> defines FE_OVERFLOW.

   Computes the square root of the sum of the squares of the vector x
   (n, x, and incx are passed through unchanged to dotv and to the sumsqv
   kernel) and stores the result to *norm.

   Strategy:
   1. Fast path: compute the sum of the squares with a single
      dotv( x, x ) and accept its square root if none of the exceptions
      in BLIS_NORMFV_FE_MASK were raised while doing so.
   2. Fallback: otherwise run the sumsqv kernel, which represents the sum
      as a (scale, sumsq) pair, and return scale * sqrt( sumsq ).

   The fast-path value is held in its own variable so that a rejected
   attempt cannot disturb the (scale, sumsq) = (0, 1) state that the
   fallback kernel is started from.

   Side effect: the floating-point exception flags listed in
   BLIS_NORMFV_FE_MASK are cleared before the fast path is attempted, so
   any such flags that were pending on entry are lost.

   NOTE(review): dotv is invoked without conjugation, so the fast path
   presumably yields a sum of squared magnitudes only for real datatypes;
   confirm before instantiating this definition for complex types. */
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx  \
     ) \
{ \
    ctype_r* zero = PASTEMAC(chr,0); \
    ctype_r* one  = PASTEMAC(chr,1); \
    ctype_r  scale; \
    ctype_r  sumsq; \
    ctype_r  sqrt_sumsq; \
\
    /* Initialize scale and sumsq to begin the summation. These values are \
       only consumed by the fallback kernel and must still hold 0 and 1, \
       respectively, when it is reached. */ \
    PASTEMAC(chr,copys)( *zero, scale ); \
    PASTEMAC(chr,copys)( *one,  sumsq ); \
\
    /* An optimization: first try to use dotv to compute the sum of the \
       squares of the vector. If no relevant floating-point exceptions \
       were produced, then we accept the computed value and return early. \
       The cost of this optimization is the "sunk" cost of the initial \
       dotv when sumsqv must be used instead. However, we expect that the \
       vast majority of use cases will not produce exceptions, and \
       therefore only one pass through the data, via dotv, will be \
       required. */ \
    { \
        int     f_exp_raised; \
        ctype   sumsqc; \
        ctype_r sumsq_dot; \
\
        /* Clear only the flags that are tested below, leaving any other \
           pending exception flags untouched. */ \
        feclearexcept( BLIS_NORMFV_FE_MASK ); \
\
        PASTEMAC(ch,dotv) \
        ( \
          BLIS_NO_CONJUGATE, \
          BLIS_NO_CONJUGATE, \
          n, \
          x, incx, \
          x, incx, \
          &sumsqc, \
          cntx  \
        ); \
\
        /* Keep the candidate result separate from sumsq so that the \
           fallback below does not start from an overflowed or invalid \
           value. */ \
        PASTEMAC2(ch,chr,copys)( sumsqc, sumsq_dot ); \
\
        f_exp_raised = fetestexcept( BLIS_NORMFV_FE_MASK ); \
\
        if ( !f_exp_raised ) \
        { \
            PASTEMAC(chr,sqrt2s)( sumsq_dot, *norm ); \
            return; \
        } \
    } \
\
    /* Compute the sum of the squares of the vector. */ \
    PASTEMAC(ch,kername) \
    ( \
      n, \
      x, incx, \
      &scale, \
      &sumsq, \
      cntx  \
    ); \
\
    /* Compute: norm = scale * sqrt( sumsq ) */ \
    PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
    PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
\
    /* Store the final value to the output variable. */ \
    PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
}
#else
/* normfv_unb_var1: definition used when FE_OVERFLOW is not defined.
   Obtains the sum of the squares of x from the sumsqv kernel as a
   (scale, sumsq) pair and stores scale * sqrt( sumsq ) to *norm. */
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx  \
     ) \
{ \
    ctype_r* zero_r = PASTEMAC(chr,0); \
    ctype_r* one_r  = PASTEMAC(chr,1); \
    ctype_r  ssq_scale; \
    ctype_r  ssq_sum; \
    ctype_r  nrm; \
\
    /* Seed the running pair: scale starts at zero, sumsq at one. */ \
    PASTEMAC(chr,copys)( *zero_r, ssq_scale ); \
    PASTEMAC(chr,copys)( *one_r,  ssq_sum ); \
\
    /* Let the kernel accumulate the sum of the squares of x into the \
       (ssq_scale, ssq_sum) pair. */ \
    PASTEMAC(ch,kername) \
    ( \
      n, \
      x, incx, \
      &ssq_scale, \
      &ssq_sum, \
      cntx  \
    ); \
\
    /* nrm = ssq_scale * sqrt( ssq_sum ) */ \
    PASTEMAC(chr,sqrt2s)( ssq_sum, nrm ); \
    PASTEMAC(chr,scals)( ssq_scale, nrm ); \
\
    /* Hand the result back through the output argument. */ \
    PASTEMAC(chr,copys)( nrm, *norm ); \
}
#endif
// Generate the real-domain variants (s, d) of normfv_unb_var1 from
// whichever GENTFUNCR definition the FE_OVERFLOW conditional above selected.
GENTFUNCR( float, float, s, s, normfv_unb_var1, sumsqv_unb_var1 )
GENTFUNCR( double, double, d, d, normfv_unb_var1, sumsqv_unb_var1 )
// Retire the definition so that later code may define GENTFUNCR again.
#undef GENTFUNCR
@@ -898,7 +1022,7 @@ void PASTEMAC(ch,varname) \
n_elem - 1, \
&beta, \
x0, incx, \
cntx \
cntx \
); \
*/ \
} \
@@ -937,7 +1061,7 @@ void PASTEMAC(ch,varname) \
n_elem - 1, \
&beta, \
x2, incx, \
cntx \
cntx \
); \
*/ \
} \