mirror of
https://github.com/amd/blis.git
synced 2026-05-11 01:30:00 +00:00
Merge branch 'amd' into rt
Details: - Merged contributions made by AMD via 'amd' branch (see summary below). Special thanks to AMD for their contributions to-date, especially with regard to intrinsic- and assembly-based kernels. - Added column storage output cases to microkernels in bli_gemm_zen_asm_d6x8.c and bli_gemmtrsm_l_zen_asm_d6x8.c. Even with the extra cost of transposing the microtile in registers, this is much faster than using the general storage case when the underlying matrix is column-stored. - Added s and d assembly-based zen gemmtrsm_u microkernels (including column storage optimization mentioned above). - Updated zen sub-configuration to reflect presence of new native kernels. - Temporarily reverted zen sub-configuration's level-3 cache blocksizes to smaller haswell values. - Temporarily disabled small matrix handling for zen configuration family in config/zen/bli_family_zen.h. - Updated zen CFLAGS according to changes in 1e4365b. - Updated haswell microkernels such that: - only one vzeroupper instruction is called prior to returning - movapd/movupd are used in lieu of movaps/movups for double-real microkernels. (Note that single-real microkernels still use movaps/movups.) - Added kernel prototypes to kernels/zen/bli_kernels_zen.h, which is now included via frame/include/bli_arch_config.h. - Minor updates to bli_amaxv_ref.c (and to inlined "test" implementation in testsuite/src/test_amaxv.c). - Added early return for alpha == 0 in bli_dotxv_ref.c. - Integrated changes from f07b176, including a fix for undefined behavior when executing the 1m method under certain conditions. - Updated config_registry; no longer need haswell kernels for zen sub-configuration. - Tweaked marginal and pass thresholds for dotxf. - Reformatted level-1v, -1f, and -3 amd kernels and inserted additional comments. - Updated LICENSE file to explicitly mention that parts are copyright UT-Austin and AMD. - Added AMD copyright to header templates in build/templates.
Summary of previous changes from 'amd' branch. - Added s and d assembly-based zen gemm microkernels (d6x8 and d8x6) and s and d assembly-based zen gemmtrsm_l microkernels (d6x8). - Added s and d intrinsics-based zen kernels for amaxv, axpyv, dotv, dotxv, and scalv, with extra-unrolling variants for axpyv and scalv. - Added a small matrix handler to bli_gemm_front(), with the handler implemented in kernels/zen/3/bli_gemm_small_matrix.c. - Added additional logic to sumsqv that first attempts to compute the sum of the squares via dotv(). If there is a floating-point exception (FE_OVERFLOW), then the previous (numerically conservative) code is used; otherwise, the result of dotv() is square-rooted and stored as the result. This new implementation is only enabled when FE_OVERFLOW is #defined. If the macro is not #defined, then the previous implementation is used. - Added axpyv and dotv standalone test drivers to test directory. - Added zen support to old cpuid_x86.c driver in build/auto-detect/old. - Added thread-local and __attribute__-related macros to bli_macro_defs.h.
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -33,6 +34,7 @@
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include <fenv.h>
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with typed operands.
|
||||
@@ -293,7 +295,129 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 )
|
||||
//INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 )
|
||||
GENTFUNCR( scomplex, float, c, s, normfv_unb_var1, sumsqv_unb_var1 )
|
||||
GENTFUNCR( dcomplex, double, z, d, normfv_unb_var1, sumsqv_unb_var1 )
|
||||
|
||||
#undef GENTFUNCR
|
||||
#ifdef FE_OVERFLOW
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype_r* norm, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* zero = PASTEMAC(chr,0); \
|
||||
ctype_r* one = PASTEMAC(chr,1); \
|
||||
ctype_r scale; \
|
||||
ctype_r sumsq; \
|
||||
ctype_r sqrt_sumsq; \
|
||||
\
|
||||
/* Initialize scale and sumsq to begin the summation. */ \
|
||||
PASTEMAC(chr,copys)( *zero, scale ); \
|
||||
PASTEMAC(chr,copys)( *one, sumsq ); \
|
||||
\
|
||||
/* An optimization: first try to use dotv to compute the sum of
|
||||
the squares of the vector. If no floating-point exceptions
|
||||
(specifically, overflow and invalid exceptions) were produced,
|
||||
then we accept the computed value and return early. The cost
|
||||
of this optimization is the "sunk" cost of the initial dotv
|
||||
when sumsqv must be used instead. However, we expect that the
|
||||
vast majority of use cases will not produce exceptions, and
|
||||
therefore only one pass through the data, via dotv, will be
|
||||
required. */ \
|
||||
if ( TRUE ) \
|
||||
{ \
|
||||
int f_exp_raised;\
|
||||
ctype sumsqc; \
|
||||
\
|
||||
feclearexcept( FE_ALL_EXCEPT );\
|
||||
\
|
||||
PASTEMAC(ch,dotv)\
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n,\
|
||||
x, incx, \
|
||||
x, incx, \
|
||||
&sumsqc, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
PASTEMAC2(ch,chr,copys)( sumsqc, sumsq ); \
|
||||
\
|
||||
f_exp_raised = fetestexcept( FE_OVERFLOW | FE_INVALID );\
|
||||
\
|
||||
if ( !f_exp_raised ) \
|
||||
{ \
|
||||
PASTEMAC(chr,sqrt2s)( sumsq, *norm ); \
|
||||
return; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Compute the sum of the squares of the vector. */ \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
n, \
|
||||
x, incx, \
|
||||
&scale, \
|
||||
&sumsq, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Compute: norm = scale * sqrt( sumsq ) */ \
|
||||
PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
|
||||
PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
|
||||
\
|
||||
/* Store the final value to the output variable. */ \
|
||||
PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
|
||||
}
|
||||
#else
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype_r* norm, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* zero = PASTEMAC(chr,0); \
|
||||
ctype_r* one = PASTEMAC(chr,1); \
|
||||
ctype_r scale; \
|
||||
ctype_r sumsq; \
|
||||
ctype_r sqrt_sumsq; \
|
||||
\
|
||||
/* Initialize scale and sumsq to begin the summation. */ \
|
||||
PASTEMAC(chr,copys)( *zero, scale ); \
|
||||
PASTEMAC(chr,copys)( *one, sumsq ); \
|
||||
\
|
||||
/* Compute the sum of the squares of the vector. */ \
|
||||
\
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
n, \
|
||||
x, incx, \
|
||||
&scale, \
|
||||
&sumsq, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Compute: norm = scale * sqrt( sumsq ) */ \
|
||||
PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
|
||||
PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
|
||||
\
|
||||
/* Store the final value to the output variable. */ \
|
||||
PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
|
||||
}
|
||||
#endif
|
||||
GENTFUNCR( float, float, s, s, normfv_unb_var1, sumsqv_unb_var1 )
|
||||
GENTFUNCR( double, double, d, d, normfv_unb_var1, sumsqv_unb_var1 )
|
||||
|
||||
|
||||
#undef GENTFUNCR
|
||||
@@ -898,7 +1022,7 @@ void PASTEMAC(ch,varname) \
|
||||
n_elem - 1, \
|
||||
&beta, \
|
||||
x0, incx, \
|
||||
cntx \
|
||||
cntx \
|
||||
); \
|
||||
*/ \
|
||||
} \
|
||||
@@ -937,7 +1061,7 @@ void PASTEMAC(ch,varname) \
|
||||
n_elem - 1, \
|
||||
&beta, \
|
||||
x2, incx, \
|
||||
cntx \
|
||||
cntx \
|
||||
); \
|
||||
*/ \
|
||||
} \
|
||||
|
||||
Reference in New Issue
Block a user