Files
blis/test/test_axpyv.c
Field G. Van Zee 16813335bd Merge branch 'amd' into rt
Details:
- Merged contributions made by AMD via 'amd' branch (see summary below).
  Special thanks to AMD for their contributions to-date, especially with
  regard to intrinsic- and assembly-based kernels.
- Added column storage output cases to microkernels in
  bli_gemm_zen_asm_d6x8.c and bli_gemmtrsm_l_zen_asm_d6x8.c. Even with
  the extra cost of transposing the microtile in registers, this is
  much faster than using the general storage case when the underlying
  matrix is column-stored.
- Added s and d assembly-based zen gemmtrsm_u microkernel (including
  column storage optimization mentioned above).
- Updated zen sub-configuration to reflect presence of new native
  kernels.
- Temporarily reverted zen sub-configuration's level-3 cache blocksizes
  to smaller haswell values.
- Temporarily disabled small matrix handling for zen configuration
  family in config/zen/bli_family_zen.h.
- Updated zen CFLAGS according to changes in 1e4365b.
- Updated haswell microkernels such that:
  - only one vzeroupper instruction is called prior to returning
  - movapd/movupd are used in lieu of movaps/movups for double-real
    microkernels. (Note that single-real microkernels still use
    movaps/movups.)
- Added kernel prototypes to kernels/zen/bli_kernels_zen.h, which is
  now included via frame/include/bli_arch_config.h.
- Minor updates to bli_amaxv_ref.c (and to inlined "test" implementation
  in testsuite/src/test_amaxv.c).
- Added early return for alpha == 0 in bli_dotxv_ref.c.
- Integrated changes from f07b176, including a fix for undefined
  behavior when executing the 1m method under certain conditions.
- Updated config_registry; no longer need haswell kernels for zen
  sub-configuration.
- Tweaked marginal and pass thresholds for dotxf.
- Reformatted level-1v, -1f, and -3 amd kernels and inserted additional
  comments.
- Updated LICENSE file to explicitly mention that parts are copyright
  UT-Austin and AMD.
- Added AMD copyright to header templates in build/templates.

Summary of previous changes from 'amd' branch.
- Added s and d assembly-based zen gemm microkernels (d6x8 and d8x6) and
  s and d assembly-based zen gemmtrsm_l microkernels (d6x8).
- Added s and d intrinsics-based zen kernels for amaxv, axpyv, dotv, dotxv,
  and scalv, with extra-unrolling variants for axpyv and scalv.
- Added a small matrix handler to bli_gemm_front(), with the handler
  implemented in kernels/zen/3/bli_gemm_small_matrix.c.
- Added additional logic to sumsqv that first attempts to compute the
  sum of the squares via dotv(). If there is a floating-point exception
  (FE_OVERFLOW), then the previous (numerically conservative) code is
  used; otherwise, the result of dotv() is square-rooted and stored as
  the result. This new implementation is only enabled when FE_OVERFLOW
  is #defined. If the macro is not #defined, then the previous
  implementation is used.
- Added axpyv and dotv standalone test drivers to test directory.
- Added zen support to old cpuid_x86.c driver in build/auto-detect/old.
- Added thread-local and __attribute__-related macros to bli_macro_defs.h.
2018-02-21 17:43:32 -06:00

193 lines
4.6 KiB
C

/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2017, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#include "blis.h"
// n alpha x incx y incy
//void daxpyv_( int*, double*, double*, int*, double*, int* );
//#define PRINT
/*
 * Standalone timing driver for the axpyv operation.
 *
 * For every problem size p in [p_begin, p_end] (stepping by p_inc) the
 * driver creates a 1x1 scalar alpha and three n x 1 objects (x, y and a
 * backup copy of y), randomizes x and y, sets alpha to 2.0, and then runs
 * the operation n_repeats times. y is restored from the backup before each
 * run, and the smallest elapsed time over all runs is kept. One line is
 * printed per problem size containing n, the best time and the GFLOPS
 * figure, where 2*n floating-point operations are assumed per call.
 *
 * Compile-time switches:
 *   BLIS   If defined, bli_axpyv() is timed. Otherwise saxpy_() or daxpy_()
 *          is called, selected by the datatype dt.
 *   BLAS   Only used when BLIS is not defined: it is passed to printf() as
 *          a "%s" argument to label the output (so it is presumably a
 *          string literal supplied by the build -- confirm in the Makefile).
 *   PRINT  If defined, a single problem of size 15 is run, the operands and
 *          the result are printed, and the program calls exit(1) right
 *          after the first run.
 *
 * argc and argv are not used. Returns 0 on normal completion.
 */
int main( int argc, char** argv )
{
	obj_t  x, y;
	obj_t  y_save;
	obj_t  alpha;
	dim_t  n;
	dim_t  p;
	dim_t  p_begin, p_end, p_inc;
	int    n_input;
	num_t  dt_x, dt_y;
	num_t  dt_alpha;
	int    r, n_repeats;
	num_t  dt;
	double dtime;
	double dtime_save;
	double gflops;

	bli_init();

	n_repeats = 3;

#ifndef PRINT
	p_begin = 40;
	p_end = 4000;
	p_inc = 40;
	n_input = -1;
#else
	p_begin = 16;
	p_end = 16;
	p_inc = 1;
	n_input = 15;
#endif

	// Datatype selection: flip the "#if" (and the commented-out lines) to
	// switch between the real and the complex datatypes.
#if 1
	dt = BLIS_FLOAT;
	//dt = BLIS_DOUBLE;
#else
	//dt = BLIS_SCOMPLEX;
	dt = BLIS_DCOMPLEX;
#endif

	dt_x = dt_y = dt_alpha = dt;

	for ( p = p_begin; p <= p_end; p += p_inc )
	{
		// A negative n_input is a multiplier applied to p; a non-negative
		// n_input is used directly as the (fixed) vector length.
		if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
		else n = ( dim_t ) n_input;

		bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
		bli_obj_create( dt_x, n, 1, 0, 0, &x );
		bli_obj_create( dt_y, n, 1, 0, 0, &y );
		bli_obj_create( dt_y, n, 1, 0, 0, &y_save );

		bli_randm( &x );
		bli_randm( &y );

		bli_setsc( (2.0/1.0), 0.0, &alpha );

		// Keep a pristine copy of y so that every repeat starts from the
		// same input.
		bli_copym( &y, &y_save );

		// Start from a large sentinel; the loop below keeps the minimum.
		dtime_save = 1.0e9;

		for ( r = 0; r < n_repeats; ++r )
		{
			bli_copym( &y_save, &y );

			dtime = bli_clock();

#ifdef PRINT
			bli_printm( "alpha", &alpha, "%4.1f", "" );
			bli_printm( "x", &x, "%4.1f", "" );
			bli_printm( "y", &y, "%4.1f", "" );
#endif

#ifdef BLIS
			bli_axpyv( &alpha,
			           &x,
			           &y );
#else
			// NOTE(review): only float and double are handled here. If dt
			// is set to a complex datatype while BLIS is not defined,
			// nothing runs between the two timer reads.
			if ( bli_is_float( dt ) )
			{
				f77_int nn = bli_obj_length( x );
				f77_int incx = bli_obj_vector_inc( x );
				f77_int incy = bli_obj_vector_inc( y );
				float* alphap = bli_obj_buffer( alpha );
				float* xp = bli_obj_buffer( x );
				float* yp = bli_obj_buffer( y );

				saxpy_( &nn,
				        alphap,
				        xp, &incx,
				        yp, &incy );
			}
			else if ( bli_is_double( dt ) )
			{
				f77_int nn = bli_obj_length( x );
				f77_int incx = bli_obj_vector_inc( x );
				f77_int incy = bli_obj_vector_inc( y );
				double* alphap = bli_obj_buffer( alpha );
				double* xp = bli_obj_buffer( x );
				double* yp = bli_obj_buffer( y );

				daxpy_( &nn,
				        alphap,
				        xp, &incx,
				        yp, &incy );
			}
#endif

#ifdef PRINT
			bli_printm( "y after", &y, "%4.1f", "" );
			exit(1);
#endif

			dtime_save = bli_clock_min_diff( dtime_save, dtime );
		}

		// 2*n flops per call are assumed (one multiply and one add per
		// element).
		gflops = ( 2.0 * n ) / ( dtime_save * 1.0e9 );

#ifdef BLIS
		printf( "data_axpyv_blis" );
#else
		printf( "data_axpyv_%s", BLAS );
#endif
		// The column slice is 1:3 because exactly three values (n, time,
		// gflops) are written. The line has the shape of a MATLAB/Octave
		// row assignment (presumably it is loaded as such), and there the
		// slice width must equal the number of values on the right.
		printf( "( %2lu, 1:3 ) = [ %4lu %10.3e %6.3f ];\n",
		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
		        ( unsigned long )n, dtime_save, gflops );

		bli_obj_free( &alpha );
		bli_obj_free( &x );
		bli_obj_free( &y );
		bli_obj_free( &y_save );
	}

	bli_finalize();

	return 0;
}