Development of AVX2 axpyv kernels for c and z datatypes.

Details
    - Added framework optimizations for the BLAS and CBLAS interfaces caxpyv_ (cblas_caxpyv) and zaxpyv_ (cblas_zaxpyv).
    - Added new AVX2 axpyv kernels for the c and z datatypes, targeting the AMD EPYC family.

AMD-Internal: [CPUPL-1231]

Change-Id: I9bc0c21fef9da84533adcef76427977430b27ea7
This commit is contained in:
Nageshwar Singh
2020-10-13 00:14:41 +05:30
parent e0e0760ed6
commit dbd7b28373
7 changed files with 1734 additions and 971 deletions

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -42,168 +42,194 @@
// n alpha x incx y incy
//void daxpyv_( int*, double*, double*, int*, double*, int* );
//#define PRINT
// #define PRINT
int main( int argc, char** argv )
{
obj_t x, y;
obj_t y_save;
obj_t alpha;
dim_t n;
dim_t p;
dim_t p_begin, p_end, p_inc;
int n_input;
num_t dt_x, dt_y;
num_t dt_alpha;
int r, n_repeats;
num_t dt;
obj_t x, y;
obj_t y_save;
obj_t alpha;
dim_t n;
dim_t p;
dim_t p_begin, p_end, p_inc;
int n_input;
num_t dt_x, dt_y;
num_t dt_alpha;
int r, n_repeats;
num_t dt;
double dtime;
double dtime_save;
double gflops;
double dtime;
double dtime_save;
double gflops;
bli_init();
bli_init();
n_repeats = 3;
n_repeats = 1;
#ifndef PRINT
p_begin = 40;
p_end = 4000;
p_inc = 40;
p_begin = 10;
p_end = 100;
p_inc = 10;
n_input = -1;
n_input = -1;
#else
p_begin = 16;
p_end = 16;
p_inc = 1;
p_begin = 16;
p_end = 16;
p_inc = 1;
n_input = 15;
n_input = 15;
#endif
#if 1
dt = BLIS_FLOAT;
//dt = BLIS_DOUBLE;
dt = BLIS_FLOAT;
//dt = BLIS_DOUBLE;
#else
//dt = BLIS_SCOMPLEX;
dt = BLIS_DCOMPLEX;
// dt = BLIS_SCOMPLEX;
// dt = BLIS_DCOMPLEX;
#endif
dt_x = dt_y = dt_alpha = dt;
dt_x = dt_y = dt_alpha = dt;
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
#ifdef BLIS
printf( "data_axpyv_blis" );
printf( "data_axpyv_blis" );
#else
printf( "data_axpyv_%s", BLAS );
printf( "data_axpyv_%s", BLAS );
#endif
printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )0, 0.0 );
printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )0, 0.0 );
//for ( p = p_begin; p <= p_end; p += p_inc )
for ( p = p_end; p_begin <= p; p -= p_inc )
{
//for ( p = p_begin; p <= p_end; p += p_inc )
for ( p = p_end; p_begin <= p; p -= p_inc )
{
if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
else n = ( dim_t ) n_input;
if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
else n = ( dim_t ) n_input;
bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
bli_obj_create( dt_x, n, 1, 0, 0, &x );
bli_obj_create( dt_y, n, 1, 0, 0, &y );
bli_obj_create( dt_y, n, 1, 0, 0, &y_save );
bli_obj_create( dt_x, n, 1, 0, 0, &x );
bli_obj_create( dt_y, n, 1, 0, 0, &y );
bli_obj_create( dt_y, n, 1, 0, 0, &y_save );
bli_randm( &x );
bli_randm( &y );
bli_randm( &x );
bli_randm( &y );
bli_setsc( (2.0/1.0), 0.0, &alpha );
bli_setsc( (2.0/1.0), 0.0, &alpha );
bli_copym( &y, &y_save );
bli_copym( &y, &y_save );
dtime_save = 1.0e9;
dtime_save = 1.0e9;
for ( r = 0; r < n_repeats; ++r )
{
bli_copym( &y_save, &y );
for ( r = 0; r < n_repeats; ++r )
{
bli_copym( &y_save, &y );
dtime = bli_clock();
dtime = bli_clock();
#ifdef PRINT
bli_printm( "alpha", &alpha, "%4.1f", "" );
bli_printm( "x", &x, "%4.1f", "" );
bli_printm( "y", &y, "%4.1f", "" );
bli_printm( "alpha", &alpha, "%4.1f", "" );
bli_printm( "x", &x, "%4.1f", "" );
bli_printm( "y", &y, "%4.1f", "" );
#endif
#ifdef BLIS
bli_axpyv( &alpha,
&x,
&y );
bli_axpyv( &alpha,
&x,
&y );
#else
if ( bli_is_float( dt ) )
{
f77_int nn = bli_obj_length( &x );
f77_int incx = bli_obj_vector_inc( &x );
f77_int incy = bli_obj_vector_inc( &y );
float* alphap = bli_obj_buffer( &alpha );
float* xp = bli_obj_buffer( &x );
float* yp = bli_obj_buffer( &y );
if ( bli_is_float( dt ) )
{
f77_int nn = bli_obj_length( &x );
f77_int incx = bli_obj_vector_inc( &x );
f77_int incy = bli_obj_vector_inc( &y );
float* alphap = bli_obj_buffer( &alpha );
float* xp = bli_obj_buffer( &x );
float* yp = bli_obj_buffer( &y );
saxpy_( &nn,
alphap,
xp, &incx,
yp, &incy );
saxpy_( &nn,
alphap,
xp, &incx,
yp, &incy );
}
else if ( bli_is_double( dt ) )
{
}
else if ( bli_is_double( dt ) )
{
f77_int nn = bli_obj_length( &x );
f77_int incx = bli_obj_vector_inc( &x );
f77_int incy = bli_obj_vector_inc( &y );
double* alphap = bli_obj_buffer( &alpha );
double* xp = bli_obj_buffer( &x );
double* yp = bli_obj_buffer( &y );
f77_int nn = bli_obj_length( &x );
f77_int incx = bli_obj_vector_inc( &x );
f77_int incy = bli_obj_vector_inc( &y );
double* alphap = bli_obj_buffer( &alpha );
double* xp = bli_obj_buffer( &x );
double* yp = bli_obj_buffer( &y );
daxpy_( &nn,
alphap,
xp, &incx,
yp, &incy );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int nn = bli_obj_length( &x );
f77_int incx = bli_obj_vector_inc( &x );
f77_int incy = bli_obj_vector_inc( &y );
void* alphap = bli_obj_buffer( &alpha );
void* xp = bli_obj_buffer( &x );
void* yp = bli_obj_buffer( &y );
daxpy_( &nn,
alphap,
xp, &incx,
yp, &incy );
}
caxpy_( &nn,
(scomplex*)alphap,
(scomplex*)xp, &incx,
(scomplex*)yp, &incy );
}
else if ( bli_is_dcomplex( dt ))
{
f77_int nn = bli_obj_length( &x );
f77_int incx = bli_obj_vector_inc( &x );
f77_int incy = bli_obj_vector_inc( &y );
void* alphap = bli_obj_buffer( &alpha );
void* xp = bli_obj_buffer( &x );
void* yp = bli_obj_buffer( &y );
zaxpy_( &nn,
(dcomplex*)alphap,
(dcomplex*)xp, &incx,
(dcomplex*)yp, &incy );
}
#endif
#ifdef PRINT
bli_printm( "y after", &y, "%4.1f", "" );
exit(1);
bli_printm( "y after", &y, "%4.1f", "" );
exit(1);
#endif
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
gflops = ( 2.0 * n ) / ( dtime_save * 1.0e9 );
gflops = ( 2.0 * n ) / ( dtime_save * 1.0e9 );
if ( bli_obj_is_complex( &x ) ) gflops *= 4.0;
#ifdef BLIS
printf( "data_axpyv_blis" );
printf( "data_axpyv_blis" );
#else
printf( "data_axpyv_%s", BLAS );
printf( "data_axpyv_%s", BLAS );
#endif
printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )n, gflops );
printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )n, gflops );
bli_obj_free( &alpha );
bli_obj_free( &alpha );
bli_obj_free( &x );
bli_obj_free( &y );
bli_obj_free( &y_save );
}
bli_obj_free( &x );
bli_obj_free( &y );
bli_obj_free( &y_save );
}
bli_finalize();
bli_finalize();
return 0;
return 0;
}