Adding a simd kernel for copyv function

Details:
    - Separate kernel for copyv function added to improve performance.
    - Modified cntx_init file in zen and zen2 configuration
    - Added test_copyv.c in test folder
    - Modified test/Makefile to include test_copyv.c

Change-Id: I297f539f2ddd2d71997b127a71a460991cd07b41
Signed-off-by: Kiran N D <kiran.Devrajegowda@amd.com>
AMD-Internal: [CPUPL-818]
This commit is contained in:
Devrajegowda, Kiran
2020-04-16 12:04:10 +05:30
committed by Kiran Devrajegowda
parent ba00f75f64
commit 4caee59466
10 changed files with 896 additions and 35 deletions

View File

@@ -82,7 +82,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
12,
14,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
@@ -110,6 +110,9 @@ void bli_cntx_init_zen( cntx_t* cntx )
#endif
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
cntx
);

View File

@@ -83,19 +83,20 @@ void bli_cntx_init_zen2( cntx_t* cntx )
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
// axpyv
// axpyv
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
// dotxv
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
// scalv
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
@@ -103,7 +104,11 @@ void bli_cntx_init_zen2( cntx_t* cntx )
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
cntx
//copy
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.

View File

@@ -5,18 +5,19 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -62,25 +63,181 @@ void PASTEF77(ch,blasname) \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
n0, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
\
/* Finalize BLIS. */ \
bli_finalize_auto(); \
bli_convert_blas_incv(n0, (ftype*)x, *incx, x0, incx0); \
bli_convert_blas_incv(n0, (ftype*)y, *incy, y0, incy0); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch, blisname, BLIS_TAPI_EX_SUF) \
(\
BLIS_NO_CONJUGATE, \
n0, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
\
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( copy, copyv )
#endif
#ifdef BLIS_CONFIG_ZEN2
void scopy_
(
const f77_int* n,
const float* x, const f77_int* incx,
float* y, const f77_int* incy
)
{
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if (*n < 0)
n0 = (dim_t)0;
else
n0 = (dim_t)(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if (*incx < 0)
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (float*)((x)+(n0 - 1)*(-*incx));
incx0 = (inc_t)(*incx);
}
else
{
x0 = (float*)(x);
incx0 = (inc_t)(*incx);
}
if (*incy < 0)
{
y0 = (y)+(n0 - 1)*(-*incy);
incy0 = (inc_t)(*incy);
}
else
{
y0 = (y);
incy0 = (inc_t)(*incy);
}
/* Call BLIS kernel */
bli_scopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
/* Finalize BLIS. */
// bli_finalize_auto();
}
void dcopy_
(
const f77_int* n,
const double* x, const f77_int* incx,
double* y, const f77_int* incy
)
{
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if (*n < 0)
n0 = (dim_t)0;
else
n0 = (dim_t)(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if (*incx < 0)
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (double*)((x)+(n0 - 1)*(-*incx));
incx0 = (inc_t)(*incx);
}
else
{
x0 = (double*)(x);
incx0 = (inc_t)(*incx);
}
if (*incy < 0)
{
y0 = (y)+(n0 - 1)*(-*incy);
incy0 = (inc_t)(*incy);
}
else
{
y0 = (y);
incy0 = (inc_t)(*incy);
}
/* Call BLIS kernel */
bli_dcopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
/* Finalize BLIS. */
// bli_finalize_auto();
}
INSERT_GENTFUNC_BLAS_ZEN2(copy, copyv)
#else
INSERT_GENTFUNC_BLAS(copy, copyv)
#endif
#endif

View File

@@ -7,7 +7,10 @@
*
* Written by Keita Teranishi. 2/11/1998
*
* Copyright (C) 2020, Advanced Micro Devices, Inc.
*
*/
#include "cblas.h"
#include "cblas_f77.h"
void cblas_dcopy( f77_int N, const double *X,
@@ -20,6 +23,75 @@ void cblas_dcopy( f77_int N, const double *X,
#define F77_incX incX
#define F77_incY incY
#endif
#ifdef BLIS_CONFIG_ZEN2
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( F77_N < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(F77_N);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( F77_incX < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (double*)((X) + (n0-1)*(-F77_incX));
incx0 = ( inc_t )(F77_incX);
}
else
{
x0 = (double*)(X);
incx0 = ( inc_t )(F77_incX);
}
if ( F77_incY < 0 )
{
y0 = (Y) + (n0-1)*(-F77_incY);
incy0 = ( inc_t )(F77_incY);
}
else
{
y0 = (Y);
incy0 = ( inc_t )(F77_incY);
}
/* Call BLIS kernel */
bli_dcopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
/* Finalize BLIS. */
// bli_finalize_auto();
#else
F77_dcopy( &F77_N, X, &F77_incX, Y, &F77_incY);
#endif
}
#endif

View File

@@ -7,6 +7,8 @@
*
* Written by Keita Teranishi. 2/11/1998
*
* Copyright (C) 2020, Advanced Micro Devices, Inc.
*
*/
#include "cblas.h"
#include "cblas_f77.h"
@@ -20,6 +22,77 @@ void cblas_scopy( f77_int N, const float *X,
#define F77_incX incX
#define F77_incY incY
#endif
F77_scopy( &F77_N, X, &F77_incX, Y, &F77_incY);
#ifdef BLIS_CONFIG_ZEN2
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( F77_N < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(F77_N);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( F77_incX < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = (float*)((X) + (n0-1)*(-F77_incX));
incx0 = ( inc_t )(F77_incX);
}
else
{
x0 = (float*)(X);
incx0 = ( inc_t )(F77_incX);
}
if ( F77_incY < 0 )
{
y0 = (Y) + (n0-1)*(-F77_incY);
incy0 = ( inc_t )(F77_incY);
}
else
{
y0 = (Y);
incy0 = ( inc_t )(F77_incY);
}
/* Call BLIS kernel */
bli_scopyv_zen_int
(
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
NULL
);
/* Finalize BLIS. */
// bli_finalize_auto();
#else
F77_scopy( &F77_N, X, &F77_incX, Y, &F77_incY);
#endif
}
#endif

View File

@@ -56,12 +56,10 @@ GENTFUNC( double, d, blasname, blisname ) \
GENTFUNC( scomplex, c, blasname, blisname ) \
GENTFUNC( dcomplex, z, blasname, blisname )
#define INSERT_GENTFUNC_BLAS_ZEN2( blasname, blisname ) \
\
GENTFUNC( scomplex, c, blasname, blisname ) \
GENTFUNC( dcomplex, z, blasname, blisname )
// -- Basic one-operand macro with real domain only --

View File

@@ -0,0 +1,330 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "immintrin.h"
#include "blis.h"
// -----------------------------------------------------------------------------
void bli_scopyv_zen_int
(
conj_t conjx,
dim_t n,
float* restrict x, inc_t incx,
float* restrict y, inc_t incy,
cntx_t* restrict cntx
)
{
const dim_t num_elem_per_reg = 8;
__m256 xv[16];
dim_t i = 0;
// If the vector dimension is zero return early.
if (bli_zero_dim1(n)) return;
if (incx == 1 && incy == 1)
{
#if 0
PRAGMA_SIMD
for (i = 0; i < n; i++)
{
y[i] = x[i];
}
#endif
#if 0
memcpy(y, x, n << 2);
#endif
#if 1
// For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128
// for example if n = 255
// n & ~0x7F results in 128: copy from 0 to 128 happens in first loop
// n & ~0x3F results in 192: copy from 128 to 192 happens in second loop
// n & ~0x1F results in 224: copy from 128 to 192 happens in third loop and so on.
for (i = 0; i < (n & (~0x7F)); i += 128)
{
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8);
xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9);
xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10);
xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11);
xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12);
xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13);
xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14);
xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15);
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
_mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]);
_mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]);
_mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]);
_mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]);
_mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]);
_mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]);
_mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]);
_mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]);
y += 128;
x += 128;
}
for (; i < (n & (~0x3F)); i += 64)
{
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
y += 64;
x += 64;
}
for (; i < (n & (~0x1F)); i += 32)
{
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
y += 32;
x += 32;
}
for (; i < (n & (~0x0F)); i += 16)
{
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
y += 16;
x += 16;
}
for (; i < (n & (~0x07)); i += 8)
{
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
y += 8;
x += 8;
}
for (; i < n; i++)
{
*y++ = *x++;
}
#endif
}
else
{
for (dim_t i = 0; i < n; ++i)
{
*y = *x;
x += incx;
y += incy;
}
}
}
// -----------------------------------------------------------------------------
void bli_dcopyv_zen_int
(
conj_t conjx,
dim_t n,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy,
cntx_t* restrict cntx
)
{
const dim_t num_elem_per_reg = 4;
__m256d xv[16];
dim_t i = 0;
// If the vector dimension is zero return early.
if (bli_zero_dim1(n)) return;
if (incx == 1 && incy == 1)
{
#if 0
PRAGMA_SIMD
for (i = 0; i < n; ++i)
{
y[i] = x[i];
}
#endif
#if 0
memcpy(y, x, n << 3);
#endif
#if 1
// n & (~0x3F) = n & 0xFFFFFFC0 -> this masks the numbers less than 64,
// the copy operation will be done for the multiples of 64
for (i = 0; i < (n & (~0x3F)); i += 64)
{
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8);
xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9);
xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10);
xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11);
xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12);
xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13);
xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14);
xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15);
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
_mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]);
_mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]);
_mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]);
_mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]);
_mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]);
_mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]);
_mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]);
_mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]);
y += num_elem_per_reg * 16;
x += num_elem_per_reg * 16;
}
for (; i < (n & (~0x1F)); i += 32)
{
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
y += num_elem_per_reg * 8;
x += num_elem_per_reg * 8;
}
for (; i < (n & (~0xF)); i += 16)
{
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
y += num_elem_per_reg * 4;
x += num_elem_per_reg * 4;
}
for (; i < (n & (~0x07)); i += 8)
{
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
y += num_elem_per_reg * 2;
x += num_elem_per_reg * 2;
}
for (; i < (n & (~0x03)); i += 4)
{
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
y += num_elem_per_reg;
x += num_elem_per_reg;
}
for (; i < n; i++)
{
*y++ = *x++;
}
#endif
}
else
{
for ( i = 0; i < n; ++i)
{
*y = *x;
x += incx;
y += incy;
}
}
}

View File

@@ -71,6 +71,9 @@ SCALV_KER_PROT( double, d, scalv_zen_int )
SWAPV_KER_PROT(float, s, swapv_zen_int8 )
SWAPV_KER_PROT(double, d, swapv_zen_int8 )
// copyv (intrinsics)
COPYV_KER_PROT( float, s, copyv_zen_int )
COPYV_KER_PROT( double, d, copyv_zen_int )
// -- level-1f --

View File

@@ -176,13 +176,14 @@ blis: \
test_swapv_blis.x \
test_trmv_blis.x \
test_trsv_blis.x \
test_copyv_blis.x \
\
test_gemm_blis.x \
test_hemm_blis.x \
test_herk_blis.x \
test_her2k_blis.x \
test_trmm_blis.x \
test_trsm_blis.x
test_trsm_blis.x
openblas: \
test_dotv_openblas.x \
@@ -195,6 +196,7 @@ openblas: \
test_swapv_openblas.x \
test_trmv_openblas.x \
test_trsv_openblas.x \
test_copyv_openblas.x \
\
test_gemm_openblas.x \
test_hemm_openblas.x \
@@ -214,6 +216,7 @@ atlas: \
test_swapv_atlas.x \
test_trmv_atlas.x \
test_trsv_atlas.x \
test_copyv_atlas.x \
\
test_gemm_atlas.x \
test_hemm_atlas.x \
@@ -232,6 +235,7 @@ mkl: test_dotv_mkl.x \
test_swapv_mkl.x \
test_trmv_mkl.x \
test_trsv_mkl.x \
test_copyv_mkl.x \
\
test_gemm_mkl.x \
test_hemm_mkl.x \
@@ -250,6 +254,7 @@ essl: test_dotv_essl.x \
test_swapv_essl.x \
test_trmv_essl.x \
test_trsv_essl.x \
test_copyv_essl.x\
\
test_gemm_essl.x \
test_hemm_essl.x \
@@ -268,6 +273,7 @@ mac: test_dotv_mac.x \
test_swapv_mac.x \
test_trmv_mac.x \
test_trsv_mac.x \
test_copyv_mac.x \
\
test_gemm_mac.x \
test_hemm_mac.x \

214
test/test_copyv.c Normal file
View File

@@ -0,0 +1,214 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#include "blis.h"
//#define BLIS_ACCURACY_TEST
#ifdef BLIS_ACCURACY_TEST
bool_t scompare_result(int n, float *x, int incx, float *y, int incy) {
for (int i = 0; i < n; i++) {
if ((*x) != (*y)) {
printf("%4f != %4f at location %d\n", *x, *y, i);
return FALSE;
}
x += incx;
y += incy;
}
return TRUE;
}
bool_t dcompare_result(int n, double *x, int incx, double *y, int incy) {
for (int i = 0; i < n; i++) {
if ((*x) != (*y)) {
printf("%4f != %4f at location %d\n", *x, *y, i);
return FALSE;
}
x += incx;
y += incy;
}
return TRUE;
}
#endif
int main(int argc, char** argv)
{
obj_t x, y;
dim_t n;
dim_t p;
dim_t p_begin, p_end, p_inc;
int n_input, sizeof_dt;
int r, n_repeats;
num_t dt;
double dtime;
double dtime_save;
double Gbps;
//bli_init();
n_repeats = 100000;
#ifndef PRINT
p_begin = 200;
p_end = 100000;
p_inc = 200;
n_input = -1;
#else
p_begin = 16;
p_end = 16;
p_inc = 1;
n_input = 16;
#endif
#if 1
// dt = BLIS_FLOAT;
dt = BLIS_DOUBLE;
#else
//dt = BLIS_SCOMPLEX;
dt = BLIS_DCOMPLEX;
#endif
if (dt == BLIS_DOUBLE)
sizeof_dt = sizeof(double);
else if (dt == BLIS_FLOAT)
sizeof_dt = sizeof(float);
printf("executable\t n\t GBs per sec\n");
for (p = p_begin; p <= p_end; p += p_inc)
{
if (n_input < 0) n = p * (dim_t)abs(n_input);
else n = (dim_t)n_input;
bli_obj_create(dt, n, 1, 0, 0, &x);
bli_obj_create(dt, n, 1, 0, 0, &y);
bli_randm(&x);
dtime_save = DBL_MAX;
for (r = 0; r < n_repeats; ++r)
{
dtime = bli_clock();
#ifdef BLIS
bli_copyv(&x,
&y
);
#else
if (bli_is_float(dt))
{
f77_int nn = bli_obj_length(&x);
f77_int incx = bli_obj_vector_inc(&x);
float* xp = bli_obj_buffer(&x);
f77_int incy = bli_obj_vector_inc(&y);
float* yp = bli_obj_buffer(&y);
scopy_(&nn,
xp, &incx,
yp, &incy);
}
else if (bli_is_double(dt))
{
f77_int nn = bli_obj_length(&x);
f77_int incx = bli_obj_vector_inc(&x);
double* xp = bli_obj_buffer(&x);
f77_int incy = bli_obj_vector_inc(&y);
double* yp = bli_obj_buffer(&y);
dcopy_(&nn,
xp, &incx,
yp, &incy
);
}
#endif
dtime_save = bli_clock_min_diff(dtime_save, dtime);
#ifdef BLIS_ACCURACY_TEST
if (dt == BLIS_FLOAT) {
int nn = bli_obj_length(&x);
int incx = bli_obj_vector_inc(&x);
float* xp = bli_obj_buffer(&x);
int incy = bli_obj_vector_inc(&y);
float* yp = bli_obj_buffer(&y);
if (scompare_result(nn, xp, incx, yp, incy))
printf("Copy Successful\n");
else
printf("ALERT!!! Copy Failed\n");
}
if (dt == BLIS_DOUBLE) {
int nn = bli_obj_length(&x);
int incx = bli_obj_vector_inc(&x);
double* xp = bli_obj_buffer(&x);
int incy = bli_obj_vector_inc(&y);
double* yp = bli_obj_buffer(&y);
if (dcompare_result(nn, xp, incx, yp, incy))
printf("Copy Successful\n");
else
printf("ALERT!!! Copy Failed\n");
}
#endif
}
// Size of the vectors are incrementd by 1000, to test wide range of inputs.
if (p >= 1000)
p_inc = 1000;
if (p >= 10000)
p_inc = 10000;
Gbps = (n * sizeof_dt) / (dtime_save * 1.0e9);
#ifdef BLIS
printf("data_copyv_blis\t");
#else
printf("data_copyv_%s\t", BLAS);
#endif
printf("%4lu\t %7.2f\n",
(unsigned long)n, Gbps);
bli_obj_free(&x);
bli_obj_free(&y);
}
// bli_finalize();
return 0;
}