mirror of
https://github.com/amd/blis.git
synced 2026-05-12 01:59:59 +00:00
Adding a simd kernel for copyv function
Details:
- Separate kernel for copyv function added to improve performance.
- Modified cntx_init file in zen and zen2 configuration
- Added test_copyv.c in test folder
- Modified test/Makefile to include test_copyv.c
Change-Id: I297f539f2ddd2d71997b127a71a460991cd07b41
Signed-off-by: Kiran N D <kiran.Devrajegowda@amd.com>
AMD-Internal: [CPUPL-818]
This commit is contained in:
committed by
Kiran Devrajegowda
parent
ba00f75f64
commit
4caee59466
@@ -82,7 +82,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
// Update the context with optimized level-1v kernels.
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
12,
|
||||
14,
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
@@ -110,6 +110,9 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
#endif
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
|
||||
|
||||
cntx
|
||||
);
|
||||
|
||||
@@ -83,19 +83,20 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
// axpyv
|
||||
|
||||
// axpyv
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
|
||||
// dotxv
|
||||
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||
// scalv
|
||||
|
||||
// scalv
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
|
||||
@@ -103,7 +104,11 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||
|
||||
cntx
|
||||
//copy
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
|
||||
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||
|
||||
@@ -5,18 +5,19 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
@@ -62,25 +63,181 @@ void PASTEF77(ch,blasname) \
|
||||
\
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */ \
|
||||
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
|
||||
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
|
||||
\
|
||||
/* Call BLIS interface. */ \
|
||||
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n0, \
|
||||
x0, incx0, \
|
||||
y0, incy0, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
bli_convert_blas_incv(n0, (ftype*)x, *incx, x0, incx0); \
|
||||
bli_convert_blas_incv(n0, (ftype*)y, *incy, y0, incy0); \
|
||||
\
|
||||
/* Call BLIS interface. */ \
|
||||
PASTEMAC2(ch, blisname, BLIS_TAPI_EX_SUF) \
|
||||
(\
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n0, \
|
||||
x0, incx0, \
|
||||
y0, incy0, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
INSERT_GENTFUNC_BLAS( copy, copyv )
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_ZEN2
|
||||
|
||||
void scopy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const float* x, const f77_int* incx,
|
||||
float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if (*n < 0)
|
||||
n0 = (dim_t)0;
|
||||
else
|
||||
n0 = (dim_t)(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if (*incx < 0)
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (float*)((x)+(n0 - 1)*(-*incx));
|
||||
incx0 = (inc_t)(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (float*)(x);
|
||||
incx0 = (inc_t)(*incx);
|
||||
}
|
||||
|
||||
if (*incy < 0)
|
||||
{
|
||||
y0 = (y)+(n0 - 1)*(-*incy);
|
||||
incy0 = (inc_t)(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (y);
|
||||
incy0 = (inc_t)(*incy);
|
||||
}
|
||||
|
||||
|
||||
/* Call BLIS kernel */
|
||||
bli_scopyv_zen_int
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
void dcopy_
|
||||
(
|
||||
const f77_int* n,
|
||||
const double* x, const f77_int* incx,
|
||||
double* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if (*n < 0)
|
||||
n0 = (dim_t)0;
|
||||
else
|
||||
n0 = (dim_t)(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if (*incx < 0)
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (double*)((x)+(n0 - 1)*(-*incx));
|
||||
incx0 = (inc_t)(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (double*)(x);
|
||||
incx0 = (inc_t)(*incx);
|
||||
}
|
||||
|
||||
if (*incy < 0)
|
||||
{
|
||||
y0 = (y)+(n0 - 1)*(-*incy);
|
||||
incy0 = (inc_t)(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (y);
|
||||
incy0 = (inc_t)(*incy);
|
||||
}
|
||||
|
||||
|
||||
/* Call BLIS kernel */
|
||||
bli_dcopyv_zen_int
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BLAS_ZEN2(copy, copyv)
|
||||
#else
|
||||
INSERT_GENTFUNC_BLAS(copy, copyv)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -7,7 +7,10 @@
|
||||
*
|
||||
* Written by Keita Teranishi. 2/11/1998
|
||||
*
|
||||
* Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
void cblas_dcopy( f77_int N, const double *X,
|
||||
@@ -20,6 +23,75 @@ void cblas_dcopy( f77_int N, const double *X,
|
||||
#define F77_incX incX
|
||||
#define F77_incY incY
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_ZEN2
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( F77_N < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(F77_N);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( F77_incX < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (double*)((X) + (n0-1)*(-F77_incX));
|
||||
incx0 = ( inc_t )(F77_incX);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (double*)(X);
|
||||
incx0 = ( inc_t )(F77_incX);
|
||||
}
|
||||
|
||||
if ( F77_incY < 0 )
|
||||
{
|
||||
y0 = (Y) + (n0-1)*(-F77_incY);
|
||||
incy0 = ( inc_t )(F77_incY);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (Y);
|
||||
incy0 = ( inc_t )(F77_incY);
|
||||
}
|
||||
|
||||
|
||||
/* Call BLIS kernel */
|
||||
bli_dcopyv_zen_int
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
#else
|
||||
F77_dcopy( &F77_N, X, &F77_incX, Y, &F77_incY);
|
||||
#endif
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -7,6 +7,8 @@
|
||||
*
|
||||
* Written by Keita Teranishi. 2/11/1998
|
||||
*
|
||||
* Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
*
|
||||
*/
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
@@ -20,6 +22,77 @@ void cblas_scopy( f77_int N, const float *X,
|
||||
#define F77_incX incX
|
||||
#define F77_incY incY
|
||||
#endif
|
||||
F77_scopy( &F77_N, X, &F77_incX, Y, &F77_incY);
|
||||
#ifdef BLIS_CONFIG_ZEN2
|
||||
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( F77_N < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(F77_N);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( F77_incX < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = (float*)((X) + (n0-1)*(-F77_incX));
|
||||
incx0 = ( inc_t )(F77_incX);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (float*)(X);
|
||||
incx0 = ( inc_t )(F77_incX);
|
||||
}
|
||||
|
||||
if ( F77_incY < 0 )
|
||||
{
|
||||
y0 = (Y) + (n0-1)*(-F77_incY);
|
||||
incy0 = ( inc_t )(F77_incY);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (Y);
|
||||
incy0 = ( inc_t )(F77_incY);
|
||||
}
|
||||
|
||||
|
||||
/* Call BLIS kernel */
|
||||
bli_scopyv_zen_int
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
#else
|
||||
F77_scopy( &F77_N, X, &F77_incX, Y, &F77_incY);
|
||||
#endif
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -56,12 +56,10 @@ GENTFUNC( double, d, blasname, blisname ) \
|
||||
GENTFUNC( scomplex, c, blasname, blisname ) \
|
||||
GENTFUNC( dcomplex, z, blasname, blisname )
|
||||
|
||||
|
||||
#define INSERT_GENTFUNC_BLAS_ZEN2( blasname, blisname ) \
|
||||
\
|
||||
GENTFUNC( scomplex, c, blasname, blisname ) \
|
||||
GENTFUNC( dcomplex, z, blasname, blisname )
|
||||
|
||||
// -- Basic one-operand macro with real domain only --
|
||||
|
||||
|
||||
|
||||
330
kernels/zen/1/bli_copyv_zen_int.c
Normal file
330
kernels/zen/1/bli_copyv_zen_int.c
Normal file
@@ -0,0 +1,330 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "immintrin.h"
|
||||
#include "blis.h"
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_scopyv_zen_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const dim_t num_elem_per_reg = 8;
|
||||
__m256 xv[16];
|
||||
dim_t i = 0;
|
||||
|
||||
// If the vector dimension is zero return early.
|
||||
if (bli_zero_dim1(n)) return;
|
||||
|
||||
if (incx == 1 && incy == 1)
|
||||
{
|
||||
#if 0
|
||||
PRAGMA_SIMD
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
y[i] = x[i];
|
||||
}
|
||||
#endif
|
||||
#if 0
|
||||
memcpy(y, x, n << 2);
|
||||
#endif
|
||||
#if 1
|
||||
|
||||
// For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128
|
||||
// for example if n = 255
|
||||
// n & ~0x7F results in 128: copy from 0 to 128 happens in first loop
|
||||
// n & ~0x3F results in 192: copy from 128 to 192 happens in second loop
|
||||
// n & ~0x1F results in 224: copy from 128 to 192 happens in third loop and so on.
|
||||
for (i = 0; i < (n & (~0x7F)); i += 128)
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
|
||||
xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
|
||||
xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
|
||||
xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
|
||||
xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
|
||||
xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
|
||||
xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
|
||||
xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8);
|
||||
xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9);
|
||||
xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10);
|
||||
xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11);
|
||||
xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12);
|
||||
xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13);
|
||||
xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14);
|
||||
xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15);
|
||||
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]);
|
||||
|
||||
y += 128;
|
||||
x += 128;
|
||||
}
|
||||
for (; i < (n & (~0x3F)); i += 64)
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
|
||||
xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
|
||||
xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
|
||||
xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
|
||||
xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
|
||||
xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
|
||||
xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
|
||||
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
|
||||
|
||||
y += 64;
|
||||
x += 64;
|
||||
}
|
||||
for (; i < (n & (~0x1F)); i += 32)
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
|
||||
xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
|
||||
xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
|
||||
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
|
||||
|
||||
y += 32;
|
||||
x += 32;
|
||||
}
|
||||
for (; i < (n & (~0x0F)); i += 16)
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
|
||||
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
|
||||
|
||||
y += 16;
|
||||
x += 16;
|
||||
}
|
||||
for (; i < (n & (~0x07)); i += 8)
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
|
||||
y += 8;
|
||||
x += 8;
|
||||
}
|
||||
for (; i < n; i++)
|
||||
{
|
||||
*y++ = *x++;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
for (dim_t i = 0; i < n; ++i)
|
||||
{
|
||||
*y = *x;
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_dcopyv_zen_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const dim_t num_elem_per_reg = 4;
|
||||
__m256d xv[16];
|
||||
dim_t i = 0;
|
||||
|
||||
// If the vector dimension is zero return early.
|
||||
if (bli_zero_dim1(n)) return;
|
||||
|
||||
if (incx == 1 && incy == 1)
|
||||
{
|
||||
#if 0
|
||||
PRAGMA_SIMD
|
||||
for (i = 0; i < n; ++i)
|
||||
{
|
||||
y[i] = x[i];
|
||||
}
|
||||
#endif
|
||||
#if 0
|
||||
memcpy(y, x, n << 3);
|
||||
#endif
|
||||
#if 1
|
||||
// n & (~0x3F) = n & 0xFFFFFFC0 -> this masks the numbers less than 64,
|
||||
// the copy operation will be done for the multiples of 64
|
||||
for (i = 0; i < (n & (~0x3F)); i += 64)
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
|
||||
xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
|
||||
xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
|
||||
xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
|
||||
xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
|
||||
xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
|
||||
xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
|
||||
xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8);
|
||||
xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9);
|
||||
xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10);
|
||||
xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11);
|
||||
xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12);
|
||||
xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13);
|
||||
xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14);
|
||||
xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]);
|
||||
y += num_elem_per_reg * 16;
|
||||
x += num_elem_per_reg * 16;
|
||||
}
|
||||
for (; i < (n & (~0x1F)); i += 32)
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
|
||||
xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
|
||||
xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
|
||||
xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
|
||||
xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
|
||||
xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
|
||||
xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
|
||||
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
|
||||
|
||||
y += num_elem_per_reg * 8;
|
||||
x += num_elem_per_reg * 8;
|
||||
}
|
||||
for (; i < (n & (~0xF)); i += 16)
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
|
||||
xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
|
||||
xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
|
||||
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
|
||||
|
||||
y += num_elem_per_reg * 4;
|
||||
x += num_elem_per_reg * 4;
|
||||
}
|
||||
for (; i < (n & (~0x07)); i += 8)
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
|
||||
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
|
||||
|
||||
y += num_elem_per_reg * 2;
|
||||
x += num_elem_per_reg * 2;
|
||||
}
|
||||
for (; i < (n & (~0x03)); i += 4)
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
|
||||
y += num_elem_per_reg;
|
||||
x += num_elem_per_reg;
|
||||
}
|
||||
for (; i < n; i++)
|
||||
{
|
||||
*y++ = *x++;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < n; ++i)
|
||||
{
|
||||
*y = *x;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,6 +71,9 @@ SCALV_KER_PROT( double, d, scalv_zen_int )
|
||||
SWAPV_KER_PROT(float, s, swapv_zen_int8 )
|
||||
SWAPV_KER_PROT(double, d, swapv_zen_int8 )
|
||||
|
||||
// copyv (intrinsics)
|
||||
COPYV_KER_PROT( float, s, copyv_zen_int )
|
||||
COPYV_KER_PROT( double, d, copyv_zen_int )
|
||||
|
||||
// -- level-1f --
|
||||
|
||||
|
||||
@@ -176,13 +176,14 @@ blis: \
|
||||
test_swapv_blis.x \
|
||||
test_trmv_blis.x \
|
||||
test_trsv_blis.x \
|
||||
test_copyv_blis.x \
|
||||
\
|
||||
test_gemm_blis.x \
|
||||
test_hemm_blis.x \
|
||||
test_herk_blis.x \
|
||||
test_her2k_blis.x \
|
||||
test_trmm_blis.x \
|
||||
test_trsm_blis.x
|
||||
test_trsm_blis.x
|
||||
|
||||
openblas: \
|
||||
test_dotv_openblas.x \
|
||||
@@ -195,6 +196,7 @@ openblas: \
|
||||
test_swapv_openblas.x \
|
||||
test_trmv_openblas.x \
|
||||
test_trsv_openblas.x \
|
||||
test_copyv_openblas.x \
|
||||
\
|
||||
test_gemm_openblas.x \
|
||||
test_hemm_openblas.x \
|
||||
@@ -214,6 +216,7 @@ atlas: \
|
||||
test_swapv_atlas.x \
|
||||
test_trmv_atlas.x \
|
||||
test_trsv_atlas.x \
|
||||
test_copyv_atlas.x \
|
||||
\
|
||||
test_gemm_atlas.x \
|
||||
test_hemm_atlas.x \
|
||||
@@ -232,6 +235,7 @@ mkl: test_dotv_mkl.x \
|
||||
test_swapv_mkl.x \
|
||||
test_trmv_mkl.x \
|
||||
test_trsv_mkl.x \
|
||||
test_copyv_mkl.x \
|
||||
\
|
||||
test_gemm_mkl.x \
|
||||
test_hemm_mkl.x \
|
||||
@@ -250,6 +254,7 @@ essl: test_dotv_essl.x \
|
||||
test_swapv_essl.x \
|
||||
test_trmv_essl.x \
|
||||
test_trsv_essl.x \
|
||||
test_copyv_essl.x\
|
||||
\
|
||||
test_gemm_essl.x \
|
||||
test_hemm_essl.x \
|
||||
@@ -268,6 +273,7 @@ mac: test_dotv_mac.x \
|
||||
test_swapv_mac.x \
|
||||
test_trmv_mac.x \
|
||||
test_trsv_mac.x \
|
||||
test_copyv_mac.x \
|
||||
\
|
||||
test_gemm_mac.x \
|
||||
test_hemm_mac.x \
|
||||
|
||||
214
test/test_copyv.c
Normal file
214
test/test_copyv.c
Normal file
@@ -0,0 +1,214 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
//#define BLIS_ACCURACY_TEST
|
||||
#ifdef BLIS_ACCURACY_TEST
|
||||
|
||||
bool_t scompare_result(int n, float *x, int incx, float *y, int incy) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
if ((*x) != (*y)) {
|
||||
printf("%4f != %4f at location %d\n", *x, *y, i);
|
||||
return FALSE;
|
||||
}
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t dcompare_result(int n, double *x, int incx, double *y, int incy) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
if ((*x) != (*y)) {
|
||||
printf("%4f != %4f at location %d\n", *x, *y, i);
|
||||
return FALSE;
|
||||
}
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
obj_t x, y;
|
||||
dim_t n;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int n_input, sizeof_dt;
|
||||
int r, n_repeats;
|
||||
num_t dt;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double Gbps;
|
||||
|
||||
//bli_init();
|
||||
|
||||
n_repeats = 100000;
|
||||
|
||||
#ifndef PRINT
|
||||
p_begin = 200;
|
||||
p_end = 100000;
|
||||
p_inc = 200;
|
||||
|
||||
n_input = -1;
|
||||
#else
|
||||
p_begin = 16;
|
||||
p_end = 16;
|
||||
p_inc = 1;
|
||||
|
||||
n_input = 16;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
// dt = BLIS_FLOAT;
|
||||
dt = BLIS_DOUBLE;
|
||||
#else
|
||||
//dt = BLIS_SCOMPLEX;
|
||||
dt = BLIS_DCOMPLEX;
|
||||
#endif
|
||||
|
||||
if (dt == BLIS_DOUBLE)
|
||||
sizeof_dt = sizeof(double);
|
||||
else if (dt == BLIS_FLOAT)
|
||||
sizeof_dt = sizeof(float);
|
||||
|
||||
printf("executable\t n\t GBs per sec\n");
|
||||
for (p = p_begin; p <= p_end; p += p_inc)
|
||||
{
|
||||
|
||||
if (n_input < 0) n = p * (dim_t)abs(n_input);
|
||||
else n = (dim_t)n_input;
|
||||
|
||||
bli_obj_create(dt, n, 1, 0, 0, &x);
|
||||
bli_obj_create(dt, n, 1, 0, 0, &y);
|
||||
bli_randm(&x);
|
||||
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for (r = 0; r < n_repeats; ++r)
|
||||
{
|
||||
dtime = bli_clock();
|
||||
|
||||
#ifdef BLIS
|
||||
bli_copyv(&x,
|
||||
&y
|
||||
);
|
||||
#else
|
||||
if (bli_is_float(dt))
|
||||
{
|
||||
f77_int nn = bli_obj_length(&x);
|
||||
f77_int incx = bli_obj_vector_inc(&x);
|
||||
float* xp = bli_obj_buffer(&x);
|
||||
f77_int incy = bli_obj_vector_inc(&y);
|
||||
float* yp = bli_obj_buffer(&y);
|
||||
|
||||
scopy_(&nn,
|
||||
xp, &incx,
|
||||
yp, &incy);
|
||||
|
||||
}
|
||||
else if (bli_is_double(dt))
|
||||
{
|
||||
|
||||
f77_int nn = bli_obj_length(&x);
|
||||
f77_int incx = bli_obj_vector_inc(&x);
|
||||
double* xp = bli_obj_buffer(&x);
|
||||
f77_int incy = bli_obj_vector_inc(&y);
|
||||
double* yp = bli_obj_buffer(&y);
|
||||
|
||||
dcopy_(&nn,
|
||||
xp, &incx,
|
||||
yp, &incy
|
||||
);
|
||||
}
|
||||
#endif
|
||||
dtime_save = bli_clock_min_diff(dtime_save, dtime);
|
||||
#ifdef BLIS_ACCURACY_TEST
|
||||
if (dt == BLIS_FLOAT) {
|
||||
int nn = bli_obj_length(&x);
|
||||
int incx = bli_obj_vector_inc(&x);
|
||||
float* xp = bli_obj_buffer(&x);
|
||||
int incy = bli_obj_vector_inc(&y);
|
||||
float* yp = bli_obj_buffer(&y);
|
||||
if (scompare_result(nn, xp, incx, yp, incy))
|
||||
printf("Copy Successful\n");
|
||||
else
|
||||
printf("ALERT!!! Copy Failed\n");
|
||||
}
|
||||
if (dt == BLIS_DOUBLE) {
|
||||
int nn = bli_obj_length(&x);
|
||||
int incx = bli_obj_vector_inc(&x);
|
||||
double* xp = bli_obj_buffer(&x);
|
||||
int incy = bli_obj_vector_inc(&y);
|
||||
double* yp = bli_obj_buffer(&y);
|
||||
if (dcompare_result(nn, xp, incx, yp, incy))
|
||||
printf("Copy Successful\n");
|
||||
else
|
||||
printf("ALERT!!! Copy Failed\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
// Size of the vectors are incrementd by 1000, to test wide range of inputs.
|
||||
if (p >= 1000)
|
||||
p_inc = 1000;
|
||||
|
||||
if (p >= 10000)
|
||||
p_inc = 10000;
|
||||
Gbps = (n * sizeof_dt) / (dtime_save * 1.0e9);
|
||||
#ifdef BLIS
|
||||
printf("data_copyv_blis\t");
|
||||
#else
|
||||
printf("data_copyv_%s\t", BLAS);
|
||||
#endif
|
||||
printf("%4lu\t %7.2f\n",
|
||||
(unsigned long)n, Gbps);
|
||||
|
||||
bli_obj_free(&x);
|
||||
bli_obj_free(&y);
|
||||
}
|
||||
|
||||
// bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user