diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 34ecc875d..4a4cb8886 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -82,7 +82,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) // Update the context with optimized level-1v kernels. bli_cntx_set_l1v_kers ( - 12, + 14, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, @@ -110,6 +110,9 @@ void bli_cntx_init_zen( cntx_t* cntx ) #endif BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, cntx ); diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index 6ea5320cc..65e6d7bba 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -83,19 +83,20 @@ void bli_cntx_init_zen2( cntx_t* cntx ) // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, - // axpyv + // axpyv BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10, - // dotxv + + // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, - // scalv + // scalv BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, @@ -103,7 +104,11 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, - cntx + //copy + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, + + cntx ); // Initialize level-3 blocksize objects with architecture-specific values. diff --git a/frame/compat/bla_copy.c b/frame/compat/bla_copy.c index d9d3b7cce..369476b19 100644 --- a/frame/compat/bla_copy.c +++ b/frame/compat/bla_copy.c @@ -5,18 +5,19 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT @@ -62,25 +63,181 @@ void PASTEF77(ch,blasname) \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ - bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ - bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ -\ - /* Call BLIS interface. */ \ - PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - n0, \ - x0, incx0, \ - y0, incy0, \ - NULL, \ - NULL \ - ); \ -\ - /* Finalize BLIS. */ \ - bli_finalize_auto(); \ + bli_convert_blas_incv(n0, (ftype*)x, *incx, x0, incx0); \ + bli_convert_blas_incv(n0, (ftype*)y, *incy, y0, incy0); \ + \ + /* Call BLIS interface. */ \ + PASTEMAC2(ch, blisname, BLIS_TAPI_EX_SUF) \ + (\ + BLIS_NO_CONJUGATE, \ + n0, \ + x0, incx0, \ + y0, incy0, \ + NULL, \ + NULL \ + ); \ + \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS -INSERT_GENTFUNC_BLAS( copy, copyv ) -#endif +#ifdef BLIS_CONFIG_ZEN2 +void scopy_ +( + const f77_int* n, + const float* x, const f77_int* incx, + float* y, const f77_int* incy +) +{ + dim_t n0; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if (*n < 0) + n0 = (dim_t)0; + else + n0 = (dim_t)(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if (*incx < 0) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (float*)((x)+(n0 - 1)*(-*incx)); + incx0 = (inc_t)(*incx); + + } + else + { + x0 = (float*)(x); + incx0 = (inc_t)(*incx); + } + + if (*incy < 0) + { + y0 = (y)+(n0 - 1)*(-*incy); + incy0 = (inc_t)(*incy); + + } + else + { + y0 = (y); + incy0 = (inc_t)(*incy); + } + + + /* Call BLIS kernel */ + bli_scopyv_zen_int + ( + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + NULL + ); + + /* Finalize BLIS. */ +// bli_finalize_auto(); +} + +void dcopy_ +( + const f77_int* n, + const double* x, const f77_int* incx, + double* y, const f77_int* incy +) +{ + dim_t n0; + double* x0; + double* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if (*n < 0) + n0 = (dim_t)0; + else + n0 = (dim_t)(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if (*incx < 0) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (double*)((x)+(n0 - 1)*(-*incx)); + incx0 = (inc_t)(*incx); + + } + else + { + x0 = (double*)(x); + incx0 = (inc_t)(*incx); + } + + if (*incy < 0) + { + y0 = (y)+(n0 - 1)*(-*incy); + incy0 = (inc_t)(*incy); + + } + else + { + y0 = (y); + incy0 = (inc_t)(*incy); + } + + + /* Call BLIS kernel */ + bli_dcopyv_zen_int + ( + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + NULL + ); + + /* Finalize BLIS. */ +// bli_finalize_auto(); +} + +INSERT_GENTFUNC_BLAS_ZEN2(copy, copyv) +#else +INSERT_GENTFUNC_BLAS(copy, copyv) +#endif +#endif diff --git a/frame/compat/cblas/src/cblas_dcopy.c b/frame/compat/cblas/src/cblas_dcopy.c index 186c3d1d6..e1b354540 100644 --- a/frame/compat/cblas/src/cblas_dcopy.c +++ b/frame/compat/cblas/src/cblas_dcopy.c @@ -7,7 +7,10 @@ * * Written by Keita Teranishi. 2/11/1998 * + * Copyright (C) 2020, Advanced Micro Devices, Inc. + * */ + #include "cblas.h" #include "cblas_f77.h" void cblas_dcopy( f77_int N, const double *X, @@ -20,6 +23,75 @@ void cblas_dcopy( f77_int N, const double *X, #define F77_incX incX #define F77_incY incY #endif +#ifdef BLIS_CONFIG_ZEN2 + dim_t n0; + double* x0; + double* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( F77_N < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(F77_N); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( F77_incX < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (double*)((X) + (n0-1)*(-F77_incX)); + incx0 = ( inc_t )(F77_incX); + + } + else + { + x0 = (double*)(X); + incx0 = ( inc_t )(F77_incX); + } + + if ( F77_incY < 0 ) + { + y0 = (Y) + (n0-1)*(-F77_incY); + incy0 = ( inc_t )(F77_incY); + + } + else + { + y0 = (Y); + incy0 = ( inc_t )(F77_incY); + } + + + /* Call BLIS kernel */ + bli_dcopyv_zen_int + ( + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + NULL + ); + + /* Finalize BLIS. */ +// bli_finalize_auto(); +#else F77_dcopy( &F77_N, X, &F77_incX, Y, &F77_incY); +#endif + } #endif diff --git a/frame/compat/cblas/src/cblas_scopy.c b/frame/compat/cblas/src/cblas_scopy.c index f54343ff1..b8809d856 100644 --- a/frame/compat/cblas/src/cblas_scopy.c +++ b/frame/compat/cblas/src/cblas_scopy.c @@ -7,6 +7,8 @@ * * Written by Keita Teranishi. 2/11/1998 * + * Copyright (C) 2020, Advanced Micro Devices, Inc. + * */ #include "cblas.h" #include "cblas_f77.h" @@ -20,6 +22,77 @@ void cblas_scopy( f77_int N, const float *X, #define F77_incX incX #define F77_incY incY #endif - F77_scopy( &F77_N, X, &F77_incX, Y, &F77_incY); +#ifdef BLIS_CONFIG_ZEN2 + + dim_t n0; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( F77_N < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(F77_N); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( F77_incX < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The differences + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (float*)((X) + (n0-1)*(-F77_incX)); + incx0 = ( inc_t )(F77_incX); + + } + else + { + x0 = (float*)(X); + incx0 = ( inc_t )(F77_incX); + } + + if ( F77_incY < 0 ) + { + y0 = (Y) + (n0-1)*(-F77_incY); + incy0 = ( inc_t )(F77_incY); + + } + else + { + y0 = (Y); + incy0 = ( inc_t )(F77_incY); + } + + + /* Call BLIS kernel */ + bli_scopyv_zen_int + ( + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + NULL + ); + + /* Finalize BLIS. */ +// bli_finalize_auto(); + +#else + F77_scopy( &F77_N, X, &F77_incX, Y, &F77_incY); +#endif + } #endif diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h index 3a547f5d4..6e7969a5c 100644 --- a/frame/include/bli_gentfunc_macro_defs.h +++ b/frame/include/bli_gentfunc_macro_defs.h @@ -56,12 +56,10 @@ GENTFUNC( double, d, blasname, blisname ) \ GENTFUNC( scomplex, c, blasname, blisname ) \ GENTFUNC( dcomplex, z, blasname, blisname ) - #define INSERT_GENTFUNC_BLAS_ZEN2( blasname, blisname ) \ \ GENTFUNC( scomplex, c, blasname, blisname ) \ GENTFUNC( dcomplex, z, blasname, blisname ) - // -- Basic one-operand macro with real domain only -- diff --git a/kernels/zen/1/bli_copyv_zen_int.c b/kernels/zen/1/bli_copyv_zen_int.c new file mode 100644 index 000000000..9a74ffc28 --- /dev/null +++ b/kernels/zen/1/bli_copyv_zen_int.c @@ -0,0 +1,330 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "immintrin.h" +#include "blis.h" + +// ----------------------------------------------------------------------------- + +void bli_scopyv_zen_int +( + conj_t conjx, + dim_t n, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy, + cntx_t* restrict cntx +) +{ + const dim_t num_elem_per_reg = 8; + __m256 xv[16]; + dim_t i = 0; + + // If the vector dimension is zero return early. + if (bli_zero_dim1(n)) return; + + if (incx == 1 && incy == 1) + { +#if 0 + PRAGMA_SIMD + for (i = 0; i < n; i++) + { + y[i] = x[i]; + } +#endif +#if 0 + memcpy(y, x, n << 2); +#endif +#if 1 + + // For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128 + // for example if n = 255 + // n & ~0x7F results in 128: copy from 0 to 128 happens in first loop + // n & ~0x3F results in 192: copy from 128 to 192 happens in second loop + // n & ~0x1F results in 224: copy from 128 to 192 happens in third loop and so on. + for (i = 0; i < (n & (~0x7F)); i += 128) + { + xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); + xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2); + xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3); + xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4); + xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5); + xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6); + xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7); + xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8); + xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9); + xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10); + xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11); + xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12); + xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13); + xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14); + xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15); + + _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); + _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]); + _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]); + _mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]); + _mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]); + _mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]); + _mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]); + _mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]); + _mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]); + _mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]); + _mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]); + _mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]); + _mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]); + _mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]); + _mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]); + + y += 128; + x += 128; + } + for (; i < (n & (~0x3F)); i += 64) + { + xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); + xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2); + xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3); + xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4); + xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5); + xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6); + xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7); + + _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); + _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]); + _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]); + _mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]); + _mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]); + _mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]); + _mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]); + + y += 64; + x += 64; + } + for (; i < (n & (~0x1F)); i += 32) + { + xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); + xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2); + xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3); + + _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); + _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]); + _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]); + + y += 32; + x += 32; + } + for (; i < (n & (~0x0F)); i += 16) + { + xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); + + _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); + + y += 16; + x += 16; + } + for (; i < (n & (~0x07)); i += 8) + { + xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); + _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); + y += 8; + x += 8; + } + for (; i < n; i++) + { + *y++ = *x++; + } +#endif + } + else + { + for (dim_t i = 0; i < n; ++i) + { + *y = *x; + x += incx; + y += incy; + } + } +} + +// ----------------------------------------------------------------------------- + +void bli_dcopyv_zen_int +( + conj_t conjx, + dim_t n, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + cntx_t* restrict cntx +) +{ + const dim_t num_elem_per_reg = 4; + __m256d xv[16]; + dim_t i = 0; + + // If the vector dimension is zero return early. + if (bli_zero_dim1(n)) return; + + if (incx == 1 && incy == 1) + { +#if 0 + PRAGMA_SIMD + for (i = 0; i < n; ++i) + { + y[i] = x[i]; + } +#endif +#if 0 + memcpy(y, x, n << 3); +#endif +#if 1 + // n & (~0x3F) = n & 0xFFFFFFC0 -> this masks the numbers less than 64, + // the copy operation will be done for the multiples of 64 + for (i = 0; i < (n & (~0x3F)); i += 64) + { + xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); + xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2); + xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3); + xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4); + xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5); + xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6); + xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7); + xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8); + xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9); + xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10); + xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11); + xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12); + xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13); + xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14); + xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15); + _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); + _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]); + _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]); + _mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]); + _mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]); + _mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]); + _mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]); + _mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]); + _mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]); + _mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]); + _mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]); + _mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]); + _mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]); + _mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]); + _mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]); + y += num_elem_per_reg * 16; + x += num_elem_per_reg * 16; + } + for (; i < (n & (~0x1F)); i += 32) + { + xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); + xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2); + xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3); + xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4); + xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5); + xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6); + xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7); + + _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); + _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]); + _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]); + _mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]); + _mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]); + _mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]); + _mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]); + + y += num_elem_per_reg * 8; + x += num_elem_per_reg * 8; + } + for (; i < (n & (~0xF)); i += 16) + { + xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); + xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2); + xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3); + + _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); + _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]); + _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]); + + y += num_elem_per_reg * 4; + x += num_elem_per_reg * 4; + } + for (; i < (n & (~0x07)); i += 8) + { + xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); + + _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); + + y += num_elem_per_reg * 2; + x += num_elem_per_reg * 2; + } + for (; i < (n & (~0x03)); i += 4) + { + xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); + _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); + y += num_elem_per_reg; + x += num_elem_per_reg; + } + for (; i < n; i++) + { + *y++ = *x++; + } +#endif + } + else + { + for ( i = 0; i < n; ++i) + { + *y = *x; + + x += incx; + y += incy; + } + } +} + diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index 91038f18d..d9ae558d6 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -71,6 +71,9 @@ SCALV_KER_PROT( double, d, scalv_zen_int ) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) +// copyv (intrinsics) +COPYV_KER_PROT( float, s, copyv_zen_int ) +COPYV_KER_PROT( double, d, copyv_zen_int ) // -- level-1f -- diff --git a/test/Makefile b/test/Makefile index 4af1d6e34..02796ee44 100644 --- a/test/Makefile +++ b/test/Makefile @@ -176,13 +176,14 @@ blis: \ test_swapv_blis.x \ test_trmv_blis.x \ test_trsv_blis.x \ + test_copyv_blis.x \ \ test_gemm_blis.x \ test_hemm_blis.x \ test_herk_blis.x \ test_her2k_blis.x \ test_trmm_blis.x \ - test_trsm_blis.x + test_trsm_blis.x openblas: \ test_dotv_openblas.x \ @@ -195,6 +196,7 @@ openblas: \ test_swapv_openblas.x \ test_trmv_openblas.x \ test_trsv_openblas.x \ + test_copyv_openblas.x \ \ test_gemm_openblas.x \ test_hemm_openblas.x \ @@ -214,6 +216,7 @@ atlas: \ test_swapv_atlas.x \ test_trmv_atlas.x \ test_trsv_atlas.x \ + test_copyv_atlas.x \ \ test_gemm_atlas.x \ test_hemm_atlas.x \ @@ -232,6 +235,7 @@ mkl: test_dotv_mkl.x \ test_swapv_mkl.x \ test_trmv_mkl.x \ test_trsv_mkl.x \ + test_copyv_mkl.x \ \ test_gemm_mkl.x \ test_hemm_mkl.x \ @@ -250,6 +254,7 @@ essl: test_dotv_essl.x \ test_swapv_essl.x \ test_trmv_essl.x \ test_trsv_essl.x \ + test_copyv_essl.x\ \ test_gemm_essl.x \ test_hemm_essl.x \ @@ -268,6 +273,7 @@ mac: test_dotv_mac.x \ test_swapv_mac.x \ test_trmv_mac.x \ test_trsv_mac.x \ + test_copyv_mac.x \ \ test_gemm_mac.x \ test_hemm_mac.x \ diff --git a/test/test_copyv.c b/test/test_copyv.c new file mode 100644 index 000000000..1893bad72 --- /dev/null +++ b/test/test_copyv.c @@ -0,0 +1,214 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "blis.h" + + + +//#define BLIS_ACCURACY_TEST +#ifdef BLIS_ACCURACY_TEST + +bool_t scompare_result(int n, float *x, int incx, float *y, int incy) { + for (int i = 0; i < n; i++) { + if ((*x) != (*y)) { + printf("%4f != %4f at location %d\n", *x, *y, i); + return FALSE; + } + x += incx; + y += incy; + } + return TRUE; +} + +bool_t dcompare_result(int n, double *x, int incx, double *y, int incy) { + for (int i = 0; i < n; i++) { + if ((*x) != (*y)) { + printf("%4f != %4f at location %d\n", *x, *y, i); + return FALSE; + } + x += incx; + y += incy; + } + return TRUE; +} +#endif + + +int main(int argc, char** argv) +{ + obj_t x, y; + dim_t n; + dim_t p; + dim_t p_begin, p_end, p_inc; + int n_input, sizeof_dt; + int r, n_repeats; + num_t dt; + + double dtime; + double dtime_save; + double Gbps; + + //bli_init(); + + n_repeats = 100000; + +#ifndef PRINT + p_begin = 200; + p_end = 100000; + p_inc = 200; + + n_input = -1; +#else + p_begin = 16; + p_end = 16; + p_inc = 1; + + n_input = 16; +#endif + +#if 1 + // dt = BLIS_FLOAT; + dt = BLIS_DOUBLE; +#else + //dt = BLIS_SCOMPLEX; + dt = BLIS_DCOMPLEX; +#endif + + if (dt == BLIS_DOUBLE) + sizeof_dt = sizeof(double); + else if (dt == BLIS_FLOAT) + sizeof_dt = sizeof(float); + + printf("executable\t n\t GBs per sec\n"); + for (p = p_begin; p <= p_end; p += p_inc) + { + + if (n_input < 0) n = p * (dim_t)abs(n_input); + else n = (dim_t)n_input; + + bli_obj_create(dt, n, 1, 0, 0, &x); + bli_obj_create(dt, n, 1, 0, 0, &y); + bli_randm(&x); + + + dtime_save = DBL_MAX; + + for (r = 0; r < n_repeats; ++r) + { + dtime = bli_clock(); + +#ifdef BLIS + bli_copyv(&x, + &y + ); +#else + if (bli_is_float(dt)) + { + f77_int nn = bli_obj_length(&x); + f77_int incx = bli_obj_vector_inc(&x); + float* xp = bli_obj_buffer(&x); + f77_int incy = bli_obj_vector_inc(&y); + float* yp = bli_obj_buffer(&y); + + scopy_(&nn, + xp, &incx, + yp, &incy); + + } + else if (bli_is_double(dt)) + { + + f77_int nn = bli_obj_length(&x); + f77_int incx = bli_obj_vector_inc(&x); + double* xp = bli_obj_buffer(&x); + f77_int incy = bli_obj_vector_inc(&y); + double* yp = bli_obj_buffer(&y); + + dcopy_(&nn, + xp, &incx, + yp, &incy + ); + } +#endif + dtime_save = bli_clock_min_diff(dtime_save, dtime); +#ifdef BLIS_ACCURACY_TEST + if (dt == BLIS_FLOAT) { + int nn = bli_obj_length(&x); + int incx = bli_obj_vector_inc(&x); + float* xp = bli_obj_buffer(&x); + int incy = bli_obj_vector_inc(&y); + float* yp = bli_obj_buffer(&y); + if (scompare_result(nn, xp, incx, yp, incy)) + printf("Copy Successful\n"); + else + printf("ALERT!!! Copy Failed\n"); + } + if (dt == BLIS_DOUBLE) { + int nn = bli_obj_length(&x); + int incx = bli_obj_vector_inc(&x); + double* xp = bli_obj_buffer(&x); + int incy = bli_obj_vector_inc(&y); + double* yp = bli_obj_buffer(&y); + if (dcompare_result(nn, xp, incx, yp, incy)) + printf("Copy Successful\n"); + else + printf("ALERT!!! Copy Failed\n"); + } +#endif + } + // Size of the vectors are incrementd by 1000, to test wide range of inputs. + if (p >= 1000) + p_inc = 1000; + + if (p >= 10000) + p_inc = 10000; + Gbps = (n * sizeof_dt) / (dtime_save * 1.0e9); +#ifdef BLIS + printf("data_copyv_blis\t"); +#else + printf("data_copyv_%s\t", BLAS); +#endif + printf("%4lu\t %7.2f\n", + (unsigned long)n, Gbps); + + bli_obj_free(&x); + bli_obj_free(&y); + } + + // bli_finalize(); + + return 0; +} +