From b846059bcf96bfdae86bf852982e3628624bcd23 Mon Sep 17 00:00:00 2001 From: Meghana Date: Fri, 17 Apr 2020 12:02:46 +0530 Subject: [PATCH] Added opt kernels for SWAPV Details: -Added SIMD kernels for SWAPV for both single and double precisions. -Modified cntx_init file for zen and zen2 configurations to choose opt kernels for SWAPV. -Added test_swapv.c in test folder. -Modified test/Makefile to include test_swapv.c Change-Id: Ida786eec722e634aee0dacdd51c327823c80f01a Signed-off-by: Meghana Vankadari AMD-Internal: [CPUPL-847] --- config/zen/bli_cntx_init_zen.c | 5 +- config/zen2/bli_cntx_init_zen2.c | 6 +- kernels/zen/1/bli_swapv_zen_int8.c | 359 +++++++++++++++++++++++++++++ kernels/zen/bli_kernels_zen.h | 8 +- test/Makefile | 10 +- test/test_swapv.c | 181 +++++++++++++++ 6 files changed, 564 insertions(+), 5 deletions(-) create mode 100644 kernels/zen/1/bli_swapv_zen_int8.c create mode 100644 test/test_swapv.c diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 78d2bd745..34ecc875d 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -82,7 +82,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) // Update the context with optimized level-1v kernels. bli_cntx_set_l1v_kers ( - 10, + 12, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, @@ -108,6 +108,9 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + cntx ); diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index c85628eb9..f3a98f429 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -79,7 +79,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) // Update the context with optimized level-1v kernels. 
bli_cntx_set_l1v_kers ( - 10, + 12, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, @@ -99,6 +99,10 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, + //swap + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + cntx ); diff --git a/kernels/zen/1/bli_swapv_zen_int8.c b/kernels/zen/1/bli_swapv_zen_int8.c new file mode 100644 index 000000000..1cc6377a1 --- /dev/null +++ b/kernels/zen/1/bli_swapv_zen_int8.c @@ -0,0 +1,359 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "immintrin.h" +#include "blis.h" + + +/* Union data structure to access AVX registers + One 256-bit AVX register holds 8 SP elements. */ +typedef union +{ + __m256 v; + float f[8] __attribute__((aligned(64))); +} v8sf_t; + +/* Union data structure to access AVX registers +* One 256-bit AVX register holds 4 DP elements. */ +typedef union +{ + __m256d v; + double d[4] __attribute__((aligned(64))); +} v4df_t; + +// ----------------------------------------------------------------------------- + +void bli_sswapv_zen_int8 + ( + dim_t n, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + + const dim_t n_elem_per_reg = 8; + + dim_t i; + + float* restrict x0; + float* restrict y0; + + __m256 xv[8]; + __m256 yv[8]; + + //If the vector dimension is zero, return early. 
+ if ( bli_zero_dim1( n )) return; + + x0 = x; + y0 = y; + + if(incx == 1 && incy == 1) + { + for( i = 0; ( i + 63 ) < n; i += 64 ) + { + //Load the input values + xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); + xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); + xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); + xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg ); + xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg ); + xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg ); + xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg ); + + yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); + yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); + yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); + yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg ); + yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg ); + yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg ); + yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg ); + + _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]); + _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]); + _mm256_storeu_ps( (x0 + 2*n_elem_per_reg), yv[2]); + _mm256_storeu_ps( (x0 + 3*n_elem_per_reg), yv[3]); + _mm256_storeu_ps( (x0 + 4*n_elem_per_reg), yv[4]); + _mm256_storeu_ps( (x0 + 5*n_elem_per_reg), yv[5]); + _mm256_storeu_ps( (x0 + 6*n_elem_per_reg), yv[6]); + _mm256_storeu_ps( (x0 + 7*n_elem_per_reg), yv[7]); + + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]); + _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]); + _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), xv[2]); + _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), xv[3]); + _mm256_storeu_ps( (y0 + 4*n_elem_per_reg), xv[4]); + _mm256_storeu_ps( (y0 + 5*n_elem_per_reg), xv[5]); + _mm256_storeu_ps( (y0 + 6*n_elem_per_reg), xv[6]); + _mm256_storeu_ps( (y0 + 7*n_elem_per_reg), xv[7]); + + x0 += 8*n_elem_per_reg; + y0 += 8*n_elem_per_reg; + } + + for( ; ( i + 31 ) < n; i += 32 ) + { + //Load the input values + xv[0] = _mm256_loadu_ps( 
x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); + xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); + xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); + + yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); + yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); + yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); + + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]); + _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]); + _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), xv[2]); + _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), xv[3]); + + _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]); + _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]); + _mm256_storeu_ps( (x0 + 2*n_elem_per_reg), yv[2]); + _mm256_storeu_ps( (x0 + 3*n_elem_per_reg), yv[3]); + + x0 += 4*n_elem_per_reg; + y0 += 4*n_elem_per_reg; + } + + + for( ; ( i + 15 ) < n; i += 16 ) + { + //Load the input values + xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); + + yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); + + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]); + _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]); + + _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]); + _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]); + + x0 += 2*n_elem_per_reg; + y0 += 2*n_elem_per_reg; + } + + for( ; ( i + 7 ) < n; i += 8 ) + { + //Load the input values + xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); + + yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + + _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]); + + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]); + + x0 += 1*n_elem_per_reg; + y0 += 1*n_elem_per_reg; + } + + for( ; (i + 0) < n; i += 1 ) + { + PASTEMAC(s,swaps)(x[i], y[i]); + } + } + else + { + for ( i = 0; i < n; ++i ) + { + PASTEMAC(s,swaps)((*x0), (*y0)); + + x0 += incx; + y0 += incy; + } + } + +} + 
+//-------------------------------------------------------------------------------- + +void bli_dswapv_zen_int8 + ( + dim_t n, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + + const dim_t n_elem_per_reg = 4; + + dim_t i = 0; + + double* restrict x0; + double* restrict y0; + + __m256d xv[8]; + __m256d yv[8]; + + //If the vector dimension is zero, return early. + if ( bli_zero_dim1( n )) return; + + x0 = x; + y0 = y; + + if(incx == 1 && incy == 1) + { + + for( ; ( i + 31 ) < n; i += 32 ) + { + + //Load the input values + xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); + xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); + xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); + xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg ); + xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg ); + xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg ); + xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg ); + + yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); + yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg ); + yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg ); + yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg ); + yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg ); + + _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]); + _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]); + _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), yv[2]); + _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), yv[3]); + _mm256_storeu_pd( (x0 + 4*n_elem_per_reg), yv[4]); + _mm256_storeu_pd( (x0 + 5*n_elem_per_reg), yv[5]); + _mm256_storeu_pd( (x0 + 6*n_elem_per_reg), yv[6]); + _mm256_storeu_pd( (x0 + 7*n_elem_per_reg), yv[7]); + + _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]); + _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]); + _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), xv[2]); + 
_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), xv[3]); + _mm256_storeu_pd( (y0 + 4*n_elem_per_reg), xv[4]); + _mm256_storeu_pd( (y0 + 5*n_elem_per_reg), xv[5]); + _mm256_storeu_pd( (y0 + 6*n_elem_per_reg), xv[6]); + _mm256_storeu_pd( (y0 + 7*n_elem_per_reg), xv[7]); + + x0 += 8*n_elem_per_reg; + y0 += 8*n_elem_per_reg; + } + + for( ; ( i + 15 ) < n; i += 16 ) + { + //Load the input values + xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); + xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); + xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); + + yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); + + _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]); + _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]); + _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), xv[2]); + _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), xv[3]); + + _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]); + _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]); + _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), yv[2]); + _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), yv[3]); + + x0 += 4*n_elem_per_reg; + y0 += 4*n_elem_per_reg; + } + + for( ; ( i + 7 ) < n; i += 8 ) + { + //Load the input values + xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); + + yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + + _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]); + _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]); + + _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]); + _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]); + + x0 += 2*n_elem_per_reg; + y0 += 2*n_elem_per_reg; + } + + for( ; ( i + 3 ) < n; i += 4 ) + { + //Load the input values + xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); + + yv[0] = _mm256_loadu_pd( y0 + 
0*n_elem_per_reg ); + + _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]); + + _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]); + + x0 += 1*n_elem_per_reg; + y0 += 1*n_elem_per_reg; + } + + for( ; (i + 0) < n; i += 1 ) + { + PASTEMAC(d,swaps)( x[i], y[i]); + } + } + else + { + for ( i = 0; i < n; ++i ) + { + PASTEMAC(d,swaps) ((*x0), (*y0)); + + x0 += incx; + y0 += incy; + } + } + +} + diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index c73888c29..91038f18d 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -66,6 +67,11 @@ SCALV_KER_PROT( double, d, scalv_zen_int ) SCALV_KER_PROT( float, s, scalv_zen_int10 ) SCALV_KER_PROT( double, d, scalv_zen_int10 ) +// swapv (intrinsics) +SWAPV_KER_PROT(float, s, swapv_zen_int8 ) +SWAPV_KER_PROT(double, d, swapv_zen_int8 ) + + // -- level-1f -- // axpyf (intrinsics) @@ -128,4 +134,4 @@ GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n ) \ No newline at end of file +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n ) diff --git a/test/Makefile b/test/Makefile index 55ee7ea57..4af1d6e34 100644 --- a/test/Makefile +++ b/test/Makefile @@ -5,7 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc. +# Copyright (C) 2017 - 2020, Advanced Micro Devices, Inc. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -173,6 +173,7 @@ blis: \ test_hemv_blis.x \ test_her_blis.x \ test_her2_blis.x \ + test_swapv_blis.x \ test_trmv_blis.x \ test_trsv_blis.x \ \ @@ -191,6 +192,7 @@ openblas: \ test_hemv_openblas.x \ test_her_openblas.x \ test_her2_openblas.x \ + test_swapv_openblas.x \ test_trmv_openblas.x \ test_trsv_openblas.x \ \ @@ -209,6 +211,7 @@ atlas: \ test_hemv_atlas.x \ test_her_atlas.x \ test_her2_atlas.x \ + test_swapv_atlas.x \ test_trmv_atlas.x \ test_trsv_atlas.x \ \ @@ -226,6 +229,7 @@ mkl: test_dotv_mkl.x \ test_hemv_mkl.x \ test_her_mkl.x \ test_her2_mkl.x \ + test_swapv_mkl.x \ test_trmv_mkl.x \ test_trsv_mkl.x \ \ @@ -243,6 +247,7 @@ essl: test_dotv_essl.x \ test_hemv_essl.x \ test_her_essl.x \ test_her2_essl.x \ + test_swapv_essl.x \ test_trmv_essl.x \ test_trsv_essl.x \ \ @@ -260,6 +265,7 @@ mac: test_dotv_mac.x \ test_hemv_mac.x \ test_her_mac.x \ test_her2_mac.x \ + test_swapv_mac.x \ test_trmv_mac.x \ test_trsv_mac.x \ \ @@ -328,4 +334,4 @@ clean: cleanx cleanx: - $(RM_F) *.o *.x - + diff --git a/test/test_swapv.c b/test/test_swapv.c new file mode 100644 index 000000000..715151929 --- /dev/null +++ b/test/test_swapv.c @@ -0,0 +1,181 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include <unistd.h> +#include "blis.h" + +// n x incx y incy +//void dswap_( int*, double*, int*, double*, int* ); +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t x, y; + dim_t n; + dim_t p; + dim_t p_begin, p_end, p_inc; + int n_input; + int r, n_repeats; + num_t dt; + + double dtime; + double dtime_save; + double gflops; + + bli_init(); + + n_repeats = 3; + +#ifndef PRINT + p_begin = 40; + p_end = 8000; + p_inc = 40; + + n_input = -1; +#else + p_begin = 16; + p_end = 16; + p_inc = 1; + + n_input = -1; +#endif + +#if 1 + dt = BLIS_FLOAT; + //dt = BLIS_DOUBLE; +#else + //dt = BLIS_SCOMPLEX; + dt = BLIS_DCOMPLEX; +#endif + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front.
+ for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; +#ifdef BLIS + printf( "data_swapv_blis" ); +#else + printf( "data_swapv_%s", BLAS ); +#endif + printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )0, 0.0 ); + + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) + { + + if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + + bli_obj_create( dt, n, 1, 0, 0, &x ); + bli_obj_create( dt, n, 1, 0, 0, &y ); + + bli_randm( &x ); + bli_randm( &y ); + + dtime_save = 1.0e9; + + for ( r = 0; r < n_repeats; ++r ) + { + + dtime = bli_clock(); + +#ifdef PRINT + bli_printm( "x", &x, "%4.1f", "" ); + bli_printm( "y", &y, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_swapv( &x, + &y + ); +#else + if ( bli_is_float( dt ) ) + { + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + float* xp = bli_obj_buffer( &x ); + float* yp = bli_obj_buffer( &y ); + + sswap_( &nn, + xp, &incx, + yp, &incy ); + + } + else if ( bli_is_double( dt ) ) + { + + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + double* xp = bli_obj_buffer( &x ); + double* yp = bli_obj_buffer( &y ); + + dswap_( &nn, + xp, &incx, + yp, &incy ); + } +#endif + +#ifdef PRINT + bli_printm( "X after", &x, "%4.1f", "" ); + bli_printm( "Y after", &y, "%4.1f", "" ); + + exit(1); +#endif + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + gflops = ( n ) / ( dtime_save * 1.0e9 ); + +#ifdef BLIS + printf( "data_swapv_blis" ); +#else + printf( "data_swapv_%s", BLAS ); +#endif + printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )n, gflops ); + + bli_obj_free( &x ); + bli_obj_free( &y ); + } + + bli_finalize(); + + return 0; +}