Added optimized kernels for SWAPV

Details: - Added SIMD kernels for SWAPV in both single and double precision. - Modified the cntx_init files for the zen and zen2 configurations to select the optimized SWAPV kernels. - Added test_swapv.c to the test folder. - Modified test/Makefile to include test_swapv.c. Change-Id: Ida786eec722e634aee0dacdd51c327823c80f01a Signed-off-by: Meghana Vankadari <Meghana.Vankadari@amd.com> AMD-Internal: [CPUPL-847]
2026-05-12 01:59:59 +00:00 · 2020-04-17 12:02:46 +05:30
parent 489d501f2e
commit b846059bcf
6 changed files with 564 additions and 5 deletions
--- a/config/zen/bli_cntx_init_zen.c
+++ b/config/zen/bli_cntx_init_zen.c
@@ -82,7 +82,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	// Update the context with optimized level-1v kernels.
 	bli_cntx_set_l1v_kers
 	(
-	  10,
+	  12,
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
@@ -108,6 +108,9 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
 #endif
+	  BLIS_SWAPV_KER,  BLIS_FLOAT,  bli_sswapv_zen_int8,
+	  BLIS_SWAPV_KER,  BLIS_DOUBLE, bli_dswapv_zen_int8,
+
 	  cntx
 	);

--- a/config/zen2/bli_cntx_init_zen2.c
+++ b/config/zen2/bli_cntx_init_zen2.c
@@ -79,7 +79,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	// Update the context with optimized level-1v kernels.
 	bli_cntx_set_l1v_kers
 	(
-	  10,
+	  12,
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
@@ -99,6 +99,10 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,

+	  // swapv
+	  BLIS_SWAPV_KER, BLIS_FLOAT,   bli_sswapv_zen_int8,
+	  BLIS_SWAPV_KER, BLIS_DOUBLE,  bli_dswapv_zen_int8,
+
 	  cntx
 	);

--- a/kernels/zen/1/bli_swapv_zen_int8.c
+++ b/kernels/zen/1/bli_swapv_zen_int8.c
@@ -0,0 +1,359 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+
+/* Union giving both whole-register and per-element access to AVX data.
+   One 256-bit AVX register holds 8 SP elements. */
+typedef union
+{
+    __m256  v;                                  // the register as one vector value
+    float   f[8] __attribute__((aligned(64)));  // the same storage viewed as 8 floats
+} v8sf_t;
+
+/* Union giving both whole-register and per-element access to AVX data.
+   One 256-bit AVX register holds 4 DP elements. */
+typedef union
+{
+    __m256d v;                                  // the register as one vector value
+    double  d[4] __attribute__((aligned(64)));  // the same storage viewed as 4 doubles
+} v4df_t;
+
+// -----------------------------------------------------------------------------
+
+void bli_sswapv_zen_int8
+    (
+     dim_t      n,
+     float* restrict x, inc_t incx,
+     float* restrict y, inc_t incy,
+     cntx_t* restrict cntx
+     )
+{
+    // Exchanges the n floats of x with those of y (strides incx/incy); cntx is not referenced here.
+    const dim_t n_elem_per_reg = 8; // floats held by one 256-bit register
+
+    dim_t i;
+    // Cursors that walk through x and y; the incoming x/y pointers are never modified.
+    float* restrict x0;
+    float* restrict y0;
+    // Staging registers: up to 8 vectors (64 floats) from each operand per pass.
+    __m256      xv[8];
+    __m256      yv[8];
+
+    // If the vector dimension is zero, return early.
+    if ( bli_zero_dim1( n )) return;
+
+    x0 = x;
+    y0 = y;
+    // Vectorized path: taken only when both vectors have unit stride.
+    if(incx == 1 && incy == 1)
+    {
+        for( i = 0; ( i + 63 ) < n; i += 64 )
+        {
+            // 64 floats per pass: read both operands completely before any store.
+            xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+            xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+            xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+            xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+            xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
+            xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
+            xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
+            xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
+
+            yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+            yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+            yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+            yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+            yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
+            yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg );
+            yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg );
+            yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg );
+            // Write the data read from y into x ...
+            _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]);
+            _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]);
+            _mm256_storeu_ps( (x0 + 2*n_elem_per_reg), yv[2]);
+            _mm256_storeu_ps( (x0 + 3*n_elem_per_reg), yv[3]);
+            _mm256_storeu_ps( (x0 + 4*n_elem_per_reg), yv[4]);
+            _mm256_storeu_ps( (x0 + 5*n_elem_per_reg), yv[5]);
+            _mm256_storeu_ps( (x0 + 6*n_elem_per_reg), yv[6]);
+            _mm256_storeu_ps( (x0 + 7*n_elem_per_reg), yv[7]);
+            // ... and the data read from x into y.
+            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]);
+            _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]);
+            _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), xv[2]);
+            _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), xv[3]);
+            _mm256_storeu_ps( (y0 + 4*n_elem_per_reg), xv[4]);
+            _mm256_storeu_ps( (y0 + 5*n_elem_per_reg), xv[5]);
+            _mm256_storeu_ps( (y0 + 6*n_elem_per_reg), xv[6]);
+            _mm256_storeu_ps( (y0 + 7*n_elem_per_reg), xv[7]);
+
+            x0 += 8*n_elem_per_reg;
+            y0 += 8*n_elem_per_reg;
+        }
+        // Same exchange on what is left, 32 floats (4 registers) per pass.
+        for( ; ( i + 31 ) < n; i += 32 )
+        {
+            // Read 4 registers from each operand before storing.
+            xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+            xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+            xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
+            xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
+
+            yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+            yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+            yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
+            yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
+
+            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]);
+            _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]);
+            _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), xv[2]);
+            _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), xv[3]);
+    
+            _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]);
+            _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]);
+            _mm256_storeu_ps( (x0 + 2*n_elem_per_reg), yv[2]);
+            _mm256_storeu_ps( (x0 + 3*n_elem_per_reg), yv[3]);
+    
+            x0 += 4*n_elem_per_reg;
+            y0 += 4*n_elem_per_reg;
+        }
+
+        // Then 16 floats (2 registers) per pass.
+        for( ; ( i + 15 ) < n; i += 16 )
+        {
+            // Read 2 registers from each operand before storing.
+            xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+            xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+
+            yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+            yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+
+            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]);
+            _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]);
+
+            _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]);
+            _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]);
+    
+            x0 += 2*n_elem_per_reg;
+            y0 += 2*n_elem_per_reg;
+        }
+        // Then 8 floats (a single register) per pass.
+        for( ; ( i + 7 ) < n; i += 8 )
+        {
+            // Read one register from each operand before storing.
+            xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+
+            yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+
+            _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]);
+        
+            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]);
+
+            x0 += 1*n_elem_per_reg;
+            y0 += 1*n_elem_per_reg;
+        }
+        // Scalar cleanup of the final n % 8 elements; indexes x/y with i, which matches the offset x0/y0 have reached.
+        for( ; (i + 0) < n; i += 1 )
+        {
+            PASTEMAC(s,swaps)(x[i], y[i]);
+        }
+    }
+    else
+    {
+        for ( i = 0; i < n; ++i )
+        {
+            PASTEMAC(s,swaps)((*x0), (*y0));
+            // Step each cursor by its own stride (swaps macro is defined outside this file).
+            x0 += incx;
+            y0 += incy;
+        }
+    }
+
+}
+
+//--------------------------------------------------------------------------------
+
+void bli_dswapv_zen_int8
+    (
+     dim_t      n,
+     double* restrict x, inc_t incx,
+     double* restrict y, inc_t incy,
+     cntx_t* restrict cntx
+     )
+{
+    // Exchanges the n doubles of x with those of y (strides incx/incy); cntx is not referenced here.
+    const dim_t n_elem_per_reg = 4; // doubles held by one 256-bit register
+
+    dim_t i = 0;
+    // Cursors that walk through x and y; the incoming x/y pointers are never modified.
+    double* restrict x0;
+    double* restrict y0;
+    // Staging registers: up to 8 vectors (32 doubles) from each operand per pass.
+    __m256d     xv[8];
+    __m256d     yv[8];
+
+    // If the vector dimension is zero, return early.
+    if ( bli_zero_dim1( n )) return;
+
+    x0 = x;
+    y0 = y;
+    // Vectorized path: taken only when both vectors have unit stride.
+    if(incx == 1 && incy == 1)
+    {
+        // i was set to 0 at its declaration, so this first loop has no init clause.
+        for( ; ( i + 31 ) < n; i += 32 )
+        {
+
+            // 32 doubles per pass: read both operands completely before any store.
+            xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+            xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+            xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+            xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+            xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
+            xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
+            xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
+            xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
+
+            yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+            yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+            yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+            yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
+            yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg );
+            yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg );
+            yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg );
+            // Write the data read from y into x ...
+            _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]);
+            _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]);
+            _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), yv[2]);
+            _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), yv[3]);
+            _mm256_storeu_pd( (x0 + 4*n_elem_per_reg), yv[4]);
+            _mm256_storeu_pd( (x0 + 5*n_elem_per_reg), yv[5]);
+            _mm256_storeu_pd( (x0 + 6*n_elem_per_reg), yv[6]);
+            _mm256_storeu_pd( (x0 + 7*n_elem_per_reg), yv[7]);
+            // ... and the data read from x into y.
+            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]);
+            _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]);
+            _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), xv[2]);
+            _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), xv[3]);
+            _mm256_storeu_pd( (y0 + 4*n_elem_per_reg), xv[4]);
+            _mm256_storeu_pd( (y0 + 5*n_elem_per_reg), xv[5]);
+            _mm256_storeu_pd( (y0 + 6*n_elem_per_reg), xv[6]);
+            _mm256_storeu_pd( (y0 + 7*n_elem_per_reg), xv[7]);
+
+            x0 += 8*n_elem_per_reg;
+            y0 += 8*n_elem_per_reg;
+        }
+        // Same exchange on what is left, 16 doubles (4 registers) per pass.
+        for( ; ( i + 15 ) < n; i += 16 )
+        {
+            // Read 4 registers from each operand before storing.
+            xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+            xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+            xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+            xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+
+            yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+            yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+            yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+
+            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]);
+            _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]);
+            _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), xv[2]);
+            _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), xv[3]);
+
+            _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]);
+            _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]);
+            _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), yv[2]);
+            _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), yv[3]);
+        
+            x0 += 4*n_elem_per_reg;
+            y0 += 4*n_elem_per_reg;
+        }
+        // Then 8 doubles (2 registers) per pass.
+        for( ; ( i + 7 ) < n; i += 8 )
+        {
+            // Read 2 registers from each operand before storing.
+            xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+            xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+
+            yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+            yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+
+            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]);
+            _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]);
+
+            _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]);
+            _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]);
+        
+            x0 += 2*n_elem_per_reg;
+            y0 += 2*n_elem_per_reg;
+        }
+        // Then 4 doubles (a single register) per pass.
+        for( ; ( i + 3 ) < n; i += 4 )
+        {
+            // Read one register from each operand before storing.
+            xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+
+            yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+
+            _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]);
+
+            _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]);
+        
+            x0 += 1*n_elem_per_reg;
+            y0 += 1*n_elem_per_reg;
+        }
+        // Scalar cleanup of the final n % 4 elements; indexes x/y with i, which matches the offset x0/y0 have reached.
+        for( ; (i + 0) < n; i += 1 )
+        {
+            PASTEMAC(d,swaps)( x[i], y[i]);
+        }
+    }
+    else
+    {
+        for ( i = 0; i < n; ++i )
+        {
+            PASTEMAC(d,swaps) ((*x0), (*y0));
+            // Step each cursor by its own stride (swaps macro is defined outside this file).
+            x0 += incx;
+            y0 += incy;
+        }
+    }
+
+}
+
--- a/kernels/zen/bli_kernels_zen.h
+++ b/kernels/zen/bli_kernels_zen.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -66,6 +67,11 @@ SCALV_KER_PROT( double,   d, scalv_zen_int )
 	SCALV_KER_PROT( float,    s, scalv_zen_int10 )
 	SCALV_KER_PROT( double,   d, scalv_zen_int10 )

+// swapv (intrinsics)
+SWAPV_KER_PROT(float, 	s, swapv_zen_int8 )
+SWAPV_KER_PROT(double,	d, swapv_zen_int8 )
+
+
 // -- level-1f --

 // axpyf (intrinsics)
@@ -128,4 +134,4 @@ GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_5x16n )
 GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_4x16n )
 GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_3x16n )
 GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_2x16n )
-GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_1x16n )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_1x16n )
--- a/test/Makefile
+++ b/test/Makefile
@@ -5,7 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
+#  Copyright (C) 2017 - 2020, Advanced Micro Devices, Inc.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -173,6 +173,7 @@ blis: \
       test_hemv_blis.x \
       test_her_blis.x \
       test_her2_blis.x \
+       test_swapv_blis.x \
       test_trmv_blis.x \
       test_trsv_blis.x \
       \
@@ -191,6 +192,7 @@ openblas: \
      test_hemv_openblas.x \
      test_her_openblas.x \
      test_her2_openblas.x \
+      test_swapv_openblas.x \
      test_trmv_openblas.x \
      test_trsv_openblas.x \
      \
@@ -209,6 +211,7 @@ atlas: \
      test_hemv_atlas.x \
      test_her_atlas.x \
      test_her2_atlas.x \
+      test_swapv_atlas.x \
      test_trmv_atlas.x \
      test_trsv_atlas.x \
      \
@@ -226,6 +229,7 @@ mkl:  test_dotv_mkl.x \
      test_hemv_mkl.x \
      test_her_mkl.x \
      test_her2_mkl.x \
+      test_swapv_mkl.x \
      test_trmv_mkl.x \
      test_trsv_mkl.x \
      \
@@ -243,6 +247,7 @@ essl: test_dotv_essl.x \
      test_hemv_essl.x \
      test_her_essl.x \
      test_her2_essl.x \
+      test_swapv_essl.x \
      test_trmv_essl.x \
      test_trsv_essl.x \
      \
@@ -260,6 +265,7 @@ mac:  test_dotv_mac.x \
      test_hemv_mac.x \
      test_her_mac.x \
      test_her2_mac.x \
+      test_swapv_mac.x \
      test_trmv_mac.x \
      test_trsv_mac.x \
      \
@@ -328,4 +334,4 @@ clean: cleanx

 cleanx:
 	- $(RM_F) *.o *.x
-
+	
--- a/test/test_swapv.c
+++ b/test/test_swapv.c
@@ -0,0 +1,181 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+
+//               n     x      incx      y        incy
+//void  dswap_( int*, double*, int*, double*,   int* );
+//#define PRINT
+// Timing driver for swapv: calls bli_swapv (BLIS defined) or sswap_/dswap_ and prints one result row per problem size.
+int main( int argc, char** argv )
+{
+    obj_t x, y;
+    dim_t n;
+    dim_t p;
+    dim_t p_begin, p_end, p_inc;
+    int   n_input;
+    int   r, n_repeats;
+    num_t dt;
+    // dtime holds the start stamp of one run; dtime_save is carried through bli_clock_min_diff across repeats.
+    double dtime;
+    double dtime_save;
+    double gflops;
+
+    bli_init();
+
+    n_repeats = 3;
+    // Sizes p run over [p_begin, p_end] in steps of p_inc; defining PRINT selects the single size 16.
+#ifndef PRINT
+    p_begin = 40;
+    p_end   = 8000;
+    p_inc   = 40;
+
+    n_input = -1;
+#else
+    p_begin = 16;
+    p_end   = 16;
+    p_inc   = 1;
+
+    n_input = -1;
+#endif
+    // Datatype under test; as written, the "#if 1" arm selects BLIS_FLOAT.
+#if 1
+    dt = BLIS_FLOAT;
+    //dt = BLIS_DOUBLE;
+#else
+    //dt = BLIS_SCOMPLEX;
+    dt = BLIS_DCOMPLEX;
+#endif
+
+    // The empty loop advances p to the last size that will be run; a zero row is
+    // printed at that index so that matlab allocates the entire array once up-front.
+    for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
+#ifdef BLIS
+    printf( "data_swapv_blis" );
+#else
+    printf( "data_swapv_%s", BLAS );
+#endif
+    printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
+            ( unsigned long )(p - p_begin)/p_inc + 1,
+            ( unsigned long )0, 0.0 );
+
+    // Sizes are visited from largest to smallest (ascending form: for ( p = p_begin; p <= p_end; p += p_inc )).
+    for ( p = p_end; p_begin <= p; p -= p_inc )
+    {
+        // A negative n_input scales p by |n_input|; otherwise n is fixed at n_input.
+        if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
+        else               n =     ( dim_t )    n_input;
+
+        bli_obj_create( dt, n, 1, 0, 0, &x );
+        bli_obj_create( dt, n, 1, 0, 0, &y );
+
+        bli_randm( &x );
+        bli_randm( &y );
+
+        dtime_save = 1.0e9;
+        // Every repeat swaps the same two objects again, so x and y trade contents on each pass.
+        for ( r = 0; r < n_repeats; ++r )
+        {
+
+            dtime = bli_clock();
+
+#ifdef PRINT
+            bli_printm( "x", &x, "%4.1f", "" );
+            bli_printm( "y", &y, "%4.1f", "" );
+#endif
+
+#ifdef BLIS
+
+            bli_swapv( &x,
+                      &y
+                      );
+#else
+            if ( bli_is_float( dt ) )
+            {
+                f77_int nn     = bli_obj_length( &x );
+                f77_int incx   = bli_obj_vector_inc( &x );
+                f77_int incy   = bli_obj_vector_inc( &y );
+                float*  xp     = bli_obj_buffer( &x );
+                float*  yp     = bli_obj_buffer( &y );
+
+                sswap_( &nn,
+                               xp, &incx,
+                               yp, &incy );
+
+            }
+            else if ( bli_is_double( dt ) )
+            {
+                // NOTE: only float and double are dispatched in this branch; any other dt calls nothing.
+                f77_int  nn     = bli_obj_length( &x );
+                f77_int  incx   = bli_obj_vector_inc( &x );
+                f77_int  incy   = bli_obj_vector_inc( &y );
+                double*  xp     = bli_obj_buffer( &x );
+                double*  yp     = bli_obj_buffer( &y );
+
+                dswap_( &nn,
+                               xp, &incx,
+                               yp, &incy );
+            }
+#endif
+
+#ifdef PRINT
+            bli_printm( "X after", &x, "%4.1f", "" );
+            bli_printm( "Y after", &y, "%4.1f", "" );
+            // PRINT mode stops the program after the first swap has been displayed.
+            exit(1);
+#endif
+
+            dtime_save = bli_clock_min_diff( dtime_save, dtime );
+        }
+        // Reported under the name gflops: n divided by (time * 1e9), i.e. one unit counted per element.
+        gflops = ( n ) / ( dtime_save * 1.0e9 );
+
+#ifdef BLIS
+        printf( "data_swapv_blis" );
+#else
+        printf( "data_swapv_%s", BLAS );
+#endif
+        printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
+            ( unsigned long )(p - p_begin)/p_inc + 1,
+                ( unsigned long )n, gflops );
+
+        bli_obj_free( &x );
+        bli_obj_free( &y );
+    }
+
+    bli_finalize();
+
+    return 0;
+}