Adding a simd kernel for copyv function

Details: - Separate kernel for copyv function added to improve performance. - Modified cntx_init file in zen and zen2 configuration - Added test_copyv.c in test folder - Modified test/Makefile to include test_copyv.c Change-Id: I297f539f2ddd2d71997b127a71a460991cd07b41 Signed-off-by: Kiran N D <kiran.Devrajegowda@amd.com> AMD-Internal: [CPUPL-818]
2026-05-12 01:59:59 +00:00 · 2020-04-16 12:04:10 +05:30
parent ba00f75f64
commit 4caee59466
10 changed files with 896 additions and 35 deletions
--- a/config/zen/bli_cntx_init_zen.c
+++ b/config/zen/bli_cntx_init_zen.c
@@ -82,7 +82,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
 	// Update the context with optimized level-1v kernels.
 	bli_cntx_set_l1v_kers
 	(
-	  12,
+	  14,
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
@@ -110,6 +110,9 @@ void bli_cntx_init_zen( cntx_t* cntx )
 #endif
 	  BLIS_SWAPV_KER,  BLIS_FLOAT,  bli_sswapv_zen_int8,
 	  BLIS_SWAPV_KER,  BLIS_DOUBLE, bli_dswapv_zen_int8,
+ 
+          BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
+          BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,

 	  cntx
 	);
--- a/config/zen2/bli_cntx_init_zen2.c
+++ b/config/zen2/bli_cntx_init_zen2.c
@@ -83,19 +83,20 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
 	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
-	  // axpyv

+	  // axpyv
 	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
 	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,

 	  // dotv
 	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int10,
 	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int10,
-	  // dotxv
+	  
+          // dotxv
 	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
 	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
-	  // scalv

+	  // scalv
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,

@@ -103,7 +104,11 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  BLIS_SWAPV_KER, BLIS_FLOAT,   bli_sswapv_zen_int8,
 	  BLIS_SWAPV_KER, BLIS_DOUBLE,  bli_dswapv_zen_int8,

-	  cntx
+	  //copy
+	  BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
+	  BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,	
+	 
+          cntx
 	);

 	// Initialize level-3 blocksize objects with architecture-specific values.
--- a/frame/compat/bla_copy.c
+++ b/frame/compat/bla_copy.c
@@ -5,18 +5,19 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -62,25 +63,181 @@ void PASTEF77(ch,blasname) \
 \
 	/* If the input increments are negative, adjust the pointers so we can
 	   use positive increments instead. */ \
-	bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
-	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
-\
-	/* Call BLIS interface. */ \
-	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
-	( \
-	  BLIS_NO_CONJUGATE, \
-	  n0, \
-	  x0, incx0, \
-	  y0, incy0, \
-	  NULL, \
-	  NULL  \
-	); \
-\
-	/* Finalize BLIS. */ \
-	bli_finalize_auto(); \
+	   bli_convert_blas_incv(n0, (ftype*)x, *incx, x0, incx0); \
+	   bli_convert_blas_incv(n0, (ftype*)y, *incy, y0, incy0); \
+	   \
+	   /* Call BLIS interface. */ \
+	   PASTEMAC2(ch, blisname, BLIS_TAPI_EX_SUF) \
+	   (\
+	   BLIS_NO_CONJUGATE, \
+	   n0, \
+	   x0, incx0, \
+	   y0, incy0, \
+	   NULL, \
+	   NULL  \
+	   ); \
+	   \
+	   /* Finalize BLIS. */ \
+	   bli_finalize_auto(); \
 }

 #ifdef BLIS_ENABLE_BLAS
-INSERT_GENTFUNC_BLAS( copy, copyv )
-#endif
+#ifdef BLIS_CONFIG_ZEN2

+void scopy_
+(
+	const f77_int* n,
+	const float*   x, const f77_int* incx,
+	float*   y, const f77_int* incy
+)
+{
+	dim_t  n0;
+	float* x0;
+	float* y0;
+	inc_t  incx0;
+	inc_t  incy0;
+
+	/* Initialize BLIS. */
+//  bli_init_auto();
+
+	/* Convert/typecast negative values of n to zero. */
+	if (*n < 0)
+		n0 = (dim_t)0;
+	else
+		n0 = (dim_t)(*n);
+
+	/* If the input increments are negative, adjust the pointers so we can
+	   use positive increments instead. */
+	if (*incx < 0)
+	{
+		/* The semantics of negative stride in BLAS are that the vector
+		operand be traversed in reverse order. (Another way to think
+		of this is that negative strides effectively reverse the order
+		of the vector, but without any explicit data movements.) This
+		is also how BLIS interprets negative strides. The differences
+		is that with BLAS, the caller *always* passes in the 0th (i.e.,
+		top-most or left-most) element of the vector, even when the
+		stride is negative. By contrast, in BLIS, negative strides are
+		used *relative* to the vector address as it is given. Thus, in
+		BLIS, if this backwards traversal is desired, the caller *must*
+		pass in the address to the (n-1)th (i.e., the bottom-most or
+		right-most) element along with a negative stride. */
+
+		x0 = (float*)((x)+(n0 - 1)*(-*incx));
+		incx0 = (inc_t)(*incx);
+
+	}
+	else
+	{
+		x0 = (float*)(x);
+		incx0 = (inc_t)(*incx);
+	}
+
+	if (*incy < 0)
+	{
+		y0 = (y)+(n0 - 1)*(-*incy);
+		incy0 = (inc_t)(*incy);
+
+	}
+	else
+	{
+		y0 = (y);
+		incy0 = (inc_t)(*incy);
+	}
+
+
+	/* Call BLIS kernel */
+	bli_scopyv_zen_int
+	(
+		BLIS_NO_CONJUGATE,
+		n0,
+		x0, incx0,
+		y0, incy0,
+		NULL
+	);
+
+	/* Finalize BLIS. */
+//    bli_finalize_auto();
+}
+
+void dcopy_
+(
+	const f77_int* n,
+	const double*   x, const f77_int* incx,
+	double*   y, const f77_int* incy
+)
+{
+	dim_t  n0;
+	double* x0;
+	double* y0;
+	inc_t  incx0;
+	inc_t  incy0;
+
+	/* Initialize BLIS. */
+//  bli_init_auto();
+
+	/* Convert/typecast negative values of n to zero. */
+	if (*n < 0)
+		n0 = (dim_t)0;
+	else
+		n0 = (dim_t)(*n);
+
+	/* If the input increments are negative, adjust the pointers so we can
+	   use positive increments instead. */
+	if (*incx < 0)
+	{
+		/* The semantics of negative stride in BLAS are that the vector
+		operand be traversed in reverse order. (Another way to think
+		of this is that negative strides effectively reverse the order
+		of the vector, but without any explicit data movements.) This
+		is also how BLIS interprets negative strides. The differences
+		is that with BLAS, the caller *always* passes in the 0th (i.e.,
+		top-most or left-most) element of the vector, even when the
+		stride is negative. By contrast, in BLIS, negative strides are
+		used *relative* to the vector address as it is given. Thus, in
+		BLIS, if this backwards traversal is desired, the caller *must*
+		pass in the address to the (n-1)th (i.e., the bottom-most or
+		right-most) element along with a negative stride. */
+
+		x0 = (double*)((x)+(n0 - 1)*(-*incx));
+		incx0 = (inc_t)(*incx);
+
+	}
+	else
+	{
+		x0 = (double*)(x);
+		incx0 = (inc_t)(*incx);
+	}
+
+	if (*incy < 0)
+	{
+		y0 = (y)+(n0 - 1)*(-*incy);
+		incy0 = (inc_t)(*incy);
+
+	}
+	else
+	{
+		y0 = (y);
+		incy0 = (inc_t)(*incy);
+	}
+
+
+	/* Call BLIS kernel */
+	bli_dcopyv_zen_int
+	(
+		BLIS_NO_CONJUGATE,
+		n0,
+		x0, incx0,
+		y0, incy0,
+		NULL
+	);
+
+	/* Finalize BLIS. */
+//    bli_finalize_auto();
+}
+
+INSERT_GENTFUNC_BLAS_ZEN2(copy, copyv)
+#else
+INSERT_GENTFUNC_BLAS(copy, copyv)
+#endif
+#endif
--- a/frame/compat/cblas/src/cblas_dcopy.c
+++ b/frame/compat/cblas/src/cblas_dcopy.c
@@ -7,7 +7,10 @@
 *
 * Written by Keita Teranishi.  2/11/1998
 *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc.
+ *
 */
+
 #include "cblas.h"
 #include "cblas_f77.h"
 void cblas_dcopy( f77_int N, const double *X,
@@ -20,6 +23,75 @@ void cblas_dcopy( f77_int N, const double *X,
   #define F77_incX incX
   #define F77_incY incY
 #endif
+#ifdef BLIS_CONFIG_ZEN2
+    dim_t  n0;
+    double* x0;
+    double* y0;
+    inc_t  incx0;
+    inc_t  incy0;
+
+    /* Initialize BLIS. */
+//  bli_init_auto();
+
+    /* Convert/typecast negative values of n to zero. */
+    if ( F77_N < 0 ) n0 = ( dim_t )0;
+    else              n0 = ( dim_t )(F77_N);
+
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */
+    if ( F77_incX < 0 )
+    {
+        /* The semantics of negative stride in BLAS are that the vector
+        operand be traversed in reverse order. (Another way to think
+        of this is that negative strides effectively reverse the order
+        of the vector, but without any explicit data movements.) This
+        is also how BLIS interprets negative strides. The differences
+        is that with BLAS, the caller *always* passes in the 0th (i.e.,
+        top-most or left-most) element of the vector, even when the
+        stride is negative. By contrast, in BLIS, negative strides are
+        used *relative* to the vector address as it is given. Thus, in
+        BLIS, if this backwards traversal is desired, the caller *must*
+        pass in the address to the (n-1)th (i.e., the bottom-most or
+        right-most) element along with a negative stride. */
+
+        x0    = (double*)((X) + (n0-1)*(-F77_incX));
+        incx0 = ( inc_t )(F77_incX);
+
+    }
+    else
+    {
+        x0    = (double*)(X);
+        incx0 = ( inc_t )(F77_incX);
+    }
+
+    if ( F77_incY < 0 )
+    {
+        y0    = (Y) + (n0-1)*(-F77_incY);
+        incy0 = ( inc_t )(F77_incY);
+
+    }
+    else
+    {
+        y0    = (Y);
+        incy0 = ( inc_t )(F77_incY);
+    }
+
+
+    /* Call BLIS kernel */
+    bli_dcopyv_zen_int
+    (
+	    BLIS_NO_CONJUGATE,
+        n0,
+        x0, incx0,
+        y0, incy0,
+        NULL
+    );
+
+    /* Finalize BLIS. */
+//    bli_finalize_auto();
+#else
   F77_dcopy( &F77_N, X, &F77_incX, Y, &F77_incY);
+#endif
+   
 }
 #endif
--- a/frame/compat/cblas/src/cblas_scopy.c
+++ b/frame/compat/cblas/src/cblas_scopy.c
@@ -7,6 +7,8 @@
 *
 * Written by Keita Teranishi.  2/11/1998
 *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc.
+ *
 */
 #include "cblas.h"
 #include "cblas_f77.h"
@@ -20,6 +22,77 @@ void cblas_scopy( f77_int N, const float *X,
   #define F77_incX incX
   #define F77_incY incY
 #endif
-   F77_scopy( &F77_N, X, &F77_incX, Y, &F77_incY);
+#ifdef BLIS_CONFIG_ZEN2
+
+    dim_t  n0;
+    float* x0;
+    float* y0;
+    inc_t  incx0;
+    inc_t  incy0;
+
+    /* Initialize BLIS. */
+//  bli_init_auto();
+
+    /* Convert/typecast negative values of n to zero. */
+    if ( F77_N < 0 ) n0 = ( dim_t )0;
+    else              n0 = ( dim_t )(F77_N);
+
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */
+    if ( F77_incX < 0 )
+    {
+        /* The semantics of negative stride in BLAS are that the vector
+        operand be traversed in reverse order. (Another way to think
+        of this is that negative strides effectively reverse the order
+        of the vector, but without any explicit data movements.) This
+        is also how BLIS interprets negative strides. The differences
+        is that with BLAS, the caller *always* passes in the 0th (i.e.,
+        top-most or left-most) element of the vector, even when the
+        stride is negative. By contrast, in BLIS, negative strides are
+        used *relative* to the vector address as it is given. Thus, in
+        BLIS, if this backwards traversal is desired, the caller *must*
+        pass in the address to the (n-1)th (i.e., the bottom-most or
+        right-most) element along with a negative stride. */
+
+        x0    = (float*)((X) + (n0-1)*(-F77_incX));
+        incx0 = ( inc_t )(F77_incX);
+
+    }
+    else
+    {
+        x0    = (float*)(X);
+        incx0 = ( inc_t )(F77_incX);
+    }
+
+    if ( F77_incY < 0 )
+    {
+        y0    = (Y) + (n0-1)*(-F77_incY);
+        incy0 = ( inc_t )(F77_incY);
+
+    }
+    else
+    {
+        y0    = (Y);
+        incy0 = ( inc_t )(F77_incY);
+    }
+
+
+    /* Call BLIS kernel */
+    bli_scopyv_zen_int
+    (
+	    BLIS_NO_CONJUGATE,
+        n0,
+        x0, incx0,
+        y0, incy0,
+        NULL
+    );
+
+    /* Finalize BLIS. */
+//    bli_finalize_auto();
+
+#else
+    F77_scopy( &F77_N, X, &F77_incX, Y, &F77_incY);
+#endif
+
 }
 #endif
--- a/frame/include/bli_gentfunc_macro_defs.h
+++ b/frame/include/bli_gentfunc_macro_defs.h
@@ -56,12 +56,10 @@ GENTFUNC( double,   d, blasname, blisname ) \
 GENTFUNC( scomplex, c, blasname, blisname ) \
 GENTFUNC( dcomplex, z, blasname, blisname )

-
 #define INSERT_GENTFUNC_BLAS_ZEN2( blasname, blisname ) \
 \
 GENTFUNC( scomplex, c, blasname, blisname ) \
 GENTFUNC( dcomplex, z, blasname, blisname )
-
 // -- Basic one-operand macro with real domain only --


--- a/kernels/zen/1/bli_copyv_zen_int.c
+++ b/kernels/zen/1/bli_copyv_zen_int.c
@@ -0,0 +1,330 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+// -----------------------------------------------------------------------------
+
+void bli_scopyv_zen_int
+(
+	conj_t           conjx,
+	dim_t            n,
+	float*  restrict x, inc_t incx,
+	float*  restrict y, inc_t incy,
+	cntx_t* restrict cntx
+)
+{
+	const dim_t   num_elem_per_reg = 8;
+	__m256  xv[16];
+	dim_t i = 0;
+
+	// If the vector dimension is zero return early.
+	if (bli_zero_dim1(n)) return;
+
+	if (incx == 1 && incy == 1)
+	{
+#if 0
+	  PRAGMA_SIMD
+	  for (i = 0; i < n; i++)
+	  {
+	    y[i] = x[i];
+	  }
+#endif
+#if 0
+        memcpy(y, x, n << 2);
+#endif
+#if 1
+
+		// For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128
+		// for example if n = 255
+		// n & ~0x7F results in 128: copy from 0 to 128 happens in first loop
+		// n & ~0x3F results in 192: copy from 128 to 192 happens in second loop
+		// n & ~0x1F results in 224: copy from 128 to 192 happens in third loop and so on.
+		for (i = 0; i < (n & (~0x7F)); i += 128)
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
+			xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8);
+			xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9);
+			xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10);
+			xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11);
+			xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12);
+			xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13);
+			xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14);
+			xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]);
+
+			y += 128;
+			x += 128;
+		}
+		for (; i < (n & (~0x3F)); i += 64)
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
+
+			y += 64;
+			x += 64;
+		}
+		for (; i < (n & (~0x1F)); i += 32)
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
+
+			y += 32;
+			x += 32;
+		}
+		for (; i < (n & (~0x0F)); i += 16)
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+
+			y += 16;
+			x += 16;
+		}
+		for (; i < (n & (~0x07)); i += 8)
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			y += 8;
+			x += 8;
+		}
+		for (; i < n; i++)
+		{
+			*y++ = *x++;
+		}
+#endif
+	}
+	else
+	{
+		for (dim_t i = 0; i < n; ++i)
+		{
+			*y = *x;
+			x += incx;
+			y += incy;
+		}
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_dcopyv_zen_int
+(
+	conj_t           conjx,
+	dim_t            n,
+	double*  restrict x, inc_t incx,
+	double*  restrict y, inc_t incy,
+	cntx_t* restrict cntx
+)
+{
+	const dim_t      num_elem_per_reg = 4;
+	__m256d  xv[16];
+	dim_t i = 0;
+
+	// If the vector dimension is zero return early.
+	if (bli_zero_dim1(n)) return;
+
+	if (incx == 1 && incy == 1)
+	{
+#if 0
+		PRAGMA_SIMD
+			for (i = 0; i < n; ++i)
+			{
+				y[i] = x[i];
+			}
+#endif
+#if 0
+		memcpy(y, x, n << 3);
+#endif
+#if 1
+		// n & (~0x3F) = n & 0xFFFFFFC0 -> this masks the numbers less than 64,
+		// the copy operation will be done for the multiples of 64
+		for (i = 0; i < (n & (~0x3F)); i += 64)
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
+			xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8);
+			xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9);
+			xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10);
+			xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11);
+			xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12);
+			xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13);
+			xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14);
+			xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15);
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]);
+			y += num_elem_per_reg * 16;
+			x += num_elem_per_reg * 16;
+		}
+		for (; i < (n & (~0x1F)); i += 32)
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
+
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
+
+			y += num_elem_per_reg * 8;
+			x += num_elem_per_reg * 8;
+		}
+		for (; i < (n & (~0xF)); i += 16)
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
+
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
+
+			y += num_elem_per_reg * 4;
+			x += num_elem_per_reg * 4;
+		}
+		for (; i < (n & (~0x07)); i += 8)
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+
+			y += num_elem_per_reg * 2;
+			x += num_elem_per_reg * 2;
+		}
+		for (; i < (n & (~0x03)); i += 4)
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			y += num_elem_per_reg;
+			x += num_elem_per_reg;
+		}
+		for (; i < n; i++)
+		{
+			*y++ = *x++;
+		}
+#endif	
+	}
+	else
+	{
+		for ( i = 0; i < n; ++i)
+		{
+			*y = *x;
+
+			x += incx;
+			y += incy;
+		}
+	}
+}
+
--- a/kernels/zen/bli_kernels_zen.h
+++ b/kernels/zen/bli_kernels_zen.h
@@ -71,6 +71,9 @@ SCALV_KER_PROT( double,   d, scalv_zen_int )
 SWAPV_KER_PROT(float, 	s, swapv_zen_int8 )
 SWAPV_KER_PROT(double,	d, swapv_zen_int8 )

+// copyv (intrinsics)
+COPYV_KER_PROT( float,    s, copyv_zen_int )
+COPYV_KER_PROT( double,   d, copyv_zen_int )

 // -- level-1f --

--- a/test/Makefile
+++ b/test/Makefile
@@ -176,13 +176,14 @@ blis: \
       test_swapv_blis.x \
       test_trmv_blis.x \
       test_trsv_blis.x \
+       test_copyv_blis.x \
       \
       test_gemm_blis.x \
       test_hemm_blis.x \
       test_herk_blis.x \
       test_her2k_blis.x \
       test_trmm_blis.x \
-       test_trsm_blis.x
+       test_trsm_blis.x 

 openblas: \
      test_dotv_openblas.x \
@@ -195,6 +196,7 @@ openblas: \
      test_swapv_openblas.x \
      test_trmv_openblas.x \
      test_trsv_openblas.x \
+      test_copyv_openblas.x \
      \
      test_gemm_openblas.x \
      test_hemm_openblas.x \
@@ -214,6 +216,7 @@ atlas: \
      test_swapv_atlas.x \
      test_trmv_atlas.x \
      test_trsv_atlas.x \
+      test_copyv_atlas.x \
      \
      test_gemm_atlas.x \
      test_hemm_atlas.x \
@@ -232,6 +235,7 @@ mkl:  test_dotv_mkl.x \
      test_swapv_mkl.x \
      test_trmv_mkl.x \
      test_trsv_mkl.x \
+      test_copyv_mkl.x \
      \
      test_gemm_mkl.x \
      test_hemm_mkl.x \
@@ -250,6 +254,7 @@ essl: test_dotv_essl.x \
      test_swapv_essl.x \
      test_trmv_essl.x \
      test_trsv_essl.x \
+      test_copyv_essl.x\
      \
      test_gemm_essl.x \
      test_hemm_essl.x \
@@ -268,6 +273,7 @@ mac:  test_dotv_mac.x \
      test_swapv_mac.x \
      test_trmv_mac.x \
      test_trsv_mac.x \
+      test_copyv_mac.x \
      \
      test_gemm_mac.x \
      test_hemm_mac.x \
--- a/test/test_copyv.c
+++ b/test/test_copyv.c
@@ -0,0 +1,214 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <unistd.h>
+#include "blis.h"
+
+
+
+//#define BLIS_ACCURACY_TEST
+#ifdef BLIS_ACCURACY_TEST
+
+bool_t scompare_result(int n, float *x, int incx, float *y, int incy) {
+	for (int i = 0; i < n; i++) {
+		if ((*x) != (*y)) {
+			printf("%4f != %4f at location %d\n", *x, *y, i);
+			return FALSE;
+		}
+		x += incx;
+		y += incy;
+	}
+	return TRUE;
+}
+
+bool_t dcompare_result(int n, double *x, int incx, double *y, int incy) {
+	for (int i = 0; i < n; i++) {
+		if ((*x) != (*y)) {
+			printf("%4f != %4f at location %d\n", *x, *y, i);
+			return FALSE;
+		}
+		x += incx;
+		y += incy;
+	}
+	return TRUE;
+}
+#endif
+
+
+int main(int argc, char** argv)
+{
+	obj_t x, y;
+	dim_t n;
+	dim_t p;
+	dim_t p_begin, p_end, p_inc;
+	int   n_input, sizeof_dt;
+	int   r, n_repeats;
+	num_t dt;
+
+	double dtime;
+	double dtime_save;
+	double Gbps;
+
+	//bli_init();
+
+	n_repeats = 100000;
+
+#ifndef PRINT
+	p_begin = 200;
+	p_end = 100000;
+	p_inc = 200;
+
+	n_input = -1;
+#else
+	p_begin = 16;
+	p_end = 16;
+	p_inc = 1;
+
+	n_input = 16;
+#endif
+
+#if 1
+	 // dt = BLIS_FLOAT;
+	dt = BLIS_DOUBLE;
+#else
+	//dt = BLIS_SCOMPLEX;
+	dt = BLIS_DCOMPLEX;
+#endif
+
+	if (dt == BLIS_DOUBLE)
+		sizeof_dt = sizeof(double);
+	else if (dt == BLIS_FLOAT)
+		sizeof_dt = sizeof(float);
+
+	printf("executable\t n\t GBs per sec\n");
+	for (p = p_begin; p <= p_end; p += p_inc)
+	{
+
+		if (n_input < 0) n = p * (dim_t)abs(n_input);
+		else               n = (dim_t)n_input;
+
+		bli_obj_create(dt, n, 1, 0, 0, &x);
+		bli_obj_create(dt, n, 1, 0, 0, &y);
+		bli_randm(&x);
+
+
+		dtime_save = DBL_MAX;
+
+		for (r = 0; r < n_repeats; ++r)
+		{
+			dtime = bli_clock();
+
+#ifdef BLIS
+			bli_copyv(&x,
+				&y
+			);
+#else
+			if (bli_is_float(dt))
+			{
+				f77_int nn = bli_obj_length(&x);
+				f77_int incx = bli_obj_vector_inc(&x);
+				float*  xp = bli_obj_buffer(&x);
+				f77_int incy = bli_obj_vector_inc(&y);
+				float*  yp = bli_obj_buffer(&y);
+
+				scopy_(&nn,
+					xp, &incx,
+					yp, &incy);
+
+			}
+			else if (bli_is_double(dt))
+			{
+
+				f77_int  nn = bli_obj_length(&x);
+				f77_int  incx = bli_obj_vector_inc(&x);
+				double*  xp = bli_obj_buffer(&x);
+				f77_int incy = bli_obj_vector_inc(&y);
+				double*  yp = bli_obj_buffer(&y);
+
+				dcopy_(&nn,
+					xp, &incx,
+					yp, &incy
+				);
+			}
+#endif
+			dtime_save = bli_clock_min_diff(dtime_save, dtime);
+#ifdef BLIS_ACCURACY_TEST
+			if (dt == BLIS_FLOAT) {
+				int nn = bli_obj_length(&x);
+				int incx = bli_obj_vector_inc(&x);
+				float*  xp = bli_obj_buffer(&x);
+				int incy = bli_obj_vector_inc(&y);
+				float*  yp = bli_obj_buffer(&y);
+				if (scompare_result(nn, xp, incx, yp, incy))
+					printf("Copy Successful\n");
+				else
+					printf("ALERT!!! Copy Failed\n");
+			}
+			if (dt == BLIS_DOUBLE) {
+				int nn = bli_obj_length(&x);
+				int incx = bli_obj_vector_inc(&x);
+				double*  xp = bli_obj_buffer(&x);
+				int incy = bli_obj_vector_inc(&y);
+				double*  yp = bli_obj_buffer(&y);
+				if (dcompare_result(nn, xp, incx, yp, incy))
+					printf("Copy Successful\n");
+				else
+					printf("ALERT!!! Copy Failed\n");
+			}
+#endif
+		}
+		// Size of the vectors are incrementd by 1000, to test wide range of inputs.
+		if (p >= 1000)
+			p_inc = 1000;
+
+		if (p >= 10000)
+			p_inc = 10000;
+		Gbps = (n * sizeof_dt) / (dtime_save * 1.0e9);
+#ifdef BLIS
+		printf("data_copyv_blis\t");
+#else
+		printf("data_copyv_%s\t", BLAS);
+#endif
+		printf("%4lu\t %7.2f\n", 
+			(unsigned long)n, Gbps);
+
+		bli_obj_free(&x);
+		bli_obj_free(&y);
+	}
+
+	//	bli_finalize();
+
+	return 0;
+}
+