Optimized dher2 implementation

- Implemented her2 framework calls for the transposed and
  non-transposed kernel variants.

- The dher2 kernel operates over 4 columns at a time. It computes
  the 4x4 triangular part of the matrix first, and the remaining
  part is computed in 4x4 tiles up to m rows.

- Remainder cases (fewer than 4 rows left) are handled serially.

AMD-Internal: [CPUPL-1968]

Change-Id: I12ae97b2ad673a7fd9b733c607f27b1089142313
This commit is contained in:
Harsh Dave
2021-12-17 02:34:52 -06:00
parent 8b5b2707c1
commit 351269219f
7 changed files with 818 additions and 15 deletions

View File

@@ -218,9 +218,15 @@ void PASTEMAC(ch,varname) \
#ifdef BLIS_CONFIG_EPYC
void post_hemv_8x8(double *a, double *x,
double *y, double *alpha,
dim_t cs_a, dim_t rs_a);
void bli_post_hemv_8x8
(
double *a,
double *x,
double *y,
double *alpha,
dim_t cs_a,
dim_t rs_a
);
void bli_dhemv_unf_var1
(

View File

@@ -217,6 +217,17 @@ void PASTEMAC(ch,varname) \
}
#ifdef BLIS_CONFIG_EPYC
void bli_pre_hemv_8x8
(
double *a,
double *x,
double *y,
double *alpha,
dim_t cs_a,
dim_t rs_a
);
void bli_dhemv_unf_var3
(
uplo_t uplo,

View File

@@ -158,5 +158,217 @@ void PASTEMAC(ch,varname) \
} \
}
INSERT_GENTFUNC_BASIC0( her2_unf_var1 )
#ifdef BLIS_CONFIG_EPYC
/**
* Following is function declaration
* that computes her2 for transposed case.
* It handles triangular part of matrix and
* remaining computation in optimal way to
* gain performance improvement.
* a is triangular matrix, x and y are vectors
*/
void bli_dher2_trans_zen_int_4
(
double *a,
double *x,
double *y,
double *alpha,
dim_t m,
dim_t lda
);
void bli_dher2_unf_var1
(
uplo_t uplo,
conj_t conjx,
conj_t conjy,
conj_t conjh,
dim_t m,
double* alpha,
double* x, inc_t incx,
double* y, inc_t incy,
double* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
)
{
const num_t dt = PASTEMAC(d,type);
double* x0;
double* chi1;
double* y0;
double* psi1;
double* c10t;
double* gamma11;
double alpha0;
double alpha1;
double alpha0_chi1;
double alpha1_psi1;
double alpha0_chi1_psi1;
double conjx0_chi1;
double conjy1_psi1;
double conjy0_psi1;
dim_t i;
dim_t n_behind;
inc_t rs_ct, cs_ct;
conj_t conj0, conj1;
/* The algorithm will be expressed in terms of the lower triangular
* case;the upper triangular case is supported by swapping the row
* and column strides of A and toggling some conj parameters.
*/
if ( bli_is_lower( uplo ) )
{
rs_ct = rs_c;
cs_ct = cs_c;
PASTEMAC(d,copys)( *alpha, alpha0 );
PASTEMAC(d,copycjs)( conjh, *alpha, alpha1 );
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_ct = cs_c;
cs_ct = rs_c;
/* Toggle conjugation of conjx/conjy, but only if we are being
* invoked as her2; for syr2, conjx/conjy are unchanged.
*/
conjx = bli_apply_conj( conjh, conjx );
conjy = bli_apply_conj( conjh, conjy );
PASTEMAC(d,copycjs)( conjh, *alpha, alpha0 );
PASTEMAC(d,copys)( *alpha, alpha1 );
}
/* Apply conjh (which carries the conjugation component of the
* Hermitian transpose, if applicable) to conjx and/or conjy as
* needed to arrive at the effective conjugation for the vector
* subproblems.
*/
conj0 = bli_apply_conj( conjh, conjy );
conj1 = bli_apply_conj( conjh, conjx );
PASTECH(d,axpy2v_ker_ft) kfp_2v;
/* Query the context for the kernel function pointer. */
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
if( (incx == 1) && (incy == 1) && (rs_ct == 1))
{
for ( i = 0; i < m; )
{
n_behind = i;
x0 = x + (0 )*incx;
chi1 = x + (i )*incx;
y0 = y + (0 )*incy;
psi1 = y + (i )*incy;
c10t = c + (i )*rs_ct + (0 )*cs_ct;
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
if((n_behind >= 3))
{
bli_dher2_trans_zen_int_4(c10t, x0, y0, &alpha0, n_behind + 1, cs_ct);
i+=4;
}
else
{
/* Apply conjx and/or conjy to chi1 and/or psi1. */
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have already been conjugated, if needed,
* by conjx and conjy.
*/
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
alpha0_chi1_psi1 );
/* c10t = c10t + alpha * chi1 * y0'; */
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
kfp_2v
(
conj0,
conj1,
n_behind,
&alpha0_chi1,
&alpha1_psi1,
y0, incy,
x0, incx,
c10t, cs_ct,
cntx
);
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
+ conj(alpha) * psi1 * conj(chi1); */
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
i+=1;
}
}
}
else
{
for ( i = 0; i < m; ++i )
{
n_behind = i;
x0 = x + (0 )*incx;
chi1 = x + (i )*incx;
y0 = y + (0 )*incy;
psi1 = y + (i )*incy;
c10t = c + (i )*rs_ct + (0 )*cs_ct;
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
/* Apply conjx and/or conjy to chi1 and/or psi1. */
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have already been conjugated, if needed,
* by conjx and conjy.
*/
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
alpha0_chi1_psi1 );
/* c10t = c10t + alpha * chi1 * y0'; */
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
kfp_2v
(
conj0,
conj1,
n_behind,
&alpha0_chi1,
&alpha1_psi1,
y0, incy,
x0, incx,
c10t, cs_ct,
cntx
);
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
+ conj(alpha) * psi1 * conj(chi1); */
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
}
}
}
GENTFUNC(float, s, her2_unf_var1)
GENTFUNC(scomplex, c, her2_unf_var1)
GENTFUNC(dcomplex, z,her2_unf_var1)
#else
INSERT_GENTFUNC_BASIC0( her2_unf_var1 )
#endif

View File

@@ -166,5 +166,192 @@ void PASTEMAC(ch,varname) \
} \
}
INSERT_GENTFUNC_BASIC0( her2_unf_var4 )
#ifdef BLIS_CONFIG_EPYC
/**
* Following is function declaration
* that computes her2 for transposed case.
* It handles triangular part of matrix and
* remaining computation in optimal way to
* gain performance improvement.
* a is triangular matrix, x and y are vectors
*/
void bli_dher2_zen_int_4
(
double *a,
double *x,
double *y,
double *alpha,
dim_t m,
dim_t lda
);
void bli_dher2_unf_var4
(
uplo_t uplo,
conj_t conjx,
conj_t conjy,
conj_t conjh,
dim_t m,
double* alpha,
double* x, inc_t incx,
double* y, inc_t incy,
double* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
)
{
double* chi1;
double* x2;
double* psi1;
double* y2;
double* gamma11;
double* c21;
double alpha0;
double alpha0_psi1;
double alpha1_chi1;
double alpha0_chi1_psi1;
dim_t i;
dim_t n_ahead;
inc_t rs_ct, cs_ct;
const num_t dt = PASTEMAC(d,type);
/* The algorithm will be expressed in terms of the lower triangular
* case; the upper triangular case is supported by swapping the row
* and column strides of A and toggling some conj parameters.
*/
if ( bli_is_lower( uplo ) )
{
rs_ct = rs_c;
cs_ct = cs_c;
PASTEMAC(d,copys)( *alpha, alpha0 );
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_ct = cs_c;
cs_ct = rs_c;
/* Toggle conjugation of conjx/conjy, but only if we are being
* invoked as her2; for syr2, conjx/conjy are unchanged.
*/
PASTEMAC(d,copys)( *alpha, alpha0 );
}
/* Apply conjh (which carries the conjugation component of the
* Hermitian transpose, if applicable) to conjx and/or conjy as
* needed to arrive at the effective conjugation for the vector
* subproblems.
*/
PASTECH(d,axpy2v_ker_ft) kfp_2v;
/* Query the context for the kernel function pointer. */
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
if((incx == 1) && (incy == 1) && (rs_ct == 1))
{
for ( i = 0; i < m; )
{
n_ahead = m - i - 1;
chi1 = x + (i ) * incx;
x2 = x + (i+1) * incx;
psi1 = y + (i ) * incy;
y2 = y + (i+1) * incy;
gamma11 = c + (i ) + (i )*cs_ct;
c21 = c + (i+1) + (i )*cs_ct;
if((n_ahead >= 3))
{
bli_dher2_zen_int_4(gamma11, chi1, psi1, &alpha0, n_ahead + 1, cs_ct);
i+= 4;
}
else
{
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have
already been conjugated, if needed, by conjx and
conjy. */
PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
alpha0_chi1_psi1 );
/* c21 = c21 + alpha * x2 * conj(psi1); */
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
kfp_2v
(
conjx,
conjy,
n_ahead,
&alpha0_psi1,
&alpha1_chi1,
x2, incx,
y2, incy,
c21, rs_ct,
cntx
);
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
i+=1;
}
}
}
else
{
for ( i = 0; i < m; ++i)
{
n_ahead = m - i - 1;
chi1 = x + (i ) * incx;
x2 = x + (i+1) * incx;
psi1 = y + (i ) * incy;
y2 = y + (i+1) * incy;
gamma11 = c + (i ) + (i )*cs_ct;
c21 = c + (i+1) + (i )*cs_ct;
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have
already been conjugated, if needed, by conjx and
conjy. */
PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
alpha0_chi1_psi1 );
/* c21 = c21 + alpha * x2 * conj(psi1); */
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
kfp_2v
(
conjx,
conjy,
n_ahead,
&alpha0_psi1,
&alpha1_chi1,
x2, incx,
y2, incy,
c21, rs_ct,
cntx
);
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
}
}
}
GENTFUNC(float, s, her2_unf_var4)
GENTFUNC(scomplex, c, her2_unf_var4)
GENTFUNC(dcomplex, z,her2_unf_var4)
#else
INSERT_GENTFUNC_BASIC0( her2_unf_var4 )
#endif

View File

@@ -3,6 +3,7 @@
target_sources("${PROJECT_NAME}"
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_zen_ref.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_zen_int_4.c
)

View File

@@ -0,0 +1,396 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "immintrin.h"
#include "blis.h"
void bli_dher2_trans_zen_int_4
(
double *a,
double *x,
double *y,
double *alpha,
dim_t m,
dim_t lda
)
{
dim_t row = 0;
dim_t rem = m % 4;
/*holds 4 diagonal elements of triangular part of 4x4 tile*/
double a_diag[4] = {0};
/*alpha_chi holds x*alpha and alpha_psi holds y*alpha*/
double alpha_chi[4] = {0};
double alpha_psi[4] = {0};
/*Extracts diagonal element and store into a_diag buffer*/
PRAGMA_SIMD
for(dim_t i = 0; i < 4; i++)
{
a_diag[i] = *(a + m + i + (i * lda));
}
__m256d x0, x1, x2, x3;
__m256d y0, y1, y2, y3;
__m256d xr, yr, zero, gamma;
__m256d a0, a1, a2, a3;
zero = _mm256_setzero_pd();
/*Loading elements of x & y vectors*/
x0 = _mm256_loadu_pd(x + m);
y0 = _mm256_loadu_pd(y + m);
/*Broadcasting alpha to compute alpha_psi and alpha_chi*/
x1 = _mm256_broadcast_sd(alpha);
x2 = _mm256_mul_pd(x0, x1);
y0 = _mm256_mul_pd(y0, x1);
/*Storing alpha_chi and alpha_psi for later usage in computation loop*/
_mm256_storeu_pd(alpha_chi, x2);
_mm256_storeu_pd(alpha_psi, y0);
x0 = _mm256_mul_pd(x0, y0);
gamma = _mm256_loadu_pd(a_diag);
gamma = _mm256_add_pd(gamma, x0);
gamma = _mm256_add_pd(gamma, x0);
_mm256_storeu_pd(a_diag, gamma);
/* Broadcasting 4 alpha_psis and alpha_chis which
* are to be used througout the computation of 4x4 tile
* upto m rows.
*/
x0 = _mm256_broadcast_sd(&alpha_chi[0]);
x1 = _mm256_broadcast_sd(&alpha_chi[1]);
x2 = _mm256_broadcast_sd(&alpha_chi[2]);
x3 = _mm256_broadcast_sd(&alpha_chi[3]);
y0 = _mm256_broadcast_sd(&alpha_psi[0]);
y1 = _mm256_broadcast_sd(&alpha_psi[1]);
y2 = _mm256_broadcast_sd(&alpha_psi[2]);
y3 = _mm256_broadcast_sd(&alpha_psi[3]);
/* Loading 4x4 tile of A matrix for
* triangular part computation
*/
a0 = _mm256_loadu_pd(a + (0 * lda) + m);
a1 = _mm256_loadu_pd(a + (1 * lda) + m);
a2 = _mm256_loadu_pd(a + (2 * lda) + m);
a3 = _mm256_loadu_pd(a + (3 * lda) + m);
yr = _mm256_loadu_pd(y);
xr = _mm256_loadu_pd(x);
/*Setting first element of x & y vectors to zero
* to eliminate diagonal element of 1st column
* from computation
*/
xr = _mm256_blend_pd(xr, zero, 0x1);
yr = _mm256_blend_pd(yr, zero, 0x1);
a0 = _mm256_blend_pd(a0, zero, 0x1);
a1 = _mm256_blend_pd(a1, zero, 0x3);
a2 = _mm256_blend_pd(a2, zero, 0x7);
a3 = _mm256_blend_pd(a3, zero, 0xF);
a0 = _mm256_fmadd_pd(xr, y0, a0);
a0 = _mm256_fmadd_pd(yr, x0, a0);
/*Setting two elements of x & y vectors to zero
* to eliminate diagonal element of 2nd column
* from computation
*/
xr = _mm256_blend_pd(xr, zero, 0x3);
yr = _mm256_blend_pd(yr, zero, 0x3);
a1 = _mm256_fmadd_pd(xr, y1, a1);
a1 = _mm256_fmadd_pd(yr, x1, a1);
/*Setting three elements of x & y vectors to zero
* to eliminate diagonal element of 3rd column
* from computation
*/
xr = _mm256_blend_pd(xr, zero, 0x7);
yr = _mm256_blend_pd(yr, zero, 0x7);
a2 = _mm256_fmadd_pd(xr, y2, a2);
a2 = _mm256_fmadd_pd(yr, x2, a2);
_mm256_storeu_pd(a + (0 * lda) + m, a0 );
/* Loading data from memory location first
* so it could be blend with and finally
* gets stored at same location to prevent
* unnecessary data overwriting at nearby
* memory locations
*/
a3 = _mm256_loadu_pd(a + (1 * lda) + m );
a1 = _mm256_blend_pd(a1, a3, 0x1);
_mm256_storeu_pd(a + (1 * lda) + m, a1 );
a3 = _mm256_loadu_pd(a + (2 * lda) + m );
a2 = _mm256_blend_pd(a2, a3, 0x3);
_mm256_storeu_pd(a + (2 * lda) + m, a2 );
/* Triangular part of matrix is computed, remaining
* part is computed in below loop upto m rows.
*/
for(; (row + 4) <= m; row+=4)
{
/* Loading elements of x and y vector */
xr = _mm256_loadu_pd(x + row);
yr = _mm256_loadu_pd(y + row);
/* Loading tile of A matrix of size 4x4 */
a0 = _mm256_loadu_pd(a + row + (0 * lda) );
a1 = _mm256_loadu_pd(a + row + (1 * lda) );
a2 = _mm256_loadu_pd(a + row + (2 * lda) );
a3 = _mm256_loadu_pd(a + row + (3 * lda) );
a0 = _mm256_fmadd_pd(xr, y0, a0);
a1 = _mm256_fmadd_pd(xr, y1, a1);
a2 = _mm256_fmadd_pd(xr, y2, a2);
a3 = _mm256_fmadd_pd(xr, y3, a3);
a0 = _mm256_fmadd_pd(yr, x0, a0);
a1 = _mm256_fmadd_pd(yr, x1, a1);
a2 = _mm256_fmadd_pd(yr, x2, a2);
a3 = _mm256_fmadd_pd(yr, x3, a3);
_mm256_storeu_pd(a + row + (0 * lda), a0);
_mm256_storeu_pd(a + row + (1 * lda), a1);
_mm256_storeu_pd(a + row + (2 * lda), a2);
_mm256_storeu_pd(a + row + (3 * lda), a3);
}
/* Computes remainder cases where m is less than 4 */
if(rem)
{
PRAGMA_SIMD
for(dim_t i = 0; i < 4; i++)
{
for(dim_t j = row; j < m; j++)
{
a[ j + (i * lda)] += x[j] * (y[i] * (*alpha));
a[ j + (i * lda)] += y[j] * (x[i] * (*alpha));
}
}
}
/* Computing 4 diagonal elements of triangular part of matrix
* and storing result back at corresponding location in matrix A
*/
PRAGMA_SIMD
for(dim_t i = 0; i < 4; i++)
{
*(a + m + i + (i * lda)) = a_diag[i];
}
}
void bli_dher2_zen_int_4
(
double *a,
double *x,
double *y,
double *alpha,
dim_t m,
dim_t lda
)
{
dim_t row = 4;
dim_t rem = m % 4;
/*holds 4 diagonal elements of triangular part of 4x4 tile*/
double a_diag[4] = {0};
/*alpha_chi holds x*alpha and alpha_psi holds y*alpha*/
double alpha_chi[4] = {0};
double alpha_psi[4] = {0};
/*Extracts diagonal element and store into a_diag buffer*/
PRAGMA_SIMD
for(dim_t i = 0; i < 4; i++)
{
a_diag[i] = *(a + i + (i * lda));
}
__m256d x0, x1, x2, x3;
__m256d y0, y1, y2, y3;
__m256d xr, yr, zero, gamma;
__m256d a0, a1, a2, a3;
zero = _mm256_setzero_pd();
/*Loading elements of x & y vectors*/
x0 = _mm256_loadu_pd(x);
y0 = _mm256_loadu_pd(y);
/*Broadcasting alpha to compute alpha_psi and alpha_chi*/
x1 = _mm256_broadcast_sd(alpha);
x2 = _mm256_mul_pd(x0, x1);
y0 = _mm256_mul_pd(y0, x1);
/*Storing alpha_chi and alpha_psi for later usage in computation loop*/
_mm256_storeu_pd(alpha_chi, x2);
_mm256_storeu_pd(alpha_psi, y0);
x0 = _mm256_mul_pd(x0, y0);
gamma = _mm256_loadu_pd(a_diag);
gamma = _mm256_add_pd(gamma, x0);
gamma = _mm256_add_pd(gamma, x0);
_mm256_storeu_pd(a_diag, gamma);
/* Broadcasting 4 alpha_psis and alpha_chis which
* are to be used througout the computation of 4x4 tile
* upto m rows.
*/
x0 = _mm256_broadcast_sd(&alpha_chi[0]);
x1 = _mm256_broadcast_sd(&alpha_chi[1]);
x2 = _mm256_broadcast_sd(&alpha_chi[2]);
x3 = _mm256_broadcast_sd(&alpha_chi[3]);
y0 = _mm256_broadcast_sd(&alpha_psi[0]);
y1 = _mm256_broadcast_sd(&alpha_psi[1]);
y2 = _mm256_broadcast_sd(&alpha_psi[2]);
y3 = _mm256_broadcast_sd(&alpha_psi[3]);
/* Loading 4x4 tile of A matrix for
* triangular part computation
*/
a0 = _mm256_loadu_pd(a + (0 * lda) );
a1 = _mm256_loadu_pd(a + (1 * lda) );
a2 = _mm256_loadu_pd(a + (2 * lda) );
a3 = _mm256_loadu_pd(a + (3 * lda) );
yr = _mm256_loadu_pd(y);
xr = _mm256_loadu_pd(x);
/*Setting first element of x & y vectors to zero
* to eliminate diagonal element of 1st column
* from computation
*/
xr = _mm256_blend_pd(xr, zero, 0x1);
yr = _mm256_blend_pd(yr, zero, 0x1);
a0 = _mm256_blend_pd(a0, zero, 0x1);
a1 = _mm256_blend_pd(a1, zero, 0x3);
a2 = _mm256_blend_pd(a2, zero, 0x7);
a3 = _mm256_blend_pd(a3, zero, 0xF);
a0 = _mm256_fmadd_pd(xr, y0, a0);
a0 = _mm256_fmadd_pd(yr, x0, a0);
/*Setting two elements of x & y vectors to zero
* to eliminate diagonal element of 2nd column
* from computation
*/
xr = _mm256_blend_pd(xr, zero, 0x3);
yr = _mm256_blend_pd(yr, zero, 0x3);
a1 = _mm256_fmadd_pd(xr, y1, a1);
a1 = _mm256_fmadd_pd(yr, x1, a1);
/*Setting three elements of x & y vectors to zero
* to eliminate diagonal element of 3rd column
* from computation
*/
xr = _mm256_blend_pd(xr, zero, 0x7);
yr = _mm256_blend_pd(yr, zero, 0x7);
a2 = _mm256_fmadd_pd(xr, y2, a2);
a2 = _mm256_fmadd_pd(yr, x2, a2);
_mm256_storeu_pd(a + (0 * lda), a0 );
/* Loading data from memory location first
* so it could be blend with and finally
* gets stored at same location to prevent
* unnecessary data overwriting at nearby
* memory locations
*/
a3 = _mm256_loadu_pd(a + (1 * lda) );
a1 = _mm256_blend_pd(a1, a3, 0x1);
_mm256_storeu_pd(a + (1 * lda), a1 );
a3 = _mm256_loadu_pd(a + (2 * lda) );
a2 = _mm256_blend_pd(a2, a3, 0x3);
_mm256_storeu_pd(a + (2 * lda), a2 );
/* Triangular part of matrix is computed, remaining
* part is computed in below loop upto m rows.
*/
for(; (row + 4) <= m; row+=4)
{
/* Loading elements of x and y vector */
xr = _mm256_loadu_pd(x + row);
yr = _mm256_loadu_pd(y + row);
/* Loading tile of A matrix of size 4x4 */
a0 = _mm256_loadu_pd(a + row + (0 * lda) );
a1 = _mm256_loadu_pd(a + row + (1 * lda) );
a2 = _mm256_loadu_pd(a + row + (2 * lda) );
a3 = _mm256_loadu_pd(a + row + (3 * lda) );
a0 = _mm256_fmadd_pd(xr, y0, a0);
a1 = _mm256_fmadd_pd(xr, y1, a1);
a2 = _mm256_fmadd_pd(xr, y2, a2);
a3 = _mm256_fmadd_pd(xr, y3, a3);
a0 = _mm256_fmadd_pd(yr, x0, a0);
a1 = _mm256_fmadd_pd(yr, x1, a1);
a2 = _mm256_fmadd_pd(yr, x2, a2);
a3 = _mm256_fmadd_pd(yr, x3, a3);
_mm256_storeu_pd(a + row + (0 * lda), a0);
_mm256_storeu_pd(a + row + (1 * lda), a1);
_mm256_storeu_pd(a + row + (2 * lda), a2);
_mm256_storeu_pd(a + row + (3 * lda), a3);
}
/* Computes remainder cases where m is less than 4 */
if(rem)
{
PRAGMA_SIMD
for(dim_t i = 0; i < 4; i++)
{
for(dim_t j = row; j < m; j++)
{
a[ j + (i * lda)] += x[j] * (y[i] * (*alpha));
a[ j + (i * lda)] += y[j] * (x[i] * (*alpha));
}
}
}
/* Computing 4 diagonal elements of triangular part of matrix
* and storing result back at corresponding location in matrix A
*/
PRAGMA_SIMD
for(dim_t i = 0; i < 4; i++)
{
*(a + i + (i * lda)) = a_diag[i];
}
}

View File

@@ -32,16 +32,6 @@
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// hemv helper function
void bli_pre_hemv_8x8(double *a, double *x,
double *y, double *alpha,
dim_t cs_a, dim_t rs_a);
void bli_post_hemv_8x8(double *a, double *x,
double *y, double *alpha,
dim_t cs_a, dim_t rs_a);
// -- level-1m --
PACKM_KER_PROT(double, d, packm_8xk_gen_zen)
PACKM_KER_PROT(double, d, packm_6xk_gen_zen)