Optimized dher2 implementation

- Implemented her2 framework calls for the transposed and
  non-transposed kernel variants.

- The dher2 kernel operates over 4 columns at a time. It computes
  the 4x4 triangular part of the matrix first, and the remaining
  part is computed in 4x4 tiles up to m rows.

- Remainder cases (fewer than 4 rows left) are handled serially.

AMD-Internal: [CPUPL-1968]

Change-Id: I12ae97b2ad673a7fd9b733c607f27b1089142313
This commit is contained in:
Harsh Dave
2021-12-17 02:34:52 -06:00
parent 8b5b2707c1
commit 351269219f
7 changed files with 818 additions and 15 deletions

View File

@@ -218,9 +218,15 @@ void PASTEMAC(ch,varname) \
#ifdef BLIS_CONFIG_EPYC
void post_hemv_8x8(double *a, double *x,
double *y, double *alpha,
dim_t cs_a, dim_t rs_a);
void bli_post_hemv_8x8
(
double *a,
double *x,
double *y,
double *alpha,
dim_t cs_a,
dim_t rs_a
);
void bli_dhemv_unf_var1
(

View File

@@ -217,6 +217,17 @@ void PASTEMAC(ch,varname) \
}
#ifdef BLIS_CONFIG_EPYC
void bli_pre_hemv_8x8
(
double *a,
double *x,
double *y,
double *alpha,
dim_t cs_a,
dim_t rs_a
);
void bli_dhemv_unf_var3
(
uplo_t uplo,

View File

@@ -158,5 +158,217 @@ void PASTEMAC(ch,varname) \
} \
}
INSERT_GENTFUNC_BASIC0( her2_unf_var1 )
#ifdef BLIS_CONFIG_EPYC
/**
* Following is function declaration
* that computes her2 for transposed case.
* It handles triangular part of matrix and
* remaining computation in optimal way to
* gain performance improvement.
* a is triangular matrix, x and y are vectors
*/
void bli_dher2_trans_zen_int_4
(
double *a,
double *x,
double *y,
double *alpha,
dim_t m,
dim_t lda
);
void bli_dher2_unf_var1
(
uplo_t uplo,
conj_t conjx,
conj_t conjy,
conj_t conjh,
dim_t m,
double* alpha,
double* x, inc_t incx,
double* y, inc_t incy,
double* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
)
{
const num_t dt = PASTEMAC(d,type);
double* x0;
double* chi1;
double* y0;
double* psi1;
double* c10t;
double* gamma11;
double alpha0;
double alpha1;
double alpha0_chi1;
double alpha1_psi1;
double alpha0_chi1_psi1;
double conjx0_chi1;
double conjy1_psi1;
double conjy0_psi1;
dim_t i;
dim_t n_behind;
inc_t rs_ct, cs_ct;
conj_t conj0, conj1;
/* The algorithm will be expressed in terms of the lower triangular
* case;the upper triangular case is supported by swapping the row
* and column strides of A and toggling some conj parameters.
*/
if ( bli_is_lower( uplo ) )
{
rs_ct = rs_c;
cs_ct = cs_c;
PASTEMAC(d,copys)( *alpha, alpha0 );
PASTEMAC(d,copycjs)( conjh, *alpha, alpha1 );
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_ct = cs_c;
cs_ct = rs_c;
/* Toggle conjugation of conjx/conjy, but only if we are being
* invoked as her2; for syr2, conjx/conjy are unchanged.
*/
conjx = bli_apply_conj( conjh, conjx );
conjy = bli_apply_conj( conjh, conjy );
PASTEMAC(d,copycjs)( conjh, *alpha, alpha0 );
PASTEMAC(d,copys)( *alpha, alpha1 );
}
/* Apply conjh (which carries the conjugation component of the
* Hermitian transpose, if applicable) to conjx and/or conjy as
* needed to arrive at the effective conjugation for the vector
* subproblems.
*/
conj0 = bli_apply_conj( conjh, conjy );
conj1 = bli_apply_conj( conjh, conjx );
PASTECH(d,axpy2v_ker_ft) kfp_2v;
/* Query the context for the kernel function pointer. */
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
if( (incx == 1) && (incy == 1) && (rs_ct == 1))
{
for ( i = 0; i < m; )
{
n_behind = i;
x0 = x + (0 )*incx;
chi1 = x + (i )*incx;
y0 = y + (0 )*incy;
psi1 = y + (i )*incy;
c10t = c + (i )*rs_ct + (0 )*cs_ct;
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
if((n_behind >= 3))
{
bli_dher2_trans_zen_int_4(c10t, x0, y0, &alpha0, n_behind + 1, cs_ct);
i+=4;
}
else
{
/* Apply conjx and/or conjy to chi1 and/or psi1. */
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have already been conjugated, if needed,
* by conjx and conjy.
*/
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
alpha0_chi1_psi1 );
/* c10t = c10t + alpha * chi1 * y0'; */
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
kfp_2v
(
conj0,
conj1,
n_behind,
&alpha0_chi1,
&alpha1_psi1,
y0, incy,
x0, incx,
c10t, cs_ct,
cntx
);
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
+ conj(alpha) * psi1 * conj(chi1); */
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
i+=1;
}
}
}
else
{
for ( i = 0; i < m; ++i )
{
n_behind = i;
x0 = x + (0 )*incx;
chi1 = x + (i )*incx;
y0 = y + (0 )*incy;
psi1 = y + (i )*incy;
c10t = c + (i )*rs_ct + (0 )*cs_ct;
gamma11 = c + (i )*rs_ct + (i )*cs_ct;
/* Apply conjx and/or conjy to chi1 and/or psi1. */
PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have already been conjugated, if needed,
* by conjx and conjy.
*/
PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
alpha0_chi1_psi1 );
/* c10t = c10t + alpha * chi1 * y0'; */
/* c10t = c10t + conj(alpha) * psi1 * x0'; */
kfp_2v
(
conj0,
conj1,
n_behind,
&alpha0_chi1,
&alpha1_psi1,
y0, incy,
x0, incx,
c10t, cs_ct,
cntx
);
/* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
+ conj(alpha) * psi1 * conj(chi1); */
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
}
}
}
GENTFUNC(float, s, her2_unf_var1)
GENTFUNC(scomplex, c, her2_unf_var1)
GENTFUNC(dcomplex, z,her2_unf_var1)
#else
INSERT_GENTFUNC_BASIC0( her2_unf_var1 )
#endif

View File

@@ -166,5 +166,192 @@ void PASTEMAC(ch,varname) \
} \
}
INSERT_GENTFUNC_BASIC0( her2_unf_var4 )
#ifdef BLIS_CONFIG_EPYC
/**
* Following is function declaration
* that computes her2 for transposed case.
* It handles triangular part of matrix and
* remaining computation in optimal way to
* gain performance improvement.
* a is triangular matrix, x and y are vectors
*/
void bli_dher2_zen_int_4
(
double *a,
double *x,
double *y,
double *alpha,
dim_t m,
dim_t lda
);
void bli_dher2_unf_var4
(
uplo_t uplo,
conj_t conjx,
conj_t conjy,
conj_t conjh,
dim_t m,
double* alpha,
double* x, inc_t incx,
double* y, inc_t incy,
double* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
)
{
double* chi1;
double* x2;
double* psi1;
double* y2;
double* gamma11;
double* c21;
double alpha0;
double alpha0_psi1;
double alpha1_chi1;
double alpha0_chi1_psi1;
dim_t i;
dim_t n_ahead;
inc_t rs_ct, cs_ct;
const num_t dt = PASTEMAC(d,type);
/* The algorithm will be expressed in terms of the lower triangular
* case; the upper triangular case is supported by swapping the row
* and column strides of A and toggling some conj parameters.
*/
if ( bli_is_lower( uplo ) )
{
rs_ct = rs_c;
cs_ct = cs_c;
PASTEMAC(d,copys)( *alpha, alpha0 );
}
else /* if ( bli_is_upper( uplo ) ) */
{
rs_ct = cs_c;
cs_ct = rs_c;
/* Toggle conjugation of conjx/conjy, but only if we are being
* invoked as her2; for syr2, conjx/conjy are unchanged.
*/
PASTEMAC(d,copys)( *alpha, alpha0 );
}
/* Apply conjh (which carries the conjugation component of the
* Hermitian transpose, if applicable) to conjx and/or conjy as
* needed to arrive at the effective conjugation for the vector
* subproblems.
*/
PASTECH(d,axpy2v_ker_ft) kfp_2v;
/* Query the context for the kernel function pointer. */
kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
if((incx == 1) && (incy == 1) && (rs_ct == 1))
{
for ( i = 0; i < m; )
{
n_ahead = m - i - 1;
chi1 = x + (i ) * incx;
x2 = x + (i+1) * incx;
psi1 = y + (i ) * incy;
y2 = y + (i+1) * incy;
gamma11 = c + (i ) + (i )*cs_ct;
c21 = c + (i+1) + (i )*cs_ct;
if((n_ahead >= 3))
{
bli_dher2_zen_int_4(gamma11, chi1, psi1, &alpha0, n_ahead + 1, cs_ct);
i+= 4;
}
else
{
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have
already been conjugated, if needed, by conjx and
conjy. */
PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
alpha0_chi1_psi1 );
/* c21 = c21 + alpha * x2 * conj(psi1); */
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
kfp_2v
(
conjx,
conjy,
n_ahead,
&alpha0_psi1,
&alpha1_chi1,
x2, incx,
y2, incy,
c21, rs_ct,
cntx
);
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
i+=1;
}
}
}
else
{
for ( i = 0; i < m; ++i)
{
n_ahead = m - i - 1;
chi1 = x + (i ) * incx;
x2 = x + (i+1) * incx;
psi1 = y + (i ) * incy;
y2 = y + (i+1) * incy;
gamma11 = c + (i ) + (i )*cs_ct;
c21 = c + (i+1) + (i )*cs_ct;
/* Compute scalars for vector subproblems. */
PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );
/* Compute alpha * chi1 * conj(psi1) after both chi1
* and psi1 have
already been conjugated, if needed, by conjx and
conjy. */
PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
alpha0_chi1_psi1 );
/* c21 = c21 + alpha * x2 * conj(psi1); */
/* c21 = c21 + conj(alpha) * y2 * conj(chi1); */
kfp_2v
(
conjx,
conjy,
n_ahead,
&alpha0_psi1,
&alpha1_chi1,
x2, incx,
y2, incy,
c21, rs_ct,
cntx
);
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
}
}
}
GENTFUNC(float, s, her2_unf_var4)
GENTFUNC(scomplex, c, her2_unf_var4)
GENTFUNC(dcomplex, z,her2_unf_var4)
#else
INSERT_GENTFUNC_BASIC0( her2_unf_var4 )
#endif

View File

@@ -3,6 +3,7 @@
target_sources("${PROJECT_NAME}"
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_zen_ref.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_zen_int_4.c
)

View File

@@ -0,0 +1,396 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "immintrin.h"
#include "blis.h"
void bli_dher2_trans_zen_int_4
(
double *a,
double *x,
double *y,
double *alpha,
dim_t m,
dim_t lda
)
{
dim_t row = 0;
dim_t rem = m % 4;
/*holds 4 diagonal elements of triangular part of 4x4 tile*/
double a_diag[4] = {0};
/*alpha_chi holds x*alpha and alpha_psi holds y*alpha*/
double alpha_chi[4] = {0};
double alpha_psi[4] = {0};
/*Extracts diagonal element and store into a_diag buffer*/
PRAGMA_SIMD
for(dim_t i = 0; i < 4; i++)
{
a_diag[i] = *(a + m + i + (i * lda));
}
__m256d x0, x1, x2, x3;
__m256d y0, y1, y2, y3;
__m256d xr, yr, zero, gamma;
__m256d a0, a1, a2, a3;
zero = _mm256_setzero_pd();
/*Loading elements of x & y vectors*/
x0 = _mm256_loadu_pd(x + m);
y0 = _mm256_loadu_pd(y + m);
/*Broadcasting alpha to compute alpha_psi and alpha_chi*/
x1 = _mm256_broadcast_sd(alpha);
x2 = _mm256_mul_pd(x0, x1);
y0 = _mm256_mul_pd(y0, x1);
/*Storing alpha_chi and alpha_psi for later usage in computation loop*/
_mm256_storeu_pd(alpha_chi, x2);
_mm256_storeu_pd(alpha_psi, y0);
x0 = _mm256_mul_pd(x0, y0);
gamma = _mm256_loadu_pd(a_diag);
gamma = _mm256_add_pd(gamma, x0);
gamma = _mm256_add_pd(gamma, x0);
_mm256_storeu_pd(a_diag, gamma);
/* Broadcasting 4 alpha_psis and alpha_chis which
* are to be used througout the computation of 4x4 tile
* upto m rows.
*/
x0 = _mm256_broadcast_sd(&alpha_chi[0]);
x1 = _mm256_broadcast_sd(&alpha_chi[1]);
x2 = _mm256_broadcast_sd(&alpha_chi[2]);
x3 = _mm256_broadcast_sd(&alpha_chi[3]);
y0 = _mm256_broadcast_sd(&alpha_psi[0]);
y1 = _mm256_broadcast_sd(&alpha_psi[1]);
y2 = _mm256_broadcast_sd(&alpha_psi[2]);
y3 = _mm256_broadcast_sd(&alpha_psi[3]);
/* Loading 4x4 tile of A matrix for
* triangular part computation
*/
a0 = _mm256_loadu_pd(a + (0 * lda) + m);
a1 = _mm256_loadu_pd(a + (1 * lda) + m);
a2 = _mm256_loadu_pd(a + (2 * lda) + m);
a3 = _mm256_loadu_pd(a + (3 * lda) + m);
yr = _mm256_loadu_pd(y);
xr = _mm256_loadu_pd(x);
/*Setting first element of x & y vectors to zero
* to eliminate diagonal element of 1st column
* from computation
*/
xr = _mm256_blend_pd(xr, zero, 0x1);
yr = _mm256_blend_pd(yr, zero, 0x1);
a0 = _mm256_blend_pd(a0, zero, 0x1);
a1 = _mm256_blend_pd(a1, zero, 0x3);
a2 = _mm256_blend_pd(a2, zero, 0x7);
a3 = _mm256_blend_pd(a3, zero, 0xF);
a0 = _mm256_fmadd_pd(xr, y0, a0);
a0 = _mm256_fmadd_pd(yr, x0, a0);
/*Setting two elements of x & y vectors to zero
* to eliminate diagonal element of 2nd column
* from computation
*/
xr = _mm256_blend_pd(xr, zero, 0x3);
yr = _mm256_blend_pd(yr, zero, 0x3);
a1 = _mm256_fmadd_pd(xr, y1, a1);
a1 = _mm256_fmadd_pd(yr, x1, a1);
/*Setting three elements of x & y vectors to zero
* to eliminate diagonal element of 3rd column
* from computation
*/
xr = _mm256_blend_pd(xr, zero, 0x7);
yr = _mm256_blend_pd(yr, zero, 0x7);
a2 = _mm256_fmadd_pd(xr, y2, a2);
a2 = _mm256_fmadd_pd(yr, x2, a2);
_mm256_storeu_pd(a + (0 * lda) + m, a0 );
/* Loading data from memory location first
* so it could be blend with and finally
* gets stored at same location to prevent
* unnecessary data overwriting at nearby
* memory locations
*/
a3 = _mm256_loadu_pd(a + (1 * lda) + m );
a1 = _mm256_blend_pd(a1, a3, 0x1);
_mm256_storeu_pd(a + (1 * lda) + m, a1 );
a3 = _mm256_loadu_pd(a + (2 * lda) + m );
a2 = _mm256_blend_pd(a2, a3, 0x3);
_mm256_storeu_pd(a + (2 * lda) + m, a2 );
/* Triangular part of matrix is computed, remaining
* part is computed in below loop upto m rows.
*/
for(; (row + 4) <= m; row+=4)
{
/* Loading elements of x and y vector */
xr = _mm256_loadu_pd(x + row);
yr = _mm256_loadu_pd(y + row);
/* Loading tile of A matrix of size 4x4 */
a0 = _mm256_loadu_pd(a + row + (0 * lda) );
a1 = _mm256_loadu_pd(a + row + (1 * lda) );
a2 = _mm256_loadu_pd(a + row + (2 * lda) );
a3 = _mm256_loadu_pd(a + row + (3 * lda) );
a0 = _mm256_fmadd_pd(xr, y0, a0);
a1 = _mm256_fmadd_pd(xr, y1, a1);
a2 = _mm256_fmadd_pd(xr, y2, a2);
a3 = _mm256_fmadd_pd(xr, y3, a3);
a0 = _mm256_fmadd_pd(yr, x0, a0);
a1 = _mm256_fmadd_pd(yr, x1, a1);
a2 = _mm256_fmadd_pd(yr, x2, a2);
a3 = _mm256_fmadd_pd(yr, x3, a3);
_mm256_storeu_pd(a + row + (0 * lda), a0);
_mm256_storeu_pd(a + row + (1 * lda), a1);
_mm256_storeu_pd(a + row + (2 * lda), a2);
_mm256_storeu_pd(a + row + (3 * lda), a3);
}
/* Computes remainder cases where m is less than 4 */
if(rem)
{
PRAGMA_SIMD
for(dim_t i = 0; i < 4; i++)
{
for(dim_t j = row; j < m; j++)
{
a[ j + (i * lda)] += x[j] * (y[i] * (*alpha));
a[ j + (i * lda)] += y[j] * (x[i] * (*alpha));
}
}
}
/* Computing 4 diagonal elements of triangular part of matrix
* and storing result back at corresponding location in matrix A
*/
PRAGMA_SIMD
for(dim_t i = 0; i < 4; i++)
{
*(a + m + i + (i * lda)) = a_diag[i];
}
}
void bli_dher2_zen_int_4
(
double *a,
double *x,
double *y,
double *alpha,
dim_t m,
dim_t lda
)
{
dim_t row = 4;
dim_t rem = m % 4;
/*holds 4 diagonal elements of triangular part of 4x4 tile*/
double a_diag[4] = {0};
/*alpha_chi holds x*alpha and alpha_psi holds y*alpha*/
double alpha_chi[4] = {0};
double alpha_psi[4] = {0};
/*Extracts diagonal element and store into a_diag buffer*/
PRAGMA_SIMD
for(dim_t i = 0; i < 4; i++)
{
a_diag[i] = *(a + i + (i * lda));
}
__m256d x0, x1, x2, x3;
__m256d y0, y1, y2, y3;
__m256d xr, yr, zero, gamma;
__m256d a0, a1, a2, a3;
zero = _mm256_setzero_pd();
/*Loading elements of x & y vectors*/
x0 = _mm256_loadu_pd(x);
y0 = _mm256_loadu_pd(y);
/*Broadcasting alpha to compute alpha_psi and alpha_chi*/
x1 = _mm256_broadcast_sd(alpha);
x2 = _mm256_mul_pd(x0, x1);
y0 = _mm256_mul_pd(y0, x1);
/*Storing alpha_chi and alpha_psi for later usage in computation loop*/
_mm256_storeu_pd(alpha_chi, x2);
_mm256_storeu_pd(alpha_psi, y0);
x0 = _mm256_mul_pd(x0, y0);
gamma = _mm256_loadu_pd(a_diag);
gamma = _mm256_add_pd(gamma, x0);
gamma = _mm256_add_pd(gamma, x0);
_mm256_storeu_pd(a_diag, gamma);
/* Broadcasting 4 alpha_psis and alpha_chis which
* are to be used througout the computation of 4x4 tile
* upto m rows.
*/
x0 = _mm256_broadcast_sd(&alpha_chi[0]);
x1 = _mm256_broadcast_sd(&alpha_chi[1]);
x2 = _mm256_broadcast_sd(&alpha_chi[2]);
x3 = _mm256_broadcast_sd(&alpha_chi[3]);
y0 = _mm256_broadcast_sd(&alpha_psi[0]);
y1 = _mm256_broadcast_sd(&alpha_psi[1]);
y2 = _mm256_broadcast_sd(&alpha_psi[2]);
y3 = _mm256_broadcast_sd(&alpha_psi[3]);
/* Loading 4x4 tile of A matrix for
* triangular part computation
*/
a0 = _mm256_loadu_pd(a + (0 * lda) );
a1 = _mm256_loadu_pd(a + (1 * lda) );
a2 = _mm256_loadu_pd(a + (2 * lda) );
a3 = _mm256_loadu_pd(a + (3 * lda) );
yr = _mm256_loadu_pd(y);
xr = _mm256_loadu_pd(x);
/*Setting first element of x & y vectors to zero
* to eliminate diagonal element of 1st column
* from computation
*/
xr = _mm256_blend_pd(xr, zero, 0x1);
yr = _mm256_blend_pd(yr, zero, 0x1);
a0 = _mm256_blend_pd(a0, zero, 0x1);
a1 = _mm256_blend_pd(a1, zero, 0x3);
a2 = _mm256_blend_pd(a2, zero, 0x7);
a3 = _mm256_blend_pd(a3, zero, 0xF);
a0 = _mm256_fmadd_pd(xr, y0, a0);
a0 = _mm256_fmadd_pd(yr, x0, a0);
/*Setting two elements of x & y vectors to zero
* to eliminate diagonal element of 2nd column
* from computation
*/
xr = _mm256_blend_pd(xr, zero, 0x3);
yr = _mm256_blend_pd(yr, zero, 0x3);
a1 = _mm256_fmadd_pd(xr, y1, a1);
a1 = _mm256_fmadd_pd(yr, x1, a1);
/*Setting three elements of x & y vectors to zero
* to eliminate diagonal element of 3rd column
* from computation
*/
xr = _mm256_blend_pd(xr, zero, 0x7);
yr = _mm256_blend_pd(yr, zero, 0x7);
a2 = _mm256_fmadd_pd(xr, y2, a2);
a2 = _mm256_fmadd_pd(yr, x2, a2);
_mm256_storeu_pd(a + (0 * lda), a0 );
/* Loading data from memory location first
* so it could be blend with and finally
* gets stored at same location to prevent
* unnecessary data overwriting at nearby
* memory locations
*/
a3 = _mm256_loadu_pd(a + (1 * lda) );
a1 = _mm256_blend_pd(a1, a3, 0x1);
_mm256_storeu_pd(a + (1 * lda), a1 );
a3 = _mm256_loadu_pd(a + (2 * lda) );
a2 = _mm256_blend_pd(a2, a3, 0x3);
_mm256_storeu_pd(a + (2 * lda), a2 );
/* Triangular part of matrix is computed, remaining
* part is computed in below loop upto m rows.
*/
for(; (row + 4) <= m; row+=4)
{
/* Loading elements of x and y vector */
xr = _mm256_loadu_pd(x + row);
yr = _mm256_loadu_pd(y + row);
/* Loading tile of A matrix of size 4x4 */
a0 = _mm256_loadu_pd(a + row + (0 * lda) );
a1 = _mm256_loadu_pd(a + row + (1 * lda) );
a2 = _mm256_loadu_pd(a + row + (2 * lda) );
a3 = _mm256_loadu_pd(a + row + (3 * lda) );
a0 = _mm256_fmadd_pd(xr, y0, a0);
a1 = _mm256_fmadd_pd(xr, y1, a1);
a2 = _mm256_fmadd_pd(xr, y2, a2);
a3 = _mm256_fmadd_pd(xr, y3, a3);
a0 = _mm256_fmadd_pd(yr, x0, a0);
a1 = _mm256_fmadd_pd(yr, x1, a1);
a2 = _mm256_fmadd_pd(yr, x2, a2);
a3 = _mm256_fmadd_pd(yr, x3, a3);
_mm256_storeu_pd(a + row + (0 * lda), a0);
_mm256_storeu_pd(a + row + (1 * lda), a1);
_mm256_storeu_pd(a + row + (2 * lda), a2);
_mm256_storeu_pd(a + row + (3 * lda), a3);
}
/* Computes remainder cases where m is less than 4 */
if(rem)
{
PRAGMA_SIMD
for(dim_t i = 0; i < 4; i++)
{
for(dim_t j = row; j < m; j++)
{
a[ j + (i * lda)] += x[j] * (y[i] * (*alpha));
a[ j + (i * lda)] += y[j] * (x[i] * (*alpha));
}
}
}
/* Computing 4 diagonal elements of triangular part of matrix
* and storing result back at corresponding location in matrix A
*/
PRAGMA_SIMD
for(dim_t i = 0; i < 4; i++)
{
*(a + i + (i * lda)) = a_diag[i];
}
}

View File

@@ -32,16 +32,6 @@
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// hemv helper function
void bli_pre_hemv_8x8(double *a, double *x,
double *y, double *alpha,
dim_t cs_a, dim_t rs_a);
void bli_post_hemv_8x8(double *a, double *x,
double *y, double *alpha,
dim_t cs_a, dim_t rs_a);
// -- level-1m --
PACKM_KER_PROT(double, d, packm_8xk_gen_zen)
PACKM_KER_PROT(double, d, packm_6xk_gen_zen)