mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Optimized dher2 implementation
- Implemented her2 framework calls for transposed and non-transposed kernel variants. - The dher2 kernel operates over 4 columns at a time. It computes the 4x4 triangular part of the matrix first, and the remainder is computed in 4x4 tiles up to m rows. - Remainder cases (m < 4) are handled serially. AMD-Internal: [CPUPL-1968] Change-Id: I12ae97b2ad673a7fd9b733c607f27b1089142313
This commit is contained in:
@@ -218,9 +218,15 @@ void PASTEMAC(ch,varname) \
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
|
||||
void post_hemv_8x8(double *a, double *x,
|
||||
double *y, double *alpha,
|
||||
dim_t cs_a, dim_t rs_a);
|
||||
void bli_post_hemv_8x8
|
||||
(
|
||||
double *a,
|
||||
double *x,
|
||||
double *y,
|
||||
double *alpha,
|
||||
dim_t cs_a,
|
||||
dim_t rs_a
|
||||
);
|
||||
|
||||
void bli_dhemv_unf_var1
|
||||
(
|
||||
|
||||
@@ -217,6 +217,17 @@ void PASTEMAC(ch,varname) \
|
||||
}
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
|
||||
void bli_pre_hemv_8x8
|
||||
(
|
||||
double *a,
|
||||
double *x,
|
||||
double *y,
|
||||
double *alpha,
|
||||
dim_t cs_a,
|
||||
dim_t rs_a
|
||||
);
|
||||
|
||||
void bli_dhemv_unf_var3
|
||||
(
|
||||
uplo_t uplo,
|
||||
|
||||
@@ -158,5 +158,217 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( her2_unf_var1 )
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
|
||||
/**
|
||||
* Following is function declaration
|
||||
* that computes her2 for transposed case.
|
||||
* It handles triangular part of matrix and
|
||||
* remaining computation in optimal way to
|
||||
* gain performance improvement.
|
||||
* a is triangular matrix, x and y are vectors
|
||||
*/
|
||||
void bli_dher2_trans_zen_int_4
|
||||
(
|
||||
double *a,
|
||||
double *x,
|
||||
double *y,
|
||||
double *alpha,
|
||||
dim_t m,
|
||||
dim_t lda
|
||||
);
|
||||
|
||||
/*
 * bli_dher2_unf_var1
 *
 * Double-precision her2/syr2, unblocked variant 1 (row-oriented):
 * updates the uplo-selected triangle of C with the rank-2 contribution
 *
 *     C := C + alpha * x * y^T + conj(alpha) * y * x^T
 *
 * one row (c10t, gamma11) at a time.  conjh distinguishes her2 from
 * syr2; for real double-precision data every conjugation is a no-op,
 * but the framework bookkeeping (conj0/conj1, alpha0/alpha1) is kept
 * so this variant mirrors the generic templated implementation.
 *
 * Fast path: when x, y, and C's rows are unit-stride (incx == incy ==
 * rs_ct == 1) and at least 3 columns lie behind the diagonal, rows are
 * handed in groups of four to the AVX2 kernel
 * bli_dher2_trans_zen_int_4.  Otherwise each row is updated with the
 * context-supplied axpy2v fused kernel.
 */
void bli_dher2_unf_var1
     (
       uplo_t  uplo,               /* which triangle of c is stored/updated */
       conj_t  conjx,              /* conjugation on x (no-op for real data) */
       conj_t  conjy,              /* conjugation on y (no-op for real data) */
       conj_t  conjh,              /* her2 vs. syr2 selector */
       dim_t   m,                  /* order of the matrix c */
       double* alpha,
       double* x, inc_t incx,
       double* y, inc_t incy,
       double* c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx                /* context used to query the axpy2v kernel */
     )
{
    const num_t dt = PASTEMAC(d,type);

    double*  x0;                  /* x[0..i-1], the part behind the diagonal   */
    double*  chi1;                /* x[i]                                      */
    double*  y0;                  /* y[0..i-1]                                 */
    double*  psi1;                /* y[i]                                      */
    double*  c10t;                /* row i of c, columns 0..i-1                */
    double*  gamma11;             /* diagonal element c(i,i)                   */
    double   alpha0;
    double   alpha1;
    double   alpha0_chi1;
    double   alpha1_psi1;
    double   alpha0_chi1_psi1;
    double   conjx0_chi1;
    double   conjy1_psi1;
    double   conjy0_psi1;
    dim_t    i;
    dim_t    n_behind;            /* number of columns behind the diagonal     */
    inc_t    rs_ct, cs_ct;
    conj_t   conj0, conj1;

    /* The algorithm will be expressed in terms of the lower triangular
     * case; the upper triangular case is supported by swapping the row
     * and column strides of A and toggling some conj parameters.
     */
    if ( bli_is_lower( uplo ) )
    {
        rs_ct = rs_c;
        cs_ct = cs_c;

        PASTEMAC(d,copys)( *alpha, alpha0 );
        PASTEMAC(d,copycjs)( conjh, *alpha, alpha1 );
    }
    else /* if ( bli_is_upper( uplo ) ) */
    {
        rs_ct = cs_c;
        cs_ct = rs_c;

        /* Toggle conjugation of conjx/conjy, but only if we are being
         * invoked as her2; for syr2, conjx/conjy are unchanged.
         */
        conjx = bli_apply_conj( conjh, conjx );
        conjy = bli_apply_conj( conjh, conjy );

        PASTEMAC(d,copycjs)( conjh, *alpha, alpha0 );
        PASTEMAC(d,copys)( *alpha, alpha1 );
    }

    /* Apply conjh (which carries the conjugation component of the
     * Hermitian transpose, if applicable) to conjx and/or conjy as
     * needed to arrive at the effective conjugation for the vector
     * subproblems.
     */
    conj0 = bli_apply_conj( conjh, conjy );
    conj1 = bli_apply_conj( conjh, conjx );

    PASTECH(d,axpy2v_ker_ft) kfp_2v;

    /* Query the context for the kernel function pointer. */
    kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );

    if( (incx == 1) && (incy == 1) && (rs_ct == 1))
    {
        /* Unit-stride fast path: rows are consumed 4 at a time by the
         * AVX2 kernel once enough columns lie behind the diagonal, and
         * one at a time (scalar path below) before that point.
         */
        for ( i = 0; i < m; )
        {
            n_behind = i;
            x0       = x + (0  )*incx;
            chi1     = x + (i  )*incx;
            y0       = y + (0  )*incy;
            psi1     = y + (i  )*incy;
            c10t     = c + (i  )*rs_ct + (0  )*cs_ct;
            gamma11  = c + (i  )*rs_ct + (i  )*cs_ct;

            if((n_behind >= 3))
            {
                /* Hand this block of rows (starting at row i) to the
                 * transposed AVX2 kernel, which updates the 4x4
                 * triangular tile and the region behind it.
                 *
                 * NOTE(review): i is advanced by 4 whenever n_behind >= 3,
                 * with no explicit check that i + 4 <= m; presumably the
                 * kernel's internal bounds (n_behind + 1, remainder loop)
                 * guarantee no out-of-range access for arbitrary m --
                 * confirm against the kernel's index conventions.
                 */
                bli_dher2_trans_zen_int_4(c10t, x0, y0, &alpha0, n_behind + 1, cs_ct);
                i+=4;
            }
            else
            {
                /* Apply conjx and/or conjy to chi1 and/or psi1. */
                PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
                PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
                PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );

                /* Compute scalars for vector subproblems. */
                PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
                PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );

                /* Compute alpha * chi1 * conj(psi1) after both chi1
                 * and psi1 have already been conjugated, if needed,
                 * by conjx and conjy.
                 */
                PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
                                    alpha0_chi1_psi1 );

                /* c10t = c10t + alpha * chi1 * y0'; */
                /* c10t = c10t + conj(alpha) * psi1 * x0'; */
                kfp_2v
                (
                  conj0,
                  conj1,
                  n_behind,
                  &alpha0_chi1,
                  &alpha1_psi1,
                  y0, incy,
                  x0, incx,
                  c10t, cs_ct,
                  cntx
                );

                /* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
                 *                   + conj(alpha) * psi1 * conj(chi1);
                 * (for real data both terms are equal, hence two adds) */
                PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
                PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );

                i+=1;
            }
        }
    }
    else
    {
        /* General-stride path: identical math, one row per iteration. */
        for ( i = 0; i < m; ++i )
        {
            n_behind = i;
            x0       = x + (0  )*incx;
            chi1     = x + (i  )*incx;
            y0       = y + (0  )*incy;
            psi1     = y + (i  )*incy;
            c10t     = c + (i  )*rs_ct + (0  )*cs_ct;
            gamma11  = c + (i  )*rs_ct + (i  )*cs_ct;

            /* Apply conjx and/or conjy to chi1 and/or psi1. */
            PASTEMAC(d,copycjs)( conjx, *chi1, conjx0_chi1 );
            PASTEMAC(d,copycjs)( conjy, *psi1, conjy1_psi1 );
            PASTEMAC(d,copycjs)( conj0, *psi1, conjy0_psi1 );

            /* Compute scalars for vector subproblems. */
            PASTEMAC(d,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 );
            PASTEMAC(d,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 );

            /* Compute alpha * chi1 * conj(psi1) after both chi1
             * and psi1 have already been conjugated, if needed,
             * by conjx and conjy.
             */
            PASTEMAC(d,scal2s)( alpha0_chi1, conjy0_psi1,
                                alpha0_chi1_psi1 );

            /* c10t = c10t + alpha * chi1 * y0'; */
            /* c10t = c10t + conj(alpha) * psi1 * x0'; */
            kfp_2v
            (
              conj0,
              conj1,
              n_behind,
              &alpha0_chi1,
              &alpha1_psi1,
              y0, incy,
              x0, incx,
              c10t, cs_ct,
              cntx
            );

            /* gamma11 = gamma11 + alpha * chi1 * conj(psi1)
             *                   + conj(alpha) * psi1 * conj(chi1); */
            PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
            PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );

        }
    }
}
|
||||
|
||||
GENTFUNC(float, s, her2_unf_var1)
|
||||
GENTFUNC(scomplex, c, her2_unf_var1)
|
||||
GENTFUNC(dcomplex, z,her2_unf_var1)
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( her2_unf_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -166,5 +166,192 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( her2_unf_var4 )
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
/**
|
||||
* Following is function declaration
|
||||
* that computes her2 for transposed case.
|
||||
* It handles triangular part of matrix and
|
||||
* remaining computation in optimal way to
|
||||
* gain performance improvement.
|
||||
* a is triangular matrix, x and y are vectors
|
||||
*/
|
||||
void bli_dher2_zen_int_4
|
||||
(
|
||||
double *a,
|
||||
double *x,
|
||||
double *y,
|
||||
double *alpha,
|
||||
dim_t m,
|
||||
dim_t lda
|
||||
);
|
||||
|
||||
/*
 * bli_dher2_unf_var4
 *
 * Double-precision her2/syr2, unblocked variant 4 (column-oriented):
 * updates the uplo-selected triangle of C with
 *
 *     C := C + alpha * x * y^T + conj(alpha) * y * x^T
 *
 * one column (gamma11, c21) at a time.  For real double-precision data
 * all conjugations are no-ops, so only alpha0 is carried around; the
 * conjx/conjy/conjh parameters are kept to match the framework
 * signature.
 *
 * Fast path: when x, y, and C's columns are unit-stride (incx ==
 * incy == rs_ct == 1) and at least 3 rows lie ahead of the diagonal,
 * columns are consumed four at a time by the AVX2 kernel
 * bli_dher2_zen_int_4.  Otherwise each column is updated with the
 * context-supplied axpy2v fused kernel.
 */
void bli_dher2_unf_var4
     (
       uplo_t  uplo,               /* which triangle of c is stored/updated */
       conj_t  conjx,              /* conjugation on x (no-op for real data) */
       conj_t  conjy,              /* conjugation on y (no-op for real data) */
       conj_t  conjh,              /* her2 vs. syr2 selector */
       dim_t   m,                  /* order of the matrix c */
       double* alpha,
       double* x, inc_t incx,
       double* y, inc_t incy,
       double* c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx                /* context used to query the axpy2v kernel */
     )
{

    double*  chi1;                /* x[i]                                    */
    double*  x2;                  /* x[i+1..m-1], the part ahead of i        */
    double*  psi1;                /* y[i]                                    */
    double*  y2;                  /* y[i+1..m-1]                             */
    double*  gamma11;             /* diagonal element c(i,i)                 */
    double*  c21;                 /* column i of c, rows i+1..m-1            */
    double   alpha0;
    double   alpha0_psi1;
    double   alpha1_chi1;
    double   alpha0_chi1_psi1;
    dim_t    i;
    dim_t    n_ahead;             /* number of rows ahead of the diagonal    */
    inc_t    rs_ct, cs_ct;

    const num_t dt = PASTEMAC(d,type);

    /* The algorithm will be expressed in terms of the lower triangular
     * case; the upper triangular case is supported by swapping the row
     * and column strides of A and toggling some conj parameters.
     */
    if ( bli_is_lower( uplo ) )
    {
        rs_ct = rs_c;
        cs_ct = cs_c;

        PASTEMAC(d,copys)( *alpha, alpha0 );
    }
    else /* if ( bli_is_upper( uplo ) ) */
    {
        rs_ct = cs_c;
        cs_ct = rs_c;

        /* Toggle conjugation of conjx/conjy, but only if we are being
         * invoked as her2; for syr2, conjx/conjy are unchanged.
         * (No-op here: for real data conjugation has no effect.)
         */

        PASTEMAC(d,copys)( *alpha, alpha0 );
    }
    /* Apply conjh (which carries the conjugation component of the
     * Hermitian transpose, if applicable) to conjx and/or conjy as
     * needed to arrive at the effective conjugation for the vector
     * subproblems.  (Again a no-op for real data.)
     */

    PASTECH(d,axpy2v_ker_ft) kfp_2v;

    /* Query the context for the kernel function pointer. */
    kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );

    if((incx == 1) && (incy == 1) && (rs_ct == 1))
    {
        /* Unit-stride fast path: columns are consumed 4 at a time by
         * the AVX2 kernel while at least 3 rows remain ahead of the
         * diagonal (n_ahead >= 3 implies i + 4 <= m), and one at a
         * time otherwise.
         */
        for ( i = 0; i < m; )
        {
            n_ahead = m - i - 1;
            chi1    = x + (i  ) * incx;
            x2      = x + (i+1) * incx;
            psi1    = y + (i  ) * incy;
            y2      = y + (i+1) * incy;
            gamma11 = c + (i  ) + (i  )*cs_ct;
            c21     = c + (i+1) + (i  )*cs_ct;

            if((n_ahead >= 3))
            {
                /* The AVX2 kernel updates the 4x4 triangular diagonal
                 * tile starting at gamma11 plus the rows below it, for
                 * four columns at once.
                 */
                bli_dher2_zen_int_4(gamma11, chi1, psi1, &alpha0, n_ahead + 1, cs_ct);
                i+= 4;
            }
            else
            {
                /* Compute scalars for vector subproblems. */
                PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
                PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );

                /* Compute alpha * chi1 * conj(psi1) after both chi1
                 * and psi1 have already been conjugated, if needed,
                 * by conjx and conjy.
                 */
                PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
                                    alpha0_chi1_psi1 );

                /* c21 = c21 + alpha * x2 * conj(psi1); */
                /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */

                kfp_2v
                (
                  conjx,
                  conjy,
                  n_ahead,
                  &alpha0_psi1,
                  &alpha1_chi1,
                  x2, incx,
                  y2, incy,
                  c21, rs_ct,
                  cntx
                );

                /* Diagonal update; both rank-2 terms are equal for
                 * real data, hence the two adds. */
                PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
                PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
                i+=1;
            }
        }
    }
    else
    {
        /* General-stride path: identical math, one column per
         * iteration.
         *
         * NOTE(review): gamma11/c21 are computed with an implicit unit
         * row stride (c + i + ...), exactly as in the fast path above,
         * even though rs_ct == 1 is not guaranteed on this branch --
         * confirm against the upstream source whether rs_ct should
         * appear in these address computations.
         */
        for ( i = 0; i < m; ++i)
        {
            n_ahead = m - i - 1;
            chi1    = x + (i  ) * incx;
            x2      = x + (i+1) * incx;
            psi1    = y + (i  ) * incy;
            y2      = y + (i+1) * incy;
            gamma11 = c + (i  ) + (i  )*cs_ct;
            c21     = c + (i+1) + (i  )*cs_ct;

            /* Compute scalars for vector subproblems. */
            PASTEMAC(d,scal2s)( alpha0, *psi1, alpha0_psi1 );
            PASTEMAC(d,scal2s)( alpha0, *chi1, alpha1_chi1 );

            /* Compute alpha * chi1 * conj(psi1) after both chi1
             * and psi1 have already been conjugated, if needed,
             * by conjx and conjy.
             */
            PASTEMAC(d,scal2s)( alpha0_psi1, *chi1,
                                alpha0_chi1_psi1 );

            /* c21 = c21 + alpha * x2 * conj(psi1); */
            /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */

            kfp_2v
            (
              conjx,
              conjy,
              n_ahead,
              &alpha0_psi1,
              &alpha1_chi1,
              x2, incx,
              y2, incy,
              c21, rs_ct,
              cntx
            );

            /* Diagonal update (two equal terms for real data). */
            PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
            PASTEMAC(d,adds)( alpha0_chi1_psi1, *gamma11 );
        }
    }
}
|
||||
|
||||
GENTFUNC(float, s, her2_unf_var4)
|
||||
GENTFUNC(scomplex, c, her2_unf_var4)
|
||||
GENTFUNC(dcomplex, z,her2_unf_var4)
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( her2_unf_var4 )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
target_sources("${PROJECT_NAME}"
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_zen_ref.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_zen_int_4.c
|
||||
)
|
||||
|
||||
|
||||
|
||||
396
kernels/zen/2/bli_her2_zen_int_4.c
Normal file
396
kernels/zen/2/bli_her2_zen_int_4.c
Normal file
@@ -0,0 +1,396 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "immintrin.h"
|
||||
#include "blis.h"
|
||||
|
||||
/*
 * bli_dher2_trans_zen_int_4
 *
 * AVX2 kernel used by bli_dher2_unf_var1's unit-stride fast path
 * (transposed traversal).  Applies the real rank-2 update
 *
 *     A := A + alpha * x * y^T + alpha * y * x^T
 *
 * to a 4-column strip of A (leading dimension lda, unit stride within
 * a column): first the 4x4 triangular tile whose diagonal starts at
 * row offset m, then full 4-wide row blocks for rows 0..m-1, with a
 * scalar cleanup loop for the (m % 4) leftover rows.
 *
 * The diagonal elements are extracted into a_diag up front, updated
 * separately, and written back last so the vector stores over the
 * tile never corrupt them.
 */
void bli_dher2_trans_zen_int_4
     (
       double *a,       /* strip of A; diagonal tile begins at row offset m */
       double *x,
       double *y,
       double *alpha,
       dim_t m,         /* number of full rows handled before the tile */
       dim_t lda        /* column stride of a */
     )
{
    dim_t row = 0;
    dim_t rem = m % 4;

    /* holds 4 diagonal elements of triangular part of 4x4 tile */
    double a_diag[4] = {0};
    /* alpha_chi holds x*alpha and alpha_psi holds y*alpha */
    double alpha_chi[4] = {0};
    double alpha_psi[4] = {0};

    /* Extract the diagonal elements into the a_diag buffer. */
    PRAGMA_SIMD
    for(dim_t i = 0; i < 4; i++)
    {
        a_diag[i] = *(a + m + i + (i * lda));
    }

    __m256d x0, x1, x2, x3;
    __m256d y0, y1, y2, y3;

    __m256d xr, yr, zero, gamma;
    __m256d a0, a1, a2, a3;

    zero = _mm256_setzero_pd();

    /* Load the 4 x/y elements that correspond to the diagonal tile. */
    x0 = _mm256_loadu_pd(x + m);
    y0 = _mm256_loadu_pd(y + m);
    /* Broadcast alpha to compute alpha_psi and alpha_chi. */
    x1 = _mm256_broadcast_sd(alpha);

    x2 = _mm256_mul_pd(x0, x1);
    y0 = _mm256_mul_pd(y0, x1);

    /* Store alpha_chi and alpha_psi for later use in the main loop. */
    _mm256_storeu_pd(alpha_chi, x2);
    _mm256_storeu_pd(alpha_psi, y0);

    /* Diagonal update: a_diag += 2 * alpha * x[m+i] * y[m+i]
     * (the two equal rank-2 terms are added separately). */
    x0 = _mm256_mul_pd(x0, y0);
    gamma = _mm256_loadu_pd(a_diag);
    gamma = _mm256_add_pd(gamma, x0);
    gamma = _mm256_add_pd(gamma, x0);
    _mm256_storeu_pd(a_diag, gamma);

    /* Broadcast the 4 alpha_psi and alpha_chi scalars which are used
     * throughout the computation of the 4-wide strip, up to m rows.
     */
    x0 = _mm256_broadcast_sd(&alpha_chi[0]);
    x1 = _mm256_broadcast_sd(&alpha_chi[1]);
    x2 = _mm256_broadcast_sd(&alpha_chi[2]);
    x3 = _mm256_broadcast_sd(&alpha_chi[3]);

    y0 = _mm256_broadcast_sd(&alpha_psi[0]);
    y1 = _mm256_broadcast_sd(&alpha_psi[1]);
    y2 = _mm256_broadcast_sd(&alpha_psi[2]);
    y3 = _mm256_broadcast_sd(&alpha_psi[3]);

    /* Load the 4x4 tile of A for the triangular-part computation. */
    a0 = _mm256_loadu_pd(a + (0 * lda) + m);
    a1 = _mm256_loadu_pd(a + (1 * lda) + m);
    a2 = _mm256_loadu_pd(a + (2 * lda) + m);
    a3 = _mm256_loadu_pd(a + (3 * lda) + m);

    /* NOTE(review): the tile and a_diag use offset m, but xr/yr below
     * are loaded from x and y at offset 0; presumably this matches the
     * transposed index convention established by the caller -- confirm
     * against the upstream kernel source.
     */
    yr = _mm256_loadu_pd(y);
    xr = _mm256_loadu_pd(x);

    /* Zero the first element of the x & y vectors to exclude the
     * diagonal element of the 1st column from the computation.
     */
    xr = _mm256_blend_pd(xr, zero, 0x1);
    yr = _mm256_blend_pd(yr, zero, 0x1);
    a0 = _mm256_blend_pd(a0, zero, 0x1);

    /* Zero the already-handled (diagonal and above) lanes of each
     * tile column; column 3 has only its diagonal, handled via a_diag. */
    a1 = _mm256_blend_pd(a1, zero, 0x3);
    a2 = _mm256_blend_pd(a2, zero, 0x7);
    a3 = _mm256_blend_pd(a3, zero, 0xF);

    a0 = _mm256_fmadd_pd(xr, y0, a0);
    a0 = _mm256_fmadd_pd(yr, x0, a0);

    /* Zero two elements of the x & y vectors to exclude the diagonal
     * element of the 2nd column from the computation.
     */
    xr = _mm256_blend_pd(xr, zero, 0x3);
    yr = _mm256_blend_pd(yr, zero, 0x3);
    a1 = _mm256_fmadd_pd(xr, y1, a1);
    a1 = _mm256_fmadd_pd(yr, x1, a1);

    /* Zero three elements of the x & y vectors to exclude the diagonal
     * element of the 3rd column from the computation.
     */
    xr = _mm256_blend_pd(xr, zero, 0x7);
    yr = _mm256_blend_pd(yr, zero, 0x7);
    a2 = _mm256_fmadd_pd(xr, y2, a2);
    a2 = _mm256_fmadd_pd(yr, x2, a2);

    _mm256_storeu_pd(a + (0 * lda) + m, a0 );

    /* Re-load each destination first and blend the untouched leading
     * lanes back in, so storing the 4-wide vector does not overwrite
     * neighboring elements that this kernel must not modify.
     */
    a3 = _mm256_loadu_pd(a + (1 * lda) + m );
    a1 = _mm256_blend_pd(a1, a3, 0x1);
    _mm256_storeu_pd(a + (1 * lda) + m, a1 );

    a3 = _mm256_loadu_pd(a + (2 * lda) + m );
    a2 = _mm256_blend_pd(a2, a3, 0x3);
    _mm256_storeu_pd(a + (2 * lda) + m, a2 );

    /* Triangular part of the tile is done; the remaining rows are
     * computed 4 at a time in the loop below, up to m rows.
     */
    for(; (row + 4) <= m; row+=4)
    {
        /* Load elements of the x and y vectors. */
        xr = _mm256_loadu_pd(x + row);
        yr = _mm256_loadu_pd(y + row);
        /* Load a 4x4 tile of the A strip. */
        a0 = _mm256_loadu_pd(a + row + (0 * lda) );
        a1 = _mm256_loadu_pd(a + row + (1 * lda) );
        a2 = _mm256_loadu_pd(a + row + (2 * lda) );
        a3 = _mm256_loadu_pd(a + row + (3 * lda) );

        /* a(:,j) += x * (alpha*psi[j]) ... */
        a0 = _mm256_fmadd_pd(xr, y0, a0);
        a1 = _mm256_fmadd_pd(xr, y1, a1);
        a2 = _mm256_fmadd_pd(xr, y2, a2);
        a3 = _mm256_fmadd_pd(xr, y3, a3);

        /* ... and += y * (alpha*chi[j]). */
        a0 = _mm256_fmadd_pd(yr, x0, a0);
        a1 = _mm256_fmadd_pd(yr, x1, a1);
        a2 = _mm256_fmadd_pd(yr, x2, a2);
        a3 = _mm256_fmadd_pd(yr, x3, a3);

        _mm256_storeu_pd(a + row + (0 * lda), a0);
        _mm256_storeu_pd(a + row + (1 * lda), a1);
        _mm256_storeu_pd(a + row + (2 * lda), a2);
        _mm256_storeu_pd(a + row + (3 * lda), a3);
    }

    /* Scalar cleanup for the leftover rows (m % 4 != 0).
     * NOTE(review): x[i]/y[i] here index offset 0 while the tile's
     * scalars came from x+m/y+m -- confirm this matches the intended
     * transposed layout. */
    if(rem)
    {
        PRAGMA_SIMD
        for(dim_t i = 0; i < 4; i++)
        {
            for(dim_t j = row; j < m; j++)
            {
                a[ j + (i * lda)] += x[j] * (y[i] * (*alpha));
                a[ j + (i * lda)] += y[j] * (x[i] * (*alpha));
            }
        }
    }

    /* Write the 4 updated diagonal elements back to their locations
     * in A (after all vector stores, so they are not clobbered).
     */
    PRAGMA_SIMD
    for(dim_t i = 0; i < 4; i++)
    {
        *(a + m + i + (i * lda)) = a_diag[i];
    }
}
|
||||
|
||||
|
||||
/*
 * bli_dher2_zen_int_4
 *
 * AVX2 kernel used by bli_dher2_unf_var4's unit-stride fast path.
 * Applies the real rank-2 update
 *
 *     A := A + alpha * x * y^T + alpha * y * x^T
 *
 * to a 4-column strip of A (leading dimension lda, unit stride within
 * a column): first the 4x4 triangular tile at the top of the strip
 * (rows 0..3), then full 4-wide row blocks for rows 4..m-1, with a
 * scalar cleanup loop for the leftover rows when m % 4 != 0.
 *
 * The diagonal elements are extracted into a_diag up front, updated
 * separately, and written back last so the vector stores over the
 * tile never corrupt them.
 */
void bli_dher2_zen_int_4
     (
       double *a,       /* top of the 4-column strip; diagonal tile at rows 0..3 */
       double *x,
       double *y,
       double *alpha,
       dim_t m,         /* total rows in the strip (n_ahead + 1 at the caller) */
       dim_t lda        /* column stride of a */
     )
{
    /* Rows 0..3 are the triangular tile, handled below; the main loop
     * therefore starts at row 4. */
    dim_t row = 4;
    dim_t rem = m % 4;

    /* holds 4 diagonal elements of triangular part of 4x4 tile */
    double a_diag[4] = {0};
    /* alpha_chi holds x*alpha and alpha_psi holds y*alpha */
    double alpha_chi[4] = {0};
    double alpha_psi[4] = {0};

    /* Extract the diagonal elements into the a_diag buffer. */
    PRAGMA_SIMD
    for(dim_t i = 0; i < 4; i++)
    {
        a_diag[i] = *(a + i + (i * lda));
    }

    __m256d x0, x1, x2, x3;
    __m256d y0, y1, y2, y3;

    __m256d xr, yr, zero, gamma;
    __m256d a0, a1, a2, a3;

    zero = _mm256_setzero_pd();

    /* Load the first 4 elements of the x & y vectors. */
    x0 = _mm256_loadu_pd(x);
    y0 = _mm256_loadu_pd(y);
    /* Broadcast alpha to compute alpha_psi and alpha_chi. */
    x1 = _mm256_broadcast_sd(alpha);

    x2 = _mm256_mul_pd(x0, x1);
    y0 = _mm256_mul_pd(y0, x1);

    /* Store alpha_chi and alpha_psi for later use in the main loop. */
    _mm256_storeu_pd(alpha_chi, x2);
    _mm256_storeu_pd(alpha_psi, y0);

    /* Diagonal update: a_diag += 2 * alpha * x[i] * y[i]
     * (the two equal rank-2 terms are added separately). */
    x0 = _mm256_mul_pd(x0, y0);
    gamma = _mm256_loadu_pd(a_diag);
    gamma = _mm256_add_pd(gamma, x0);
    gamma = _mm256_add_pd(gamma, x0);
    _mm256_storeu_pd(a_diag, gamma);

    /* Broadcast the 4 alpha_psi and alpha_chi scalars which are used
     * throughout the computation of the 4-wide strip, up to m rows.
     */
    x0 = _mm256_broadcast_sd(&alpha_chi[0]);
    x1 = _mm256_broadcast_sd(&alpha_chi[1]);
    x2 = _mm256_broadcast_sd(&alpha_chi[2]);
    x3 = _mm256_broadcast_sd(&alpha_chi[3]);

    y0 = _mm256_broadcast_sd(&alpha_psi[0]);
    y1 = _mm256_broadcast_sd(&alpha_psi[1]);
    y2 = _mm256_broadcast_sd(&alpha_psi[2]);
    y3 = _mm256_broadcast_sd(&alpha_psi[3]);

    /* Load the 4x4 tile of A for the triangular-part computation. */
    a0 = _mm256_loadu_pd(a + (0 * lda) );
    a1 = _mm256_loadu_pd(a + (1 * lda) );
    a2 = _mm256_loadu_pd(a + (2 * lda) );
    a3 = _mm256_loadu_pd(a + (3 * lda) );

    yr = _mm256_loadu_pd(y);
    xr = _mm256_loadu_pd(x);

    /* Zero the first element of the x & y vectors to exclude the
     * diagonal element of the 1st column from the computation.
     */
    xr = _mm256_blend_pd(xr, zero, 0x1);
    yr = _mm256_blend_pd(yr, zero, 0x1);
    a0 = _mm256_blend_pd(a0, zero, 0x1);
    /* Zero the already-handled (diagonal and above) lanes of each
     * tile column; column 3 has only its diagonal, handled via a_diag. */
    a1 = _mm256_blend_pd(a1, zero, 0x3);
    a2 = _mm256_blend_pd(a2, zero, 0x7);
    a3 = _mm256_blend_pd(a3, zero, 0xF);

    a0 = _mm256_fmadd_pd(xr, y0, a0);
    a0 = _mm256_fmadd_pd(yr, x0, a0);

    /* Zero two elements of the x & y vectors to exclude the diagonal
     * element of the 2nd column from the computation.
     */
    xr = _mm256_blend_pd(xr, zero, 0x3);
    yr = _mm256_blend_pd(yr, zero, 0x3);
    a1 = _mm256_fmadd_pd(xr, y1, a1);
    a1 = _mm256_fmadd_pd(yr, x1, a1);

    /* Zero three elements of the x & y vectors to exclude the diagonal
     * element of the 3rd column from the computation.
     */
    xr = _mm256_blend_pd(xr, zero, 0x7);
    yr = _mm256_blend_pd(yr, zero, 0x7);
    a2 = _mm256_fmadd_pd(xr, y2, a2);
    a2 = _mm256_fmadd_pd(yr, x2, a2);

    _mm256_storeu_pd(a + (0 * lda), a0 );

    /* Re-load each destination first and blend the untouched leading
     * lanes back in, so storing the 4-wide vector does not overwrite
     * neighboring elements that this kernel must not modify.
     */
    a3 = _mm256_loadu_pd(a + (1 * lda) );
    a1 = _mm256_blend_pd(a1, a3, 0x1);
    _mm256_storeu_pd(a + (1 * lda), a1 );

    a3 = _mm256_loadu_pd(a + (2 * lda) );
    a2 = _mm256_blend_pd(a2, a3, 0x3);
    _mm256_storeu_pd(a + (2 * lda), a2 );

    /* Triangular part of the tile is done; the remaining rows are
     * computed 4 at a time in the loop below, up to m rows.
     */
    for(; (row + 4) <= m; row+=4)
    {
        /* Load elements of the x and y vectors. */
        xr = _mm256_loadu_pd(x + row);
        yr = _mm256_loadu_pd(y + row);
        /* Load a 4x4 tile of the A strip. */
        a0 = _mm256_loadu_pd(a + row + (0 * lda) );
        a1 = _mm256_loadu_pd(a + row + (1 * lda) );
        a2 = _mm256_loadu_pd(a + row + (2 * lda) );
        a3 = _mm256_loadu_pd(a + row + (3 * lda) );

        /* a(:,j) += x * (alpha*psi[j]) ... */
        a0 = _mm256_fmadd_pd(xr, y0, a0);
        a1 = _mm256_fmadd_pd(xr, y1, a1);
        a2 = _mm256_fmadd_pd(xr, y2, a2);
        a3 = _mm256_fmadd_pd(xr, y3, a3);

        /* ... and += y * (alpha*chi[j]). */
        a0 = _mm256_fmadd_pd(yr, x0, a0);
        a1 = _mm256_fmadd_pd(yr, x1, a1);
        a2 = _mm256_fmadd_pd(yr, x2, a2);
        a3 = _mm256_fmadd_pd(yr, x3, a3);

        _mm256_storeu_pd(a + row + (0 * lda), a0);
        _mm256_storeu_pd(a + row + (1 * lda), a1);
        _mm256_storeu_pd(a + row + (2 * lda), a2);
        _mm256_storeu_pd(a + row + (3 * lda), a3);
    }

    /* Scalar cleanup for the leftover rows (m % 4 != 0). */
    if(rem)
    {
        PRAGMA_SIMD
        for(dim_t i = 0; i < 4; i++)
        {
            for(dim_t j = row; j < m; j++)
            {
                a[ j + (i * lda)] += x[j] * (y[i] * (*alpha));
                a[ j + (i * lda)] += y[j] * (x[i] * (*alpha));
            }
        }
    }

    /* Write the 4 updated diagonal elements back to their locations
     * in A (after all vector stores, so they are not clobbered).
     */
    PRAGMA_SIMD
    for(dim_t i = 0; i < 4; i++)
    {
        *(a + i + (i * lda)) = a_diag[i];
    }
}
|
||||
@@ -32,16 +32,6 @@
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
// hemv helper function
|
||||
void bli_pre_hemv_8x8(double *a, double *x,
|
||||
double *y, double *alpha,
|
||||
dim_t cs_a, dim_t rs_a);
|
||||
|
||||
void bli_post_hemv_8x8(double *a, double *x,
|
||||
double *y, double *alpha,
|
||||
dim_t cs_a, dim_t rs_a);
|
||||
|
||||
|
||||
// -- level-1m --
|
||||
PACKM_KER_PROT(double, d, packm_8xk_gen_zen)
|
||||
PACKM_KER_PROT(double, d, packm_6xk_gen_zen)
|
||||
|
||||
Reference in New Issue
Block a user