Mirror of https://github.com/amd/blis.git, synced 2026-04-20 07:38:53 +00:00
Implemented her2 framework calls for the transposed and non-transposed kernel variants.
- The dher2 kernel operates over 4 columns at a time: it computes the 4x4 triangular part of the matrix first, and the remaining part is computed in 4x4 tiles up to m rows.
- Remainder cases (m < 4) are handled serially.
AMD-Internal: [CPUPL-1968]
Change-Id: I12ae97b2ad673a7fd9b733c607f27b1089142313
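For reference, both kernels in this file implement the real-domain her2 (equivalently syr2) panel update A := alpha*x*y^T + alpha*y*x^T + A over a 4-column panel of an m-row, column-major matrix with leading dimension lda. A minimal scalar sketch of that update (hypothetical helper, not part of this file; dim_t is the BLIS integer type from blis.h):

#include "blis.h"

/* Hypothetical scalar reference for the panel update performed by the
 * vectorized kernels in this file.
 */
void dher2_ref_panel(double *a, const double *x, const double *y,
                     double alpha, dim_t m, dim_t lda)
{
    for (dim_t i = 0; i < 4; i++)        /* 4 columns of the panel */
    {
        for (dim_t j = 0; j < m; j++)    /* m rows                 */
        {
            a[j + (i * lda)] += alpha * (x[j] * y[i] + y[j] * x[i]);
        }
    }
}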
397 lines
11 KiB
C
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "immintrin.h"
#include "blis.h"

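/* Transposed kernel variant: the 4x4 triangular tile of the panel sits at
 * rows m..m+3 (hence the "+ m" offsets below), and the rectangular part
 * spans rows 0..m-1 above it.
 */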
void bli_dher2_trans_zen_int_4
     (
       double *a,
       double *x,
       double *y,
       double *alpha,
       dim_t   m,
       dim_t   lda
     )
{
    dim_t row = 0;
    dim_t rem = m % 4;

    /* Holds the 4 diagonal elements of the triangular part of the 4x4 tile. */
    double a_diag[4] = {0};
    /* alpha_chi holds x * alpha and alpha_psi holds y * alpha. */
    double alpha_chi[4] = {0};
    double alpha_psi[4] = {0};

    /* Extract the diagonal elements and store them in the a_diag buffer. */
    PRAGMA_SIMD
    for (dim_t i = 0; i < 4; i++)
    {
        a_diag[i] = *(a + m + i + (i * lda));
    }

    __m256d x0, x1, x2, x3;
    __m256d y0, y1, y2, y3;

    __m256d xr, yr, zero, gamma;
    __m256d a0, a1, a2, a3;

    zero = _mm256_setzero_pd();

    /* Load 4 elements each of the x and y vectors. */
    x0 = _mm256_loadu_pd(x + m);
    y0 = _mm256_loadu_pd(y + m);
    /* Broadcast alpha to compute alpha_psi and alpha_chi. */
    x1 = _mm256_broadcast_sd(alpha);

    x2 = _mm256_mul_pd(x0, x1);
    y0 = _mm256_mul_pd(y0, x1);

    /* Store alpha_chi and alpha_psi for later use in the computation loop. */
    _mm256_storeu_pd(alpha_chi, x2);
    _mm256_storeu_pd(alpha_psi, y0);

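    /* Update the 4 diagonal elements held in a_diag:
     * gamma_i += 2 * alpha * chi_i * psi_i, since both rank-1 updates of
     * real her2 contribute the same alpha * chi_i * psi_i term on the
     * diagonal.
     */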
    x0 = _mm256_mul_pd(x0, y0);
    gamma = _mm256_loadu_pd(a_diag);
    gamma = _mm256_add_pd(gamma, x0);
    gamma = _mm256_add_pd(gamma, x0);
    _mm256_storeu_pd(a_diag, gamma);

    /* Broadcast the 4 alpha_psi and alpha_chi values, which are used
     * throughout the computation of the 4x4 tiles up to m rows.
     */
    x0 = _mm256_broadcast_sd(&alpha_chi[0]);
    x1 = _mm256_broadcast_sd(&alpha_chi[1]);
    x2 = _mm256_broadcast_sd(&alpha_chi[2]);
    x3 = _mm256_broadcast_sd(&alpha_chi[3]);

    y0 = _mm256_broadcast_sd(&alpha_psi[0]);
    y1 = _mm256_broadcast_sd(&alpha_psi[1]);
    y2 = _mm256_broadcast_sd(&alpha_psi[2]);
    y3 = _mm256_broadcast_sd(&alpha_psi[3]);

    /* Load the 4x4 tile of the A matrix for the triangular part
     * computation.
     */
    a0 = _mm256_loadu_pd(a + (0 * lda) + m);
    a1 = _mm256_loadu_pd(a + (1 * lda) + m);
    a2 = _mm256_loadu_pd(a + (2 * lda) + m);
    a3 = _mm256_loadu_pd(a + (3 * lda) + m);

    yr = _mm256_loadu_pd(y);
    xr = _mm256_loadu_pd(x);

    /* Zero the first element of the x and y vectors to exclude the
     * diagonal element of the 1st column from the computation.
     */
    xr = _mm256_blend_pd(xr, zero, 0x1);
    yr = _mm256_blend_pd(yr, zero, 0x1);
    a0 = _mm256_blend_pd(a0, zero, 0x1);

    a1 = _mm256_blend_pd(a1, zero, 0x3);
    a2 = _mm256_blend_pd(a2, zero, 0x7);
    a3 = _mm256_blend_pd(a3, zero, 0xF);

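    /* In _mm256_blend_pd the i-th mask bit selects lane i of the second
     * operand, so the masks 0x1/0x3/0x7/0xF above zero the first 1/2/3/4
     * lanes of each column, i.e. the diagonal element and everything above
     * it (the diagonal itself is updated separately through a_diag).
     */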
    a0 = _mm256_fmadd_pd(xr, y0, a0);
    a0 = _mm256_fmadd_pd(yr, x0, a0);

    /* Zero the first two elements of the x and y vectors to exclude the
     * diagonal element of the 2nd column from the computation.
     */
    xr = _mm256_blend_pd(xr, zero, 0x3);
    yr = _mm256_blend_pd(yr, zero, 0x3);
    a1 = _mm256_fmadd_pd(xr, y1, a1);
    a1 = _mm256_fmadd_pd(yr, x1, a1);

    /* Zero the first three elements of the x and y vectors to exclude the
     * diagonal element of the 3rd column from the computation.
     */
    xr = _mm256_blend_pd(xr, zero, 0x7);
    yr = _mm256_blend_pd(yr, zero, 0x7);
    a2 = _mm256_fmadd_pd(xr, y2, a2);
    a2 = _mm256_fmadd_pd(yr, x2, a2);

    _mm256_storeu_pd(a + (0 * lda) + m, a0);

    /* Load the data from memory first so that the computed result can be
     * blended with it and stored back to the same location; this prevents
     * overwriting data at neighboring memory locations.
     */
    a3 = _mm256_loadu_pd(a + (1 * lda) + m);
    a1 = _mm256_blend_pd(a1, a3, 0x1);
    _mm256_storeu_pd(a + (1 * lda) + m, a1);

    a3 = _mm256_loadu_pd(a + (2 * lda) + m);
    a2 = _mm256_blend_pd(a2, a3, 0x3);
    _mm256_storeu_pd(a + (2 * lda) + m, a2);

    /* The triangular part of the matrix has been computed; the remaining
     * part is computed in the loop below, 4 rows at a time, up to m rows.
     */
    for (; (row + 4) <= m; row += 4)
    {
        /* Load 4 elements each of the x and y vectors. */
        xr = _mm256_loadu_pd(x + row);
        yr = _mm256_loadu_pd(y + row);
        /* Load a 4x4 tile of the A matrix. */
        a0 = _mm256_loadu_pd(a + row + (0 * lda));
        a1 = _mm256_loadu_pd(a + row + (1 * lda));
        a2 = _mm256_loadu_pd(a + row + (2 * lda));
        a3 = _mm256_loadu_pd(a + row + (3 * lda));

        a0 = _mm256_fmadd_pd(xr, y0, a0);
        a1 = _mm256_fmadd_pd(xr, y1, a1);
        a2 = _mm256_fmadd_pd(xr, y2, a2);
        a3 = _mm256_fmadd_pd(xr, y3, a3);

        a0 = _mm256_fmadd_pd(yr, x0, a0);
        a1 = _mm256_fmadd_pd(yr, x1, a1);
        a2 = _mm256_fmadd_pd(yr, x2, a2);
        a3 = _mm256_fmadd_pd(yr, x3, a3);

        _mm256_storeu_pd(a + row + (0 * lda), a0);
        _mm256_storeu_pd(a + row + (1 * lda), a1);
        _mm256_storeu_pd(a + row + (2 * lda), a2);
        _mm256_storeu_pd(a + row + (3 * lda), a3);
    }

    /* Compute the remaining rows serially when m is not a multiple of 4.
     * The per-column scalars live at offset m in this transposed variant.
     */
    if (rem)
    {
        PRAGMA_SIMD
        for (dim_t i = 0; i < 4; i++)
        {
            for (dim_t j = row; j < m; j++)
            {
                a[j + (i * lda)] += x[j] * (y[m + i] * (*alpha));
                a[j + (i * lda)] += y[j] * (x[m + i] * (*alpha));
            }
        }
    }

    /* Store the 4 computed diagonal elements of the triangular part back
     * to their corresponding locations in the A matrix.
     */
    PRAGMA_SIMD
    for (dim_t i = 0; i < 4; i++)
    {
        *(a + m + i + (i * lda)) = a_diag[i];
    }
}

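/* Non-transposed kernel variant: the 4x4 triangular tile sits at the top of
 * the panel (rows 0..3), so the rectangular update below it starts at row 4.
 */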
void bli_dher2_zen_int_4
     (
       double *a,
       double *x,
       double *y,
       double *alpha,
       dim_t   m,
       dim_t   lda
     )
{
    dim_t row = 4;
    dim_t rem = m % 4;

    /* Holds the 4 diagonal elements of the triangular part of the 4x4 tile. */
    double a_diag[4] = {0};
    /* alpha_chi holds x * alpha and alpha_psi holds y * alpha. */
    double alpha_chi[4] = {0};
    double alpha_psi[4] = {0};

    /* Extract the diagonal elements and store them in the a_diag buffer. */
    PRAGMA_SIMD
    for (dim_t i = 0; i < 4; i++)
    {
        a_diag[i] = *(a + i + (i * lda));
    }

    __m256d x0, x1, x2, x3;
    __m256d y0, y1, y2, y3;

    __m256d xr, yr, zero, gamma;
    __m256d a0, a1, a2, a3;

    zero = _mm256_setzero_pd();

    /* Load 4 elements each of the x and y vectors. */
    x0 = _mm256_loadu_pd(x);
    y0 = _mm256_loadu_pd(y);
    /* Broadcast alpha to compute alpha_psi and alpha_chi. */
    x1 = _mm256_broadcast_sd(alpha);

    x2 = _mm256_mul_pd(x0, x1);
    y0 = _mm256_mul_pd(y0, x1);

    /* Store alpha_chi and alpha_psi for later use in the computation loop. */
    _mm256_storeu_pd(alpha_chi, x2);
    _mm256_storeu_pd(alpha_psi, y0);

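    /* As in the transposed variant, accumulate both rank-1 contributions
     * to the tile's diagonal: gamma_i += 2 * alpha * chi_i * psi_i.
     */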
    x0 = _mm256_mul_pd(x0, y0);
    gamma = _mm256_loadu_pd(a_diag);
    gamma = _mm256_add_pd(gamma, x0);
    gamma = _mm256_add_pd(gamma, x0);
    _mm256_storeu_pd(a_diag, gamma);

    /* Broadcast the 4 alpha_psi and alpha_chi values, which are used
     * throughout the computation of the 4x4 tiles up to m rows.
     */
    x0 = _mm256_broadcast_sd(&alpha_chi[0]);
    x1 = _mm256_broadcast_sd(&alpha_chi[1]);
    x2 = _mm256_broadcast_sd(&alpha_chi[2]);
    x3 = _mm256_broadcast_sd(&alpha_chi[3]);

    y0 = _mm256_broadcast_sd(&alpha_psi[0]);
    y1 = _mm256_broadcast_sd(&alpha_psi[1]);
    y2 = _mm256_broadcast_sd(&alpha_psi[2]);
    y3 = _mm256_broadcast_sd(&alpha_psi[3]);

    /* Load the 4x4 tile of the A matrix for the triangular part
     * computation.
     */
    a0 = _mm256_loadu_pd(a + (0 * lda));
    a1 = _mm256_loadu_pd(a + (1 * lda));
    a2 = _mm256_loadu_pd(a + (2 * lda));
    a3 = _mm256_loadu_pd(a + (3 * lda));

    yr = _mm256_loadu_pd(y);
    xr = _mm256_loadu_pd(x);

    /* Zero the first element of the x and y vectors to exclude the
     * diagonal element of the 1st column from the computation.
     */
    xr = _mm256_blend_pd(xr, zero, 0x1);
    yr = _mm256_blend_pd(yr, zero, 0x1);
    a0 = _mm256_blend_pd(a0, zero, 0x1);
    a1 = _mm256_blend_pd(a1, zero, 0x3);
    a2 = _mm256_blend_pd(a2, zero, 0x7);
    a3 = _mm256_blend_pd(a3, zero, 0xF);

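    /* Blend masks 0x1/0x3/0x7/0xF zero the first 1/2/3/4 lanes of each
     * column, masking off the diagonal and the elements above it.
     */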
    a0 = _mm256_fmadd_pd(xr, y0, a0);
    a0 = _mm256_fmadd_pd(yr, x0, a0);

    /* Zero the first two elements of the x and y vectors to exclude the
     * diagonal element of the 2nd column from the computation.
     */
    xr = _mm256_blend_pd(xr, zero, 0x3);
    yr = _mm256_blend_pd(yr, zero, 0x3);
    a1 = _mm256_fmadd_pd(xr, y1, a1);
    a1 = _mm256_fmadd_pd(yr, x1, a1);

    /* Zero the first three elements of the x and y vectors to exclude the
     * diagonal element of the 3rd column from the computation.
     */
    xr = _mm256_blend_pd(xr, zero, 0x7);
    yr = _mm256_blend_pd(yr, zero, 0x7);
    a2 = _mm256_fmadd_pd(xr, y2, a2);
    a2 = _mm256_fmadd_pd(yr, x2, a2);

    _mm256_storeu_pd(a + (0 * lda), a0);

    /* Load the data from memory first so that the computed result can be
     * blended with it and stored back to the same location; this prevents
     * overwriting data at neighboring memory locations.
     */
    a3 = _mm256_loadu_pd(a + (1 * lda));
    a1 = _mm256_blend_pd(a1, a3, 0x1);
    _mm256_storeu_pd(a + (1 * lda), a1);

    a3 = _mm256_loadu_pd(a + (2 * lda));
    a2 = _mm256_blend_pd(a2, a3, 0x3);
    _mm256_storeu_pd(a + (2 * lda), a2);

    /* The triangular part of the matrix has been computed; the remaining
     * part is computed in the loop below, 4 rows at a time, up to m rows.
     */
    for (; (row + 4) <= m; row += 4)
    {
        /* Load 4 elements each of the x and y vectors. */
        xr = _mm256_loadu_pd(x + row);
        yr = _mm256_loadu_pd(y + row);
        /* Load a 4x4 tile of the A matrix. */
        a0 = _mm256_loadu_pd(a + row + (0 * lda));
        a1 = _mm256_loadu_pd(a + row + (1 * lda));
        a2 = _mm256_loadu_pd(a + row + (2 * lda));
        a3 = _mm256_loadu_pd(a + row + (3 * lda));

        a0 = _mm256_fmadd_pd(xr, y0, a0);
        a1 = _mm256_fmadd_pd(xr, y1, a1);
        a2 = _mm256_fmadd_pd(xr, y2, a2);
        a3 = _mm256_fmadd_pd(xr, y3, a3);

        a0 = _mm256_fmadd_pd(yr, x0, a0);
        a1 = _mm256_fmadd_pd(yr, x1, a1);
        a2 = _mm256_fmadd_pd(yr, x2, a2);
        a3 = _mm256_fmadd_pd(yr, x3, a3);

        _mm256_storeu_pd(a + row + (0 * lda), a0);
        _mm256_storeu_pd(a + row + (1 * lda), a1);
        _mm256_storeu_pd(a + row + (2 * lda), a2);
        _mm256_storeu_pd(a + row + (3 * lda), a3);
    }

    /* Compute the remaining rows serially when m is not a multiple of 4. */
    if (rem)
    {
        PRAGMA_SIMD
        for (dim_t i = 0; i < 4; i++)
        {
            for (dim_t j = row; j < m; j++)
            {
                a[j + (i * lda)] += x[j] * (y[i] * (*alpha));
                a[j + (i * lda)] += y[j] * (x[i] * (*alpha));
            }
        }
    }

    /* Store the 4 computed diagonal elements of the triangular part back
     * to their corresponding locations in the A matrix.
     */
    PRAGMA_SIMD
    for (dim_t i = 0; i < 4; i++)
    {
        *(a + i + (i * lda)) = a_diag[i];
    }
}