From e35d3c23f28784e50ee13d2e77a69d60e0c24c1f Mon Sep 17 00:00:00 2001 From: Kiran Varaganti Date: Thu, 10 Nov 2016 14:30:53 +0530 Subject: [PATCH] Added new optimized micro-kernel for dotxv routine Change-Id: I2c544e9b25a454d971ad690353502a55cd668391 --- config/zen/bli_kernel.h | 3 + kernels/x86_64/zen/1/bli_dotxv_opt_var1.c | 311 ++++++++++++++++++++++ 2 files changed, 314 insertions(+) create mode 100644 kernels/x86_64/zen/1/bli_dotxv_opt_var1.c diff --git a/config/zen/bli_kernel.h b/config/zen/bli_kernel.h index b3bb366ea..9f92fb231 100644 --- a/config/zen/bli_kernel.h +++ b/config/zen/bli_kernel.h @@ -224,7 +224,10 @@ // -- dotv -- #define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 #define BLIS_SDOTV_KERNEL bli_sdotv_opt_var1 + // -- dotxv -- +#define BLIS_SDOTXV_KERNEL bli_sdotxv_unb_var1 +#define BLIS_DDOTXV_KERNEL bli_ddotxv_unb_var1 // -- invertv -- diff --git a/kernels/x86_64/zen/1/bli_dotxv_opt_var1.c b/kernels/x86_64/zen/1/bli_dotxv_opt_var1.c new file mode 100644 index 000000000..7312f8eee --- /dev/null +++ b/kernels/x86_64/zen/1/bli_dotxv_opt_var1.c @@ -0,0 +1,311 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright(C) 2016, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include + +typedef union +{ + __m256 v; + float f[8]; +}v8ff_t; + +typedef union +{ + __m256d v; + double d[4]; +}v4df_t; + +void bli_sdotxv_unb_var1 ( + conj_t conjx, + conj_t conjy, + dim_t n, + float* restrict alpha, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy, + float* restrict beta, + float* restrict rho, + cntx_t* cntx + ) +{ + float* restrict chi1; + float* restrict psi1; + float dotxy; + dim_t i; + conj_t conjx_use; + + /* If beta is zero, clear rho. Otherwise, scale by beta. */ + if ( ( ((*beta)) == (0.0F) ) ) + { + ((*rho)) = (0.0F); + } + else + { + (( *rho )) = (( *beta )) * (( *rho )); + } + + if ( n == 0 ) return; + + dotxy = (0.0F); + + chi1 = x; + psi1 = y; + + dim_t n_run; + dim_t n_left; + + const dim_t n_elem_per_reg = 8; // 256 bit register can hold 8 floats + const dim_t n_iter_unroll = 4; // In a single loop 4 vectors of length 8 will be processed + + v8ff_t rho1v, rho2v, rho3v, rho4v; + v8ff_t x1v, y1v; + v8ff_t x2v, y2v; + v8ff_t x3v, y3v; + v8ff_t x4v, y4v; + + n_run = (n) / (n_elem_per_reg * n_iter_unroll); + n_left = (n) % (n_elem_per_reg * n_iter_unroll); + + rho1v.v = _mm256_setzero_ps(); + rho2v.v = _mm256_setzero_ps(); + rho3v.v = _mm256_setzero_ps(); + rho4v.v = _mm256_setzero_ps(); + + /* If y must be conjugated, we do so indirectly by first toggling the + effective conjugation of x and then conjugating the resulting dot + product. */ + conjx_use = conjx; + + if ( ( conjy == BLIS_CONJUGATE ) ) + { + conjx_use = ( conjx_use ^ ( 0x1 << 4 ) ); + } + + if (incx == 1 && incy == 1) + { + for (i = 0; i < n_run; ++i) + { + //(( dotxy )) += (( psi1[i] )) * (( chi1[i] )); + + // load the input + x1v.v = _mm256_loadu_ps((float*)chi1); + y1v.v = _mm256_loadu_ps((float*)psi1); + + x2v.v = _mm256_loadu_ps((float*)(chi1 + n_elem_per_reg)); + y2v.v = _mm256_loadu_ps((float*)(psi1 + n_elem_per_reg)); + + x3v.v = _mm256_loadu_ps((float*)(chi1 + 2 * n_elem_per_reg)); + y3v.v = _mm256_loadu_ps((float*)(psi1 + 2 * n_elem_per_reg)); + + x4v.v = _mm256_loadu_ps((float*)(chi1 + 3 * n_elem_per_reg)); + y4v.v = _mm256_loadu_ps((float*)(psi1 + 3 * n_elem_per_reg)); + + // Calculate the dot product + rho1v.v += y1v.v * x1v.v; + rho2v.v += y2v.v * x2v.v; + rho3v.v += y3v.v * x3v.v; + rho4v.v += y4v.v * x4v.v; + + chi1 += (n_elem_per_reg * n_iter_unroll); + psi1 += (n_elem_per_reg * n_iter_unroll); + } + + //accumulate the results + rho1v.v += rho2v.v; + rho3v.v += rho4v.v; + rho1v.v += rho3v.v; + + dotxy += rho1v.f[0] + rho1v.f[1] + rho1v.f[2] + rho1v.f[3] + + rho1v.f[4] + rho1v.f[5] + rho1v.f[6] + rho1v.f[7]; + + if (n_left > 0) + { + for (i = 0; i < n_left; ++i) + { + dotxy += (*chi1) * (*psi1); + + chi1 += incx; + psi1 += incy; + } + } + } + else + { + for (i = 0; i < n; ++i) + { + ((dotxy)) += ((*psi1)) * ((*chi1)); + + chi1 += incx; + psi1 += incy; + } + } + + ((*rho)) += ((*alpha)) * ((dotxy)); +} // End of function + + +void bli_ddotxv_unb_var1 ( + conj_t conjx, + conj_t conjy, + dim_t n, + double* restrict alpha, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + double* restrict beta, + double* restrict rho, + cntx_t* cntx + ) +{ + double* restrict chi1; + double* restrict psi1; + double dotxy; + dim_t i; + conj_t conjx_use; + + /* If beta is zero, clear rho. Otherwise, scale by beta. */ + if ((*beta) == (0.0)) + { + (*rho) = (0.0); + } + else + { + ((*rho)) = ((*beta)) * ((*rho)); + } + + if (n == 0) return; + + dotxy = (0.0); + + chi1 = x; + psi1 = y; + + dim_t n_run; + dim_t n_left; + + const dim_t n_elem_per_reg = 4; // 256 bit register can hold 4 double + const dim_t n_iter_unroll = 4; // In a single loop 4 vectors of length 4 will be processed + + v4df_t rho1v; + v4df_t x1v, y1v; + + v4df_t rho2v; + v4df_t x2v, y2v; + + v4df_t rho3v; + v4df_t x3v, y3v; + + v4df_t rho4v; + v4df_t x4v, y4v; + + rho1v.v = _mm256_setzero_pd(); + rho2v.v = _mm256_setzero_pd(); + rho3v.v = _mm256_setzero_pd(); + rho4v.v = _mm256_setzero_pd(); + + n_run = ( n ) / (n_elem_per_reg * n_iter_unroll); + n_left = ( n ) % (n_elem_per_reg * n_iter_unroll); + + + /* If y must be conjugated, we do so indirectly by first toggling the + effective conjugation of x and then conjugating the resulting dot + product. */ + conjx_use = conjx; + + if ((conjy == BLIS_CONJUGATE)) + { + conjx_use = (conjx_use ^ (0x1 << 4)); + } + + if (incx == 1 && incy == 1) + { + for (i = 0; i < n_run; ++i) + { + //(( dotxy )) += (( psi1[i] )) * (( chi1[i] )); + + // load the input + x1v.v = _mm256_loadu_pd((double*)chi1); + y1v.v = _mm256_loadu_pd((double*)psi1); + + y2v.v = _mm256_loadu_pd((double*)(psi1 + n_elem_per_reg)); + x2v.v = _mm256_loadu_pd((double*)(chi1 + n_elem_per_reg)); + + x3v.v = _mm256_loadu_pd((double*)(chi1 + 2 * n_elem_per_reg)); + y3v.v = _mm256_loadu_pd((double*)(psi1 + 2 * n_elem_per_reg)); + + x4v.v = _mm256_loadu_pd((double*)(chi1 + 3 * n_elem_per_reg)); + y4v.v = _mm256_loadu_pd((double*)(psi1 + 3 * n_elem_per_reg)); + + // Calculate the dot product + rho1v.v += y1v.v * x1v.v; + rho2v.v += y2v.v * x2v.v; + rho3v.v += y3v.v * x3v.v; + rho4v.v += y4v.v * x4v.v; + + chi1 += (n_elem_per_reg * n_iter_unroll); + psi1 += (n_elem_per_reg * n_iter_unroll); + } + + //accumulate the results + rho1v.v += rho2v.v; + rho3v.v += rho4v.v; + rho1v.v += rho3v.v; + + dotxy += rho1v.d[0] + rho1v.d[1] + rho1v.d[2] + rho1v.d[3]; + + if (n_left > 0) + { + for (i = 0; i < n_left; ++i) + { + dotxy += (*chi1) * (*psi1); + + chi1 += incx; + psi1 += incy; + } + } + } + else + { + for (i = 0; i < n; ++i) + { + dotxy += ((*psi1)) * ((*chi1)); + + chi1 += incx; + psi1 += incy; + } + } + + ((*rho)) += ((*alpha)) * ((dotxy)); + +}// End of function