From 58b5b77e5fdb179ea465e398e416e6a00d917e05 Mon Sep 17 00:00:00 2001 From: Kiran Varaganti Date: Wed, 8 Feb 2017 21:43:34 +0530 Subject: [PATCH] Fixed a bug in axpyv: the arguments passed to the intrinsic fmadd instruction are corrected Change-Id: If12f24c6bc74b22ac9e4acd6b9378e06d79f2f5e --- kernels/x86_64/zen/1/bli_axpyv_opt_var1.c | 72 +++++------------------ 1 file changed, 14 insertions(+), 58 deletions(-) diff --git a/kernels/x86_64/zen/1/bli_axpyv_opt_var1.c b/kernels/x86_64/zen/1/bli_axpyv_opt_var1.c index 073c40b09..998c2a6c1 100644 --- a/kernels/x86_64/zen/1/bli_axpyv_opt_var1.c +++ b/kernels/x86_64/zen/1/bli_axpyv_opt_var1.c @@ -174,7 +174,7 @@ void bli_daxpyv_opt_var1 typedef union { __m256 v; - float f[8]; + float f[8] __attribute__((aligned(64))); } v8ff_t; /* ! /brief Single precision axpyv function. @@ -517,18 +517,12 @@ void bli_saxpyv_opt_var4 ( y4v.v = _mm256_loadu_ps( ( float* )(y1 + 24)); y5v.v = _mm256_loadu_ps( ( float* )(y1 + 32)); -#if 0 - x1v.v = _mm256_fmadd_ps(x1v.v, y1v.v, alpha1v.v); // x1 = alpha * x1 + y1 - x2v.v = _mm256_fmadd_ps(x2v.v, y2v.v, alpha1v.v); - x3v.v = _mm256_fmadd_ps(x3v.v, y3v.v, alpha1v.v); - x4v.v = _mm256_fmadd_ps(x4v.v, y4v.v, alpha1v.v); - x5v.v = _mm256_fmadd_ps(x5v.v, y5v.v, alpha1v.v); -#endif - _mm256_fmadd_ps(x1v.v, y1v.v, alpha1v.v); // x1 = alpha * x1 + y1 - _mm256_fmadd_ps(x2v.v, y2v.v, alpha1v.v); - _mm256_fmadd_ps(x3v.v, y3v.v, alpha1v.v); - _mm256_fmadd_ps(x4v.v, y4v.v, alpha1v.v); - _mm256_fmadd_ps(x5v.v, y5v.v, alpha1v.v); + + x1v.v = _mm256_fmadd_ps(x1v.v, alpha1v.v, y1v.v); // x1 = alpha * x1 + y1 + x2v.v = _mm256_fmadd_ps(x2v.v, alpha1v.v, y2v.v); + x3v.v = _mm256_fmadd_ps(x3v.v, alpha1v.v, y3v.v); + x4v.v = _mm256_fmadd_ps(x4v.v, alpha1v.v, y4v.v); + x5v.v = _mm256_fmadd_ps(x5v.v, alpha1v.v, y5v.v); _mm256_storeu_ps( ( float* )(y1), x1v.v ); @@ -536,18 +530,6 @@ void bli_saxpyv_opt_var4 ( _mm256_storeu_ps( ( float* )(y1 + 16), x3v.v ); _mm256_storeu_ps( ( float* )(y1 + 24), x4v.v ); 
_mm256_storeu_ps( ( float* )(y1 + 32), x5v.v ); -#if 0 - y1v.v += alpha1v.v * x1v.v; - y2v.v += alpha1v.v * x2v.v; - y3v.v += alpha1v.v * x3v.v; - y4v.v += alpha1v.v * x4v.v; - y5v.v += alpha1v.v * x5v.v; - _mm256_storeu_ps( ( float* )(y1), y1v.v ); - _mm256_storeu_ps( ( float* )(y1 + 8), y2v.v ); - _mm256_storeu_ps( ( float* )(y1 + 16), y3v.v ); - _mm256_storeu_ps( ( float* )(y1 + 24), y4v.v ); - _mm256_storeu_ps( ( float* )(y1 + 32), y5v.v ); -#endif x1 += 40; y1 += 40; @@ -565,10 +547,10 @@ void bli_saxpyv_opt_var4 ( y3v.v = _mm256_loadu_ps( ( float* )(y1 + 16)); y4v.v = _mm256_loadu_ps( ( float* )(y1 + 24)); - _mm256_fmadd_ps(x1v.v, y1v.v, alpha1v.v); // x1v = alpha * x1v + y1v - _mm256_fmadd_ps(x2v.v, y2v.v, alpha1v.v); - _mm256_fmadd_ps(x3v.v, y3v.v, alpha1v.v); - _mm256_fmadd_ps(x4v.v, y4v.v, alpha1v.v); + x1v.v = _mm256_fmadd_ps(x1v.v, alpha1v.v, y1v.v); // x1v = alpha * x1v + y1v + x2v.v = _mm256_fmadd_ps(x2v.v, alpha1v.v, y2v.v); + x3v.v = _mm256_fmadd_ps(x3v.v, alpha1v.v, y3v.v); + x4v.v = _mm256_fmadd_ps(x4v.v, alpha1v.v, y4v.v); _mm256_storeu_ps( ( float* )(y1), x1v.v ); @@ -576,19 +558,6 @@ void bli_saxpyv_opt_var4 ( _mm256_storeu_ps( ( float* )(y1 + 16), x3v.v ); _mm256_storeu_ps( ( float* )(y1 + 24), x4v.v ); - -#if 0 - y1v.v += alpha1v.v * x1v.v; - y2v.v += alpha1v.v * x2v.v; - y3v.v += alpha1v.v * x3v.v; - y4v.v += alpha1v.v * x4v.v; - - _mm256_storeu_ps( ( float* )(y1), y1v.v ); - _mm256_storeu_ps( ( float* )(y1 + 8), y2v.v ); - _mm256_storeu_ps( ( float* )(y1 + 16), y3v.v ); - _mm256_storeu_ps( ( float* )(y1 + 24), y4v.v ); -#endif - x1 += 32; y1 += 32; } @@ -601,20 +570,12 @@ void bli_saxpyv_opt_var4 ( y1v.v = _mm256_loadu_ps( ( float* )y1 ); y2v.v = _mm256_loadu_ps( ( float* )(y1 + 8 ) ); - _mm256_fmadd_ps(x1v.v, y1v.v, alpha1v.v); // x1v = alpha * x1v + y1v - _mm256_fmadd_ps(x2v.v, y2v.v, alpha1v.v); + x1v.v = _mm256_fmadd_ps(x1v.v, alpha1v.v, y1v.v); // x1v = alpha * x1v + y1v + x2v.v = _mm256_fmadd_ps(x2v.v, alpha1v.v, y2v.v); 
_mm256_storeu_ps( ( float* )(y1), x1v.v ); _mm256_storeu_ps( ( float* )(y1 + 8), x2v.v ); -#if 0 - y1v.v += alpha1v.v * x1v.v; - y2v.v += alpha1v.v * x2v.v; - - _mm256_storeu_ps( ( float* )(y1), y1v.v ); - _mm256_storeu_ps( ( float* )(y1 + 8), y2v.v ); -#endif - x1 += 16; y1 += 16; } @@ -624,15 +585,10 @@ void bli_saxpyv_opt_var4 ( x1v.v = _mm256_loadu_ps( ( float* )x1 ); y1v.v = _mm256_loadu_ps( ( float* )y1 ); - _mm256_fmadd_ps(x1v.v, y1v.v, alpha1v.v); // x1v = alpha * x1v + y1v + x1v.v = _mm256_fmadd_ps(x1v.v, alpha1v.v, y1v.v); // x1v = alpha * x1v + y1v _mm256_storeu_ps( ( float* )(y1), x1v.v ); -#if 0 - y1v.v += alpha1v.v * x1v.v; - _mm256_storeu_ps( ( float* )(y1), y1v.v ); -#endif - x1 += 8; y1 += 8; }