From 4ec2bad74464abcfd05c457f67d4838db9cf0957 Mon Sep 17 00:00:00 2001
From: Vignesh Balasubramanian <vignbala@amd.com>
Date: Thu, 1 Aug 2024 16:59:22 +0530
Subject: [PATCH] Updating reduction step of AVX512 DNRM2 API

- Updated the final reduction of partial sums to use scalar accumulation
  entirely, instead of using the _mm512_reduce_add_pd( ... ) intrinsic.
  This will in turn change the associativity and the rounding-off
  pattern in the reduction step.

- Defined a union data-type to do the same, by having a 512-bit
  register and a double-precision array as its members.

- Updated the declaration and usage of the register variable according
  to the union definition, for uniformity.

AMD-Internal: [CPUPL-5472]
Change-Id: I997464a6ec47e4054dca48a000fbd4ac0cfcc679
---
 kernels/zen4/1/bli_norm2_zen_int_avx512.c | 379 +++++++++++-----------
 1 file changed, 195 insertions(+), 184 deletions(-)

diff --git a/kernels/zen4/1/bli_norm2_zen_int_avx512.c b/kernels/zen4/1/bli_norm2_zen_int_avx512.c
index 8a111cb65..14d0b72d7 100644
--- a/kernels/zen4/1/bli_norm2_zen_int_avx512.c
+++ b/kernels/zen4/1/bli_norm2_zen_int_avx512.c
@@ -34,6 +34,14 @@
 #include "immintrin.h"
 #include "blis.h"
 
+// Union data structure to access AVX registers
+// One 512-bit AVX register holds 8 DP elements. 
+typedef union
+{
+    __m512d v;
+    double  d[8] __attribute__( ( aligned( 64 ) ) );
+} v8df_t;
+
 /*
     Optimized kernel that computes the Frobenius norm using AVX512 intrinsics.
     The kernel takes in the following input parameters :
@@ -88,9 +96,9 @@ void bli_dnorm2fv_unb_var1_avx512
     {
         // AVX-512 code-section
         // Declaring registers for loading, accumulation, thresholds and scale factors
-        __m512d x_vec[4], sum_sml_vec[4], sum_med_vec[4], sum_big_vec[4], temp[4];
-        __m512d thresh_sml_vec, thresh_big_vec, scale_sml_vec, scale_big_vec;
-        __m512d zero_reg;
+        v8df_t x_vec[4], sum_sml_vec[4], sum_med_vec[4], sum_big_vec[4], temp[4];
+        v8df_t thresh_sml_vec, thresh_big_vec, scale_sml_vec, scale_big_vec;
+        v8df_t zero_reg;
 
         // Masks to be used in computation
         __mmask8 k_mask[8];
@@ -101,55 +109,55 @@ void bli_dnorm2fv_unb_var1_avx512
         unsigned char truth_val[4];
 
         // Setting the thresholds and scaling factors
-        thresh_sml_vec = _mm512_set1_pd( thresh_sml );
-        thresh_big_vec = _mm512_set1_pd( thresh_big );
-        scale_sml_vec = _mm512_set1_pd( scale_sml );
-        scale_big_vec = _mm512_set1_pd( scale_big );
+        thresh_sml_vec.v = _mm512_set1_pd( thresh_sml );
+        thresh_big_vec.v = _mm512_set1_pd( thresh_big );
+        scale_sml_vec.v = _mm512_set1_pd( scale_sml );
+        scale_big_vec.v = _mm512_set1_pd( scale_big );
 
         // Resetting the accumulators
-        sum_sml_vec[0] = _mm512_setzero_pd();
-        sum_sml_vec[1] = _mm512_setzero_pd();
-        sum_sml_vec[2] = _mm512_setzero_pd();
-        sum_sml_vec[3] = _mm512_setzero_pd();
+        sum_sml_vec[0].v = _mm512_setzero_pd();
+        sum_sml_vec[1].v = _mm512_setzero_pd();
+        sum_sml_vec[2].v = _mm512_setzero_pd();
+        sum_sml_vec[3].v = _mm512_setzero_pd();
 
-        sum_med_vec[0] = _mm512_setzero_pd();
-        sum_med_vec[1] = _mm512_setzero_pd();
-        sum_med_vec[2] = _mm512_setzero_pd();
-        sum_med_vec[3] = _mm512_setzero_pd();
+        sum_med_vec[0].v = _mm512_setzero_pd();
+        sum_med_vec[1].v = _mm512_setzero_pd();
+        sum_med_vec[2].v = _mm512_setzero_pd();
+        sum_med_vec[3].v = _mm512_setzero_pd();
 
-        sum_big_vec[0] = _mm512_setzero_pd();
-        sum_big_vec[1] = _mm512_setzero_pd();
-        sum_big_vec[2] = _mm512_setzero_pd();
-        sum_big_vec[3] = _mm512_setzero_pd();
+        sum_big_vec[0].v = _mm512_setzero_pd();
+        sum_big_vec[1].v = _mm512_setzero_pd();
+        sum_big_vec[2].v = _mm512_setzero_pd();
+        sum_big_vec[3].v = _mm512_setzero_pd();
 
-        zero_reg = _mm512_setzero_pd();
+        zero_reg.v = _mm512_setzero_pd();
 
         // Computing in blocks of 32
         for ( ; ( i + 32 ) <= n; i = i + 32 )
         {
             // Set temp[0..3] to zero
-            temp[0] = _mm512_setzero_pd();
-            temp[1] = _mm512_setzero_pd();
-            temp[2] = _mm512_setzero_pd();
-            temp[3] = _mm512_setzero_pd();
+            temp[0].v = _mm512_setzero_pd();
+            temp[1].v = _mm512_setzero_pd();
+            temp[2].v = _mm512_setzero_pd();
+            temp[3].v = _mm512_setzero_pd();
 
             // Loading the vectors
-            x_vec[0] = _mm512_loadu_pd( xt );
-            x_vec[1] = _mm512_loadu_pd( xt + 8 );
-            x_vec[2] = _mm512_loadu_pd( xt + 16 );
-            x_vec[3] = _mm512_loadu_pd( xt + 24 );
+            x_vec[0].v = _mm512_loadu_pd( xt );
+            x_vec[1].v = _mm512_loadu_pd( xt + 8 );
+            x_vec[2].v = _mm512_loadu_pd( xt + 16 );
+            x_vec[3].v = _mm512_loadu_pd( xt + 24 );
 
             // Comparing to check for NaN
             // Bits in the mask are set if NaN is encountered
-            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0], x_vec[0], _CMP_UNORD_Q );
-            k_mask[1] = _mm512_cmp_pd_mask( x_vec[1], x_vec[1], _CMP_UNORD_Q );
-            k_mask[2] = _mm512_cmp_pd_mask( x_vec[2], x_vec[2], _CMP_UNORD_Q );
-            k_mask[3] = _mm512_cmp_pd_mask( x_vec[3], x_vec[3], _CMP_UNORD_Q );
+            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0].v, x_vec[0].v, _CMP_UNORD_Q );
+            k_mask[1] = _mm512_cmp_pd_mask( x_vec[1].v, x_vec[1].v, _CMP_UNORD_Q );
+            k_mask[2] = _mm512_cmp_pd_mask( x_vec[2].v, x_vec[2].v, _CMP_UNORD_Q );
+            k_mask[3] = _mm512_cmp_pd_mask( x_vec[3].v, x_vec[3].v, _CMP_UNORD_Q );
 
             // Checking if any bit in the masks are set
             // The truth_val is set to 0 if any bit in the mask is 1
-            // Thus, truth_val[0] = 0 if x_vec[0] or x_vec[1] has NaN
-            //       truth_val[1] = 0 if x_vec[2] or x_vec[3] has NaN
+            // Thus, truth_val[0] = 0 if x_vec[0].v or x_vec[1].v has NaN
+            //       truth_val[1] = 0 if x_vec[2].v or x_vec[3].v has NaN
             truth_val[0] = _kortestz_mask8_u8( k_mask[0], k_mask[1] );
             truth_val[1] = _kortestz_mask8_u8( k_mask[2], k_mask[3] );
 
@@ -163,30 +171,30 @@ void bli_dnorm2fv_unb_var1_avx512
             }
 
             // Getting the absoulte values of elements in the vectors
-            x_vec[0] = _mm512_abs_pd( x_vec[0] );
-            x_vec[1] = _mm512_abs_pd( x_vec[1] );
-            x_vec[2] = _mm512_abs_pd( x_vec[2] );
-            x_vec[3] = _mm512_abs_pd( x_vec[3] );
+            x_vec[0].v = _mm512_abs_pd( x_vec[0].v );
+            x_vec[1].v = _mm512_abs_pd( x_vec[1].v );
+            x_vec[2].v = _mm512_abs_pd( x_vec[2].v );
+            x_vec[3].v = _mm512_abs_pd( x_vec[3].v );
 
-            // Setting the masks by comparing with thresh_sml_vec
-            // That is, k_mask[0][i] = 1 if x_vec[0][i] > thresh_sml_vec
-            //          k_mask[1][i] = 1 if x_vec[1][i] > thresh_sml_vec
-            //          k_mask[2][i] = 1 if x_vec[2][i] > thresh_sml_vec
-            //          k_mask[3][i] = 1 if x_vec[3][i] > thresh_sml_vec
-            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0], thresh_sml_vec, _CMP_GT_OS );
-            k_mask[1] = _mm512_cmp_pd_mask( x_vec[1], thresh_sml_vec, _CMP_GT_OS );
-            k_mask[2] = _mm512_cmp_pd_mask( x_vec[2], thresh_sml_vec, _CMP_GT_OS );
-            k_mask[3] = _mm512_cmp_pd_mask( x_vec[3], thresh_sml_vec, _CMP_GT_OS );
+            // Setting the masks by comparing with thresh_sml_vec.v
+            // That is, k_mask[0][i] = 1 if x_vec[0].v[i] > thresh_sml_vec.v
+            //          k_mask[1][i] = 1 if x_vec[1].v[i] > thresh_sml_vec.v
+            //          k_mask[2][i] = 1 if x_vec[2].v[i] > thresh_sml_vec.v
+            //          k_mask[3][i] = 1 if x_vec[3].v[i] > thresh_sml_vec.v
+            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0].v, thresh_sml_vec.v, _CMP_GT_OS );
+            k_mask[1] = _mm512_cmp_pd_mask( x_vec[1].v, thresh_sml_vec.v, _CMP_GT_OS );
+            k_mask[2] = _mm512_cmp_pd_mask( x_vec[2].v, thresh_sml_vec.v, _CMP_GT_OS );
+            k_mask[3] = _mm512_cmp_pd_mask( x_vec[3].v, thresh_sml_vec.v, _CMP_GT_OS );
 
-            // Setting the masks by comparing with thresh_big_vec
-            // That is, k_mask[4][i] = 1 if x_vec[0][i] < thresh_big_vec
-            //          k_mask[5][i] = 1 if x_vec[1][i] < thresh_big_vec
-            //          k_mask[6][i] = 1 if x_vec[2][i] < thresh_big_vec
-            //          k_mask[7][i] = 1 if x_vec[3][i] < thresh_big_vec
-            k_mask[4] = _mm512_cmp_pd_mask( x_vec[0], thresh_big_vec, _CMP_LT_OS );
-            k_mask[5] = _mm512_cmp_pd_mask( x_vec[1], thresh_big_vec, _CMP_LT_OS );
-            k_mask[6] = _mm512_cmp_pd_mask( x_vec[2], thresh_big_vec, _CMP_LT_OS );
-            k_mask[7] = _mm512_cmp_pd_mask( x_vec[3], thresh_big_vec, _CMP_LT_OS );
+            // Setting the masks by comparing with thresh_big_vec.v
+            // That is, k_mask[4][i] = 1 if x_vec[0].v[i] < thresh_big_vec.v
+            //          k_mask[5][i] = 1 if x_vec[1].v[i] < thresh_big_vec.v
+            //          k_mask[6][i] = 1 if x_vec[2].v[i] < thresh_big_vec.v
+            //          k_mask[7][i] = 1 if x_vec[3].v[i] < thresh_big_vec.v
+            k_mask[4] = _mm512_cmp_pd_mask( x_vec[0].v, thresh_big_vec.v, _CMP_LT_OS );
+            k_mask[5] = _mm512_cmp_pd_mask( x_vec[1].v, thresh_big_vec.v, _CMP_LT_OS );
+            k_mask[6] = _mm512_cmp_pd_mask( x_vec[2].v, thresh_big_vec.v, _CMP_LT_OS );
+            k_mask[7] = _mm512_cmp_pd_mask( x_vec[3].v, thresh_big_vec.v, _CMP_LT_OS );
 
             // Setting the masks to filter only the elements within the thresholds
             // k_mask[0 ... 3] contain masks for elements > thresh_sml
@@ -200,10 +208,10 @@ void bli_dnorm2fv_unb_var1_avx512
             // Setting booleans to check for underflow/overflow handling
             // In case of having values outside threshold, the associated
             // bit in k_mask[4 ... 7] is 0.
-            // Thus, truth_val[0] = 0 if x_vec[0] has elements outside thresholds
-            //       truth_val[1] = 0 if x_vec[1] has elements outside thresholds
-            //       truth_val[2] = 0 if x_vec[2] has elements outside thresholds
-            //       truth_val[3] = 0 if x_vec[3] has elements outside thresholds
+            // Thus, truth_val[0] = 0 if x_vec[0].v has elements outside thresholds
+            //       truth_val[1] = 0 if x_vec[1].v has elements outside thresholds
+            //       truth_val[2] = 0 if x_vec[2].v has elements outside thresholds
+            //       truth_val[3] = 0 if x_vec[3].v has elements outside thresholds
             truth_val[0] = _kortestc_mask8_u8( k_mask[4], k_mask[4] );
             truth_val[1] = _kortestc_mask8_u8( k_mask[5], k_mask[5] );
             truth_val[2] = _kortestc_mask8_u8( k_mask[6], k_mask[6] );
@@ -211,10 +219,10 @@ void bli_dnorm2fv_unb_var1_avx512
 
             // Computing using masked fmadds, that carries over values from
             // accumulator register if the mask bit is 0
-            sum_med_vec[0] = _mm512_mask3_fmadd_pd( x_vec[0], x_vec[0], sum_med_vec[0], k_mask[4] );
-            sum_med_vec[1] = _mm512_mask3_fmadd_pd( x_vec[1], x_vec[1], sum_med_vec[1], k_mask[5] );
-            sum_med_vec[2] = _mm512_mask3_fmadd_pd( x_vec[2], x_vec[2], sum_med_vec[2], k_mask[6] );
-            sum_med_vec[3] = _mm512_mask3_fmadd_pd( x_vec[3], x_vec[3], sum_med_vec[3], k_mask[7] );
+            sum_med_vec[0].v = _mm512_mask3_fmadd_pd( x_vec[0].v, x_vec[0].v, sum_med_vec[0].v, k_mask[4] );
+            sum_med_vec[1].v = _mm512_mask3_fmadd_pd( x_vec[1].v, x_vec[1].v, sum_med_vec[1].v, k_mask[5] );
+            sum_med_vec[2].v = _mm512_mask3_fmadd_pd( x_vec[2].v, x_vec[2].v, sum_med_vec[2].v, k_mask[6] );
+            sum_med_vec[3].v = _mm512_mask3_fmadd_pd( x_vec[3].v, x_vec[3].v, sum_med_vec[3].v, k_mask[7] );
 
             // In case of having elements outside the threshold
             if( !( truth_val[0] && truth_val[1] && truth_val[2] && truth_val[3] ) )
@@ -224,20 +232,20 @@ void bli_dnorm2fv_unb_var1_avx512
                 // k_mask[0 ... 3] contain masks for elements > thresh_sml. This would
                 // include both elements < thresh_big and >= thresh_big
                 // XOR on these will produce masks for elements >= thresh_big
-                // That is, k_mask[4][i] = 1 if x_vec[0][i] >= thresh_big_vec
-                //          k_mask[5][i] = 1 if x_vec[1][i] >= thresh_big_vec
-                //          k_mask[6][i] = 1 if x_vec[2][i] >= thresh_big_vec
-                //          k_mask[7][i] = 1 if x_vec[3][i] >= thresh_big_vec
+                // That is, k_mask[4][i] = 1 if x_vec[0].v[i] >= thresh_big_vec.v
+                //          k_mask[5][i] = 1 if x_vec[1].v[i] >= thresh_big_vec.v
+                //          k_mask[6][i] = 1 if x_vec[2].v[i] >= thresh_big_vec.v
+                //          k_mask[7][i] = 1 if x_vec[3].v[i] >= thresh_big_vec.v
                 k_mask[4] = _kxor_mask8( k_mask[0], k_mask[4] );
                 k_mask[5] = _kxor_mask8( k_mask[1], k_mask[5] );
                 k_mask[6] = _kxor_mask8( k_mask[2], k_mask[6] );
                 k_mask[7] = _kxor_mask8( k_mask[3], k_mask[7] );
 
                 // Inverting k_mask[0 ... 3], to obtain masks for elements <= thresh_sml
-                // That is, k_mask[0][i] = 1 if x_vec[0][i] <= thresh_sml_vec
-                //          k_mask[1][i] = 1 if x_vec[1][i] <= thresh_sml_vec
-                //          k_mask[2][i] = 1 if x_vec[2][i] <= thresh_sml_vec
-                //          k_mask[3][i] = 1 if x_vec[3][i] <= thresh_sml_vec
+                // That is, k_mask[0][i] = 1 if x_vec[0].v[i] <= thresh_sml_vec.v
+                //          k_mask[1][i] = 1 if x_vec[1].v[i] <= thresh_sml_vec.v
+                //          k_mask[2][i] = 1 if x_vec[2].v[i] <= thresh_sml_vec.v
+                //          k_mask[3][i] = 1 if x_vec[3].v[i] <= thresh_sml_vec.v
                 k_mask[0] = _knot_mask8( k_mask[0] );
                 k_mask[1] = _knot_mask8( k_mask[1] );
                 k_mask[2] = _knot_mask8( k_mask[2] );
@@ -245,8 +253,8 @@ void bli_dnorm2fv_unb_var1_avx512
 
                 // Checking whether we have values greater than thresh_big
                 // The truth_val is set to 0 if any bit in the mask is 1
-                // Thus, truth_val[2] = 0 if x_vec[0] or x_vec[1] has elements >= thresh_big_vec
-                //       truth_val[3] = 0 if x_vec[2] or x_vec[3] has elements >= thresh_big_vec
+                // Thus, truth_val[2] = 0 if x_vec[0].v or x_vec[1].v has elements >= thresh_big_vec.v
+                //       truth_val[3] = 0 if x_vec[2].v or x_vec[3].v has elements >= thresh_big_vec.v
                 truth_val[2] = _kortestz_mask8_u8( k_mask[4], k_mask[5] );
                 truth_val[3] = _kortestz_mask8_u8( k_mask[6], k_mask[7] );
 
@@ -261,16 +269,16 @@ void bli_dnorm2fv_unb_var1_avx512
                     // are greater than thresh_big
 
                     // Scale the required elements in x_vec[0..3] by scale_smal
-                    temp[0] = _mm512_mask_mul_pd( zero_reg, k_mask[4], scale_big_vec, x_vec[0] );
-                    temp[1] = _mm512_mask_mul_pd( zero_reg, k_mask[5], scale_big_vec, x_vec[1] );
-                    temp[2] = _mm512_mask_mul_pd( zero_reg, k_mask[6], scale_big_vec, x_vec[2] );
-                    temp[3] = _mm512_mask_mul_pd( zero_reg, k_mask[7], scale_big_vec, x_vec[3] );
+                    temp[0].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[4], scale_big_vec.v, x_vec[0].v );
+                    temp[1].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[5], scale_big_vec.v, x_vec[1].v );
+                    temp[2].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[6], scale_big_vec.v, x_vec[2].v );
+                    temp[3].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[7], scale_big_vec.v, x_vec[3].v );
 
                     // Square and add the elements to the accumulators
-                    sum_big_vec[0] = _mm512_fmadd_pd( temp[0], temp[0], sum_big_vec[0] );
-                    sum_big_vec[1] = _mm512_fmadd_pd( temp[1], temp[1], sum_big_vec[1] );
-                    sum_big_vec[2] = _mm512_fmadd_pd( temp[2], temp[2], sum_big_vec[2] );
-                    sum_big_vec[3] = _mm512_fmadd_pd( temp[3], temp[3], sum_big_vec[3] );
+                    sum_big_vec[0].v = _mm512_fmadd_pd( temp[0].v, temp[0].v, sum_big_vec[0].v );
+                    sum_big_vec[1].v = _mm512_fmadd_pd( temp[1].v, temp[1].v, sum_big_vec[1].v );
+                    sum_big_vec[2].v = _mm512_fmadd_pd( temp[2].v, temp[2].v, sum_big_vec[2].v );
+                    sum_big_vec[3].v = _mm512_fmadd_pd( temp[3].v, temp[3].v, sum_big_vec[3].v );
                 }
                 else if( !isbig )
                 {
@@ -279,16 +287,16 @@ void bli_dnorm2fv_unb_var1_avx512
                     // are lesser than thresh_sml, if needed
 
                     // Scale the required elements in x_vec[0..3] by scale_smal
-                    temp[0] = _mm512_mask_mul_pd( zero_reg, k_mask[0], scale_sml_vec, x_vec[0] );
-                    temp[1] = _mm512_mask_mul_pd( zero_reg, k_mask[1], scale_sml_vec, x_vec[1] );
-                    temp[2] = _mm512_mask_mul_pd( zero_reg, k_mask[2], scale_sml_vec, x_vec[2] );
-                    temp[3] = _mm512_mask_mul_pd( zero_reg, k_mask[3], scale_sml_vec, x_vec[3] );
+                    temp[0].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[0], scale_sml_vec.v, x_vec[0].v );
+                    temp[1].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[1], scale_sml_vec.v, x_vec[1].v );
+                    temp[2].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[2], scale_sml_vec.v, x_vec[2].v );
+                    temp[3].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[3], scale_sml_vec.v, x_vec[3].v );
 
                     // Square and add the elements to the accumulators
-                    sum_sml_vec[0] = _mm512_fmadd_pd( temp[0], temp[0], sum_sml_vec[0] );
-                    sum_sml_vec[1] = _mm512_fmadd_pd( temp[1], temp[1], sum_sml_vec[1] );
-                    sum_sml_vec[2] = _mm512_fmadd_pd( temp[2], temp[2], sum_sml_vec[2] );
-                    sum_sml_vec[3] = _mm512_fmadd_pd( temp[3], temp[3], sum_sml_vec[3] );
+                    sum_sml_vec[0].v = _mm512_fmadd_pd( temp[0].v, temp[0].v, sum_sml_vec[0].v );
+                    sum_sml_vec[1].v = _mm512_fmadd_pd( temp[1].v, temp[1].v, sum_sml_vec[1].v );
+                    sum_sml_vec[2].v = _mm512_fmadd_pd( temp[2].v, temp[2].v, sum_sml_vec[2].v );
+                    sum_sml_vec[3].v = _mm512_fmadd_pd( temp[3].v, temp[3].v, sum_sml_vec[3].v );
                 }
             }
 
@@ -300,21 +308,21 @@ void bli_dnorm2fv_unb_var1_avx512
         for ( ; ( i + 16 ) <= n; i = i + 16 )
         {
             // Set temp[0..1] to zero
-            temp[0] = _mm512_setzero_pd();
-            temp[1] = _mm512_setzero_pd();
+            temp[0].v = _mm512_setzero_pd();
+            temp[1].v = _mm512_setzero_pd();
 
             // Loading the vectors
-            x_vec[0] = _mm512_loadu_pd( xt );
-            x_vec[1] = _mm512_loadu_pd( xt + 8 );
+            x_vec[0].v = _mm512_loadu_pd( xt );
+            x_vec[1].v = _mm512_loadu_pd( xt + 8 );
 
             // Comparing to check for NaN
             // Bits in the mask are set if NaN is encountered
-            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0], x_vec[0], _CMP_UNORD_Q );
-            k_mask[1] = _mm512_cmp_pd_mask( x_vec[1], x_vec[1], _CMP_UNORD_Q );
+            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0].v, x_vec[0].v, _CMP_UNORD_Q );
+            k_mask[1] = _mm512_cmp_pd_mask( x_vec[1].v, x_vec[1].v, _CMP_UNORD_Q );
 
             // Checking if any bit in the masks are set
             // The truth_val is set to 0 if any bit in the mask is 1
-            // Thus, truth_val[0] = 0 if x_vec[0] or x_vec[1] has NaN
+            // Thus, truth_val[0] = 0 if x_vec[0].v or x_vec[1].v has NaN
             truth_val[0] = _kortestz_mask8_u8( k_mask[0], k_mask[1] );
 
             // Set norm to NaN and return early, if either truth_val[0] or truth_val[1] is set to 0
@@ -327,20 +335,20 @@ void bli_dnorm2fv_unb_var1_avx512
             }
 
             // Getting the absoulte values of elements in the vectors
-            x_vec[0] = _mm512_abs_pd( x_vec[0] );
-            x_vec[1] = _mm512_abs_pd( x_vec[1] );
+            x_vec[0].v = _mm512_abs_pd( x_vec[0].v );
+            x_vec[1].v = _mm512_abs_pd( x_vec[1].v );
 
-            // Setting the masks by comparing with thresh_sml_vec
-            // That is, k_mask[0][i] = 1 if x_vec[0][i] > thresh_sml_vec
-            //          k_mask[1][i] = 1 if x_vec[1][i] > thresh_sml_vec
-            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0], thresh_sml_vec, _CMP_GT_OS );
-            k_mask[1] = _mm512_cmp_pd_mask( x_vec[1], thresh_sml_vec, _CMP_GT_OS );
+            // Setting the masks by comparing with thresh_sml_vec.v
+            // That is, k_mask[0][i] = 1 if x_vec[0].v[i] > thresh_sml_vec.v
+            //          k_mask[1][i] = 1 if x_vec[1].v[i] > thresh_sml_vec.v
+            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0].v, thresh_sml_vec.v, _CMP_GT_OS );
+            k_mask[1] = _mm512_cmp_pd_mask( x_vec[1].v, thresh_sml_vec.v, _CMP_GT_OS );
 
-            // Setting the masks by comparing with thresh_big_vec
-            // That is, k_mask[4][i] = 1 if x_vec[0][i] < thresh_big_vec
-            //          k_mask[5][i] = 1 if x_vec[1][i] < thresh_big_vec
-            k_mask[4] = _mm512_cmp_pd_mask( x_vec[0], thresh_big_vec, _CMP_LT_OS );
-            k_mask[5] = _mm512_cmp_pd_mask( x_vec[1], thresh_big_vec, _CMP_LT_OS );
+            // Setting the masks by comparing with thresh_big_vec.v
+            // That is, k_mask[4][i] = 1 if x_vec[0].v[i] < thresh_big_vec.v
+            //          k_mask[5][i] = 1 if x_vec[1].v[i] < thresh_big_vec.v
+            k_mask[4] = _mm512_cmp_pd_mask( x_vec[0].v, thresh_big_vec.v, _CMP_LT_OS );
+            k_mask[5] = _mm512_cmp_pd_mask( x_vec[1].v, thresh_big_vec.v, _CMP_LT_OS );
 
             // Setting the masks to filter only the elements within the thresholds
             // k_mask[0 ... 1] contain masks for elements > thresh_sml
@@ -352,15 +360,15 @@ void bli_dnorm2fv_unb_var1_avx512
             // Setting booleans to check for underflow/overflow handling
             // In case of having values outside threshold, the associated
             // bit in k_mask[4 ... 7] is 0.
-            // Thus, truth_val[0] = 0 if x_vec[0] has elements outside thresholds
-            //       truth_val[1] = 0 if x_vec[1] has elements outside thresholds
+            // Thus, truth_val[0] = 0 if x_vec[0].v has elements outside thresholds
+            //       truth_val[1] = 0 if x_vec[1].v has elements outside thresholds
             truth_val[0] = _kortestc_mask8_u8( k_mask[4], k_mask[4] );
             truth_val[1] = _kortestc_mask8_u8( k_mask[5], k_mask[5] );
 
             // Computing using masked fmadds, that carries over values from
             // accumulator register if the mask bit is 0
-            sum_med_vec[0] = _mm512_mask3_fmadd_pd( x_vec[0], x_vec[0], sum_med_vec[0], k_mask[4] );
-            sum_med_vec[1] = _mm512_mask3_fmadd_pd( x_vec[1], x_vec[1], sum_med_vec[1], k_mask[5] );
+            sum_med_vec[0].v = _mm512_mask3_fmadd_pd( x_vec[0].v, x_vec[0].v, sum_med_vec[0].v, k_mask[4] );
+            sum_med_vec[1].v = _mm512_mask3_fmadd_pd( x_vec[1].v, x_vec[1].v, sum_med_vec[1].v, k_mask[5] );
 
             // In case of having elements outside the threshold
             if( !( truth_val[0] && truth_val[1] ) )
@@ -370,20 +378,20 @@ void bli_dnorm2fv_unb_var1_avx512
                 // k_mask[0 ... 1] contain masks for elements > thresh_sml. This would
                 // include both elements < thresh_big and >= thresh_big
                 // XOR on these will produce masks for elements >= thresh_big
-                // That is, k_mask[4][i] = 1 if x_vec[0][i] >= thresh_big_vec
-                //          k_mask[5][i] = 1 if x_vec[1][i] >= thresh_big_vec
+                // That is, k_mask[4][i] = 1 if x_vec[0].v[i] >= thresh_big_vec.v
+                //          k_mask[5][i] = 1 if x_vec[1].v[i] >= thresh_big_vec.v
                 k_mask[4] = _kxor_mask8( k_mask[0], k_mask[4] );
                 k_mask[5] = _kxor_mask8( k_mask[1], k_mask[5] );
 
                 // Inverting k_mask[0 ... 1], to obtain masks for elements <= thresh_sml
-                // That is, k_mask[0][i] = 1 if x_vec[0][i] <= thresh_sml_vec
-                //          k_mask[1][i] = 1 if x_vec[1][i] <= thresh_sml_vec
+                // That is, k_mask[0][i] = 1 if x_vec[0].v[i] <= thresh_sml_vec.v
+                //          k_mask[1][i] = 1 if x_vec[1].v[i] <= thresh_sml_vec.v
                 k_mask[0] = _knot_mask8( k_mask[0] );
                 k_mask[1] = _knot_mask8( k_mask[1] );
 
                 // Checking whether we have values greater than thresh_big
                 // The truth_val is set to 0 if any bit in the mask is 1
-                // Thus, truth_val[2] = 0 if x_vec[0] or x_vec[1] has elements >= thresh_big_vec
+                // Thus, truth_val[2] = 0 if x_vec[0].v or x_vec[1].v has elements >= thresh_big_vec.v
                 truth_val[2] = _kortestz_mask8_u8( k_mask[4], k_mask[5] );
 
                 // In case of having values greater than thresh_big
@@ -397,12 +405,12 @@ void bli_dnorm2fv_unb_var1_avx512
                     // are greater than thresh_big
 
                     // Scale the required elements in x_vec[0..3] by scale_smal
-                    temp[0] = _mm512_mask_mul_pd( zero_reg, k_mask[4], scale_big_vec, x_vec[0] );
-                    temp[1] = _mm512_mask_mul_pd( zero_reg, k_mask[5], scale_big_vec, x_vec[1] );
+                    temp[0].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[4], scale_big_vec.v, x_vec[0].v );
+                    temp[1].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[5], scale_big_vec.v, x_vec[1].v );
 
                     // Square and add the elements to the accumulators
-                    sum_big_vec[0] = _mm512_fmadd_pd( temp[0], temp[0], sum_big_vec[0] );
-                    sum_big_vec[1] = _mm512_fmadd_pd( temp[1], temp[1], sum_big_vec[1] );
+                    sum_big_vec[0].v = _mm512_fmadd_pd( temp[0].v, temp[0].v, sum_big_vec[0].v );
+                    sum_big_vec[1].v = _mm512_fmadd_pd( temp[1].v, temp[1].v, sum_big_vec[1].v );
                 }
                 else if( !isbig )
                 {
@@ -411,12 +419,12 @@ void bli_dnorm2fv_unb_var1_avx512
                     // are lesser than thresh_sml, if needed
 
                     // Scale the required elements in x_vec[0..3] by scale_smal
-                    temp[0] = _mm512_mask_mul_pd( zero_reg, k_mask[0], scale_sml_vec, x_vec[0] );
-                    temp[1] = _mm512_mask_mul_pd( zero_reg, k_mask[1], scale_sml_vec, x_vec[1] );
+                    temp[0].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[0], scale_sml_vec.v, x_vec[0].v );
+                    temp[1].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[1], scale_sml_vec.v, x_vec[1].v );
 
                     // Square and add the elements to the accumulators
-                    sum_sml_vec[0] = _mm512_fmadd_pd( temp[0], temp[0], sum_sml_vec[0] );
-                    sum_sml_vec[1] = _mm512_fmadd_pd( temp[1], temp[1], sum_sml_vec[1] );
+                    sum_sml_vec[0].v = _mm512_fmadd_pd( temp[0].v, temp[0].v, sum_sml_vec[0].v );
+                    sum_sml_vec[1].v = _mm512_fmadd_pd( temp[1].v, temp[1].v, sum_sml_vec[1].v );
                 }
             }
 
@@ -425,19 +433,19 @@ void bli_dnorm2fv_unb_var1_avx512
         }
         for ( ; ( i + 8 ) <= n; i = i + 8 )
         {
-            // Set temp[0] to zero
-            temp[0] = _mm512_setzero_pd();
+            // Set temp[0].v to zero
+            temp[0].v = _mm512_setzero_pd();
 
             // Loading the vectors
-            x_vec[0] = _mm512_loadu_pd( xt );
+            x_vec[0].v = _mm512_loadu_pd( xt );
 
             // Comparing to check for NaN
             // Bits in the mask are set if NaN is encountered
-            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0], x_vec[0], _CMP_UNORD_Q );
+            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0].v, x_vec[0].v, _CMP_UNORD_Q );
 
             // Checking if any bit in the masks are set
             // The truth_val is set to 0 if any bit in the mask is 1
-            // Thus, truth_val[0] = 0 if x_vec[0] or x_vec[1] has NaN
+            // Thus, truth_val[0] = 0 if x_vec[0].v or x_vec[1].v has NaN
             truth_val[0] = _kortestz_mask8_u8( k_mask[0], k_mask[0] );
 
             // Set norm to NaN and return early, if either truth_val[0] or truth_val[1] is set to 0
@@ -450,15 +458,15 @@ void bli_dnorm2fv_unb_var1_avx512
             }
 
             // Getting the absoulte values of elements in the vectors
-            x_vec[0] = _mm512_abs_pd( x_vec[0] );
+            x_vec[0].v = _mm512_abs_pd( x_vec[0].v );
 
-            // Setting the masks by comparing with thresh_sml_vec
-            // That is, k_mask[0][i] = 1 if x_vec[0][i] > thresh_sml_vec
-            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0], thresh_sml_vec, _CMP_GT_OS );
+            // Setting the masks by comparing with thresh_sml_vec.v
+            // That is, k_mask[0][i] = 1 if x_vec[0].v[i] > thresh_sml_vec.v
+            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0].v, thresh_sml_vec.v, _CMP_GT_OS );
 
-            // Setting the masks by comparing with thresh_big_vec
-            // That is, k_mask[4][i] = 1 if x_vec[0][i] < thresh_big_vec
-            k_mask[4] = _mm512_cmp_pd_mask( x_vec[0], thresh_big_vec, _CMP_LT_OS );
+            // Setting the masks by comparing with thresh_big_vec.v
+            // That is, k_mask[4][i] = 1 if x_vec[0].v[i] < thresh_big_vec.v
+            k_mask[4] = _mm512_cmp_pd_mask( x_vec[0].v, thresh_big_vec.v, _CMP_LT_OS );
 
             // Setting the masks to filter only the elements within the thresholds
             // k_mask[0] contain masks for elements > thresh_sml
@@ -469,12 +477,12 @@ void bli_dnorm2fv_unb_var1_avx512
             // Setting booleans to check for underflow/overflow handling
             // In case of having values outside threshold, the associated
             // bit in k_mask[4] is 0.
-            // Thus, truth_val[0] = 0 if x_vec[0] has elements outside thresholds
+            // Thus, truth_val[0] = 0 if x_vec[0].v has elements outside thresholds
             truth_val[0] = _kortestc_mask8_u8( k_mask[4], k_mask[4] );
 
             // Computing using masked fmadds, that carries over values from
             // accumulator register if the mask bit is 0
-            sum_med_vec[0] = _mm512_mask3_fmadd_pd( x_vec[0], x_vec[0], sum_med_vec[0], k_mask[4] );
+            sum_med_vec[0].v = _mm512_mask3_fmadd_pd( x_vec[0].v, x_vec[0].v, sum_med_vec[0].v, k_mask[4] );
 
             // In case of having elements outside the threshold
             if( !truth_val[0] )
@@ -484,18 +492,18 @@ void bli_dnorm2fv_unb_var1_avx512
                 // k_mask[0 ... 1] contain masks for elements > thresh_sml. This would
                 // include both elements < thresh_big and >= thresh_big
                 // XOR on these will produce masks for elements >= thresh_big
-                // That is, k_mask[4][i] = 1 if x_vec[0][i] >= thresh_big_vec
-                //          k_mask[5][i] = 1 if x_vec[1][i] >= thresh_big_vec
+                // That is, k_mask[4][i] = 1 if x_vec[0].v[i] >= thresh_big_vec.v
+                //          k_mask[5][i] = 1 if x_vec[1].v[i] >= thresh_big_vec.v
                 k_mask[4] = _kxor_mask8( k_mask[0], k_mask[4] );
 
                 // Inverting k_mask[0 ... 1], to obtain masks for elements <= thresh_sml
-                // That is, k_mask[0][i] = 1 if x_vec[0][i] <= thresh_sml_vec
-                //          k_mask[1][i] = 1 if x_vec[1][i] <= thresh_sml_vec
+                // That is, k_mask[0][i] = 1 if x_vec[0].v[i] <= thresh_sml_vec.v
+                //          k_mask[1][i] = 1 if x_vec[1].v[i] <= thresh_sml_vec.v
                 k_mask[0] = _knot_mask8( k_mask[0] );
 
                 // Checking whether we have values greater than thresh_big
                 // The truth_val is set to 0 if any bit in the mask is 1
-                // Thus, truth_val[2] = 0 if x_vec[0] or x_vec[1] has elements >= thresh_big_vec
+                // Thus, truth_val[2] = 0 if x_vec[0].v or x_vec[1].v has elements >= thresh_big_vec.v
                 truth_val[2] = _kortestz_mask8_u8( k_mask[4], k_mask[4] );
 
                 // In case of having values greater than thresh_big
@@ -509,10 +517,10 @@ void bli_dnorm2fv_unb_var1_avx512
                     // are greater than thresh_big
 
                     // Scale the required elements in x_vec[0..3] by scale_smal
-                    temp[0] = _mm512_mask_mul_pd( zero_reg, k_mask[4], scale_big_vec, x_vec[0] );
+                    temp[0].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[4], scale_big_vec.v, x_vec[0].v );
 
                     // Square and add the elements to the accumulators
-                    sum_big_vec[0] = _mm512_fmadd_pd( temp[0], temp[0], sum_big_vec[0] );
+                    sum_big_vec[0].v = _mm512_fmadd_pd( temp[0].v, temp[0].v, sum_big_vec[0].v );
                 }
                 else if( !isbig )
                 {
@@ -521,10 +529,10 @@ void bli_dnorm2fv_unb_var1_avx512
                     // are lesser than thresh_sml, if needed
 
                     // Scale the required elements in x_vec[0..3] by scale_smal
-                    temp[0] = _mm512_mask_mul_pd( zero_reg, k_mask[0], scale_sml_vec, x_vec[0] );
+                    temp[0].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[0], scale_sml_vec.v, x_vec[0].v );
 
                     // Square and add the elements to the accumulators
-                    sum_sml_vec[0] = _mm512_fmadd_pd( temp[0], temp[0], sum_sml_vec[0] );
+                    sum_sml_vec[0].v = _mm512_fmadd_pd( temp[0].v, temp[0].v, sum_sml_vec[0].v );
                 }
             }
 
@@ -533,22 +541,22 @@ void bli_dnorm2fv_unb_var1_avx512
         }
         if( i < n )
         {
-            // Set temp[0] to zero
-            temp[0] = _mm512_setzero_pd();
+            // Set temp[0].v to zero
+            temp[0].v = _mm512_setzero_pd();
 
             // Setting the mask to load
             k_mask[0] = ( 1 << ( n - i ) ) - 1;
 
             // Loading the vectors
-            x_vec[0] = _mm512_maskz_loadu_pd( k_mask[0], xt );
+            x_vec[0].v = _mm512_maskz_loadu_pd( k_mask[0], xt );
 
             // Comparing to check for NaN
             // Bits in the mask are set if NaN is encountered
-            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0], x_vec[0], _CMP_UNORD_Q );
+            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0].v, x_vec[0].v, _CMP_UNORD_Q );
 
             // Checking if any bit in the masks are set
             // The truth_val is set to 0 if any bit in the mask is 1
-            // Thus, truth_val[0] = 0 if x_vec[0] or x_vec[1] has NaN
+            // Thus, truth_val[0] = 0 if x_vec[0].v or x_vec[1].v has NaN
             truth_val[0] = _kortestz_mask8_u8( k_mask[0], k_mask[0] );
 
             // Set norm to NaN and return early, if either truth_val[0] or truth_val[1] is set to 0
@@ -561,15 +569,15 @@ void bli_dnorm2fv_unb_var1_avx512
             }
 
             // Getting the absoulte values of elements in the vectors
-            x_vec[0] = _mm512_abs_pd( x_vec[0] );
+            x_vec[0].v = _mm512_abs_pd( x_vec[0].v );
 
-            // Setting the masks by comparing with thresh_sml_vec
-            // That is, k_mask[0][i] = 1 if x_vec[0][i] > thresh_sml_vec
-            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0], thresh_sml_vec, _CMP_GT_OS );
+            // Setting the masks by comparing with thresh_sml_vec.v
+            // That is, k_mask[0][i] = 1 if x_vec[0].v[i] > thresh_sml_vec.v
+            k_mask[0] = _mm512_cmp_pd_mask( x_vec[0].v, thresh_sml_vec.v, _CMP_GT_OS );
 
-            // Setting the masks by comparing with thresh_big_vec
-            // That is, k_mask[4][i] = 1 if x_vec[0][i] < thresh_big_vec
-            k_mask[4] = _mm512_cmp_pd_mask( x_vec[0], thresh_big_vec, _CMP_LT_OS );
+            // Setting the masks by comparing with thresh_big_vec.v
+            // That is, k_mask[4][i] = 1 if x_vec[0].v[i] < thresh_big_vec.v
+            k_mask[4] = _mm512_cmp_pd_mask( x_vec[0].v, thresh_big_vec.v, _CMP_LT_OS );
 
             // Setting the masks to filter only the elements within the thresholds
             // k_mask[0] contain masks for elements > thresh_sml
@@ -580,12 +588,12 @@ void bli_dnorm2fv_unb_var1_avx512
             // Setting booleans to check for underflow/overflow handling
             // In case of having values outside threshold, the associated
             // bit in k_mask[4] is 0.
-            // Thus, truth_val[0] = 0 if x_vec[0] has elements outside thresholds
+            // Thus, truth_val[0] = 0 if x_vec[0].v has elements outside thresholds
             truth_val[0] = _kortestc_mask8_u8( k_mask[4], k_mask[4] );
 
             // Computing using masked fmadds, that carries over values from
             // accumulator register if the mask bit is 0
-            sum_med_vec[0] = _mm512_mask3_fmadd_pd( x_vec[0], x_vec[0], sum_med_vec[0], k_mask[4] );
+            sum_med_vec[0].v = _mm512_mask3_fmadd_pd( x_vec[0].v, x_vec[0].v, sum_med_vec[0].v, k_mask[4] );
 
             // In case of having elements outside the threshold
             if( !truth_val[0] )
@@ -595,18 +603,18 @@ void bli_dnorm2fv_unb_var1_avx512
                 // k_mask[0 ... 1] contain masks for elements > thresh_sml. This would
                 // include both elements < thresh_big and >= thresh_big
                 // XOR on these will produce masks for elements >= thresh_big
-                // That is, k_mask[4][i] = 1 if x_vec[0][i] >= thresh_big_vec
-                //          k_mask[5][i] = 1 if x_vec[1][i] >= thresh_big_vec
+                // That is, k_mask[4][i] = 1 if x_vec[0].v[i] >= thresh_big_vec.v
+                //          k_mask[5][i] = 1 if x_vec[1].v[i] >= thresh_big_vec.v
                 k_mask[4] = _kxor_mask8( k_mask[0], k_mask[4] );
 
                 // Inverting k_mask[0 ... 1], to obtain masks for elements <= thresh_sml
-                // That is, k_mask[0][i] = 1 if x_vec[0][i] <= thresh_sml_vec
-                //          k_mask[1][i] = 1 if x_vec[1][i] <= thresh_sml_vec
+                // That is, k_mask[0][i] = 1 if x_vec[0].v[i] <= thresh_sml_vec.v
+                //          k_mask[1][i] = 1 if x_vec[1].v[i] <= thresh_sml_vec.v
                 k_mask[0] = _knot_mask8( k_mask[0] );
 
                 // Checking whether we have values greater than thresh_big
                 // The truth_val is set to 0 if any bit in the mask is 1
-                // Thus, truth_val[2] = 0 if x_vec[0] or x_vec[1] has elements >= thresh_big_vec
+                // Thus, truth_val[2] = 0 if x_vec[0].v or x_vec[1].v has elements >= thresh_big_vec.v
                 truth_val[2] = _kortestz_mask8_u8( k_mask[4], k_mask[4] );
 
                 // In case of having values greater than thresh_big
@@ -620,10 +628,10 @@ void bli_dnorm2fv_unb_var1_avx512
                     // are greater than thresh_big
 
                     // Scale the required elements in x_vec[0..3] by scale_smal
-                    temp[0] = _mm512_mask_mul_pd( zero_reg, k_mask[4], scale_big_vec, x_vec[0] );
+                    temp[0].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[4], scale_big_vec.v, x_vec[0].v );
 
                     // Square and add the elements to the accumulators
-                    sum_big_vec[0] = _mm512_fmadd_pd( temp[0], temp[0], sum_big_vec[0] );
+                    sum_big_vec[0].v = _mm512_fmadd_pd( temp[0].v, temp[0].v, sum_big_vec[0].v );
                 }
                 else if( !isbig )
                 {
@@ -632,32 +640,35 @@ void bli_dnorm2fv_unb_var1_avx512
                     // are lesser than thresh_sml, if needed
 
                     // Scale the required elements in x_vec[0..3] by scale_smal
-                    temp[0] = _mm512_mask_mul_pd( zero_reg, k_mask[0], scale_sml_vec, x_vec[0] );
+                    temp[0].v = _mm512_mask_mul_pd( zero_reg.v, k_mask[0], scale_sml_vec.v, x_vec[0].v );
 
                     // Square and add the elements to the accumulators
-                    sum_sml_vec[0] = _mm512_fmadd_pd( temp[0], temp[0], sum_sml_vec[0] );
+                    sum_sml_vec[0].v = _mm512_fmadd_pd( temp[0].v, temp[0].v, sum_sml_vec[0].v );
                 }
             }
         }
 
         // Reduction step
         // Combining the results of accumulators for each category
-        sum_med_vec[0] = _mm512_add_pd( sum_med_vec[0], sum_med_vec[1] );
-        sum_med_vec[2] = _mm512_add_pd( sum_med_vec[2], sum_med_vec[3] );
-        sum_med_vec[0] = _mm512_add_pd( sum_med_vec[0], sum_med_vec[2] );
+        sum_med_vec[0].v = _mm512_add_pd( sum_med_vec[0].v, sum_med_vec[1].v );
+        sum_med_vec[2].v = _mm512_add_pd( sum_med_vec[2].v, sum_med_vec[3].v );
+        sum_med_vec[0].v = _mm512_add_pd( sum_med_vec[0].v, sum_med_vec[2].v );
 
-        sum_big_vec[0] = _mm512_add_pd( sum_big_vec[0], sum_big_vec[1] );
-        sum_big_vec[2] = _mm512_add_pd( sum_big_vec[2], sum_big_vec[3] );
-        sum_big_vec[0] = _mm512_add_pd( sum_big_vec[0], sum_big_vec[2] );
+        sum_big_vec[0].v = _mm512_add_pd( sum_big_vec[0].v, sum_big_vec[1].v );
+        sum_big_vec[2].v = _mm512_add_pd( sum_big_vec[2].v, sum_big_vec[3].v );
+        sum_big_vec[0].v = _mm512_add_pd( sum_big_vec[0].v, sum_big_vec[2].v );
 
-        sum_sml_vec[0] = _mm512_add_pd( sum_sml_vec[0], sum_sml_vec[1] );
-        sum_sml_vec[2] = _mm512_add_pd( sum_sml_vec[2], sum_sml_vec[3] );
-        sum_sml_vec[0] = _mm512_add_pd( sum_sml_vec[0], sum_sml_vec[2] );
+        sum_sml_vec[0].v = _mm512_add_pd( sum_sml_vec[0].v, sum_sml_vec[1].v );
+        sum_sml_vec[2].v = _mm512_add_pd( sum_sml_vec[2].v, sum_sml_vec[3].v );
+        sum_sml_vec[0].v = _mm512_add_pd( sum_sml_vec[0].v, sum_sml_vec[2].v );
 
         // Final accumulation on the scalars
-        sum_sml += _mm512_reduce_add_pd( sum_sml_vec[0] );
-        sum_med += _mm512_reduce_add_pd( sum_med_vec[0] );
-        sum_big += _mm512_reduce_add_pd( sum_big_vec[0] );
+        sum_sml += sum_sml_vec[0].d[0] + sum_sml_vec[0].d[1] + sum_sml_vec[0].d[2] + sum_sml_vec[0].d[3]
+                 + sum_sml_vec[0].d[4] + sum_sml_vec[0].d[5] + sum_sml_vec[0].d[6] + sum_sml_vec[0].d[7]; 
+        sum_med += sum_med_vec[0].d[0] + sum_med_vec[0].d[1] + sum_med_vec[0].d[2] + sum_med_vec[0].d[3]
+                 + sum_med_vec[0].d[4] + sum_med_vec[0].d[5] + sum_med_vec[0].d[6] + sum_med_vec[0].d[7]; 
+        sum_big += sum_big_vec[0].d[0] + sum_big_vec[0].d[1] + sum_big_vec[0].d[2] + sum_big_vec[0].d[3]
+                 + sum_big_vec[0].d[4] + sum_big_vec[0].d[5] + sum_big_vec[0].d[6] + sum_big_vec[0].d[7]; 
     }
     // Dealing with non-unit strided inputs
     else