From 1416ab6b5ed2f8bdcbfb11f9d1a47a07b22ed9f9 Mon Sep 17 00:00:00 2001
From: Eleni Vlachopoulou <evlachop@amd.com>
Date: Sat, 4 Mar 2023 00:01:38 +0530
Subject: [PATCH] Bugfix on the memory access for nrm2 AVX2 implementations.

- Since the definition of negative increments is different between BLAS and BLIS,
  there was bug on how the memory was accessed when we were copying the elements
  of a vector with negative increments. Updated the code under the assumption that
  when negative increments are set, the vector is being accessed starting from the end.
  For the BLAS interface, there is an intermediate conversion before calling into the blis layer.

Change-Id: I08343472b418733fad6f7add9e90aa96cdf68285
AMD-Internal: [SWLCSG-1900]
---
 kernels/zen/1/bli_norm2_zen_int.c | 6106 ++++++++++++++---------------
 1 file changed, 3039 insertions(+), 3067 deletions(-)

diff --git a/kernels/zen/1/bli_norm2_zen_int.c b/kernels/zen/1/bli_norm2_zen_int.c
index ad61f8c3b..90fc9aee2 100644
--- a/kernels/zen/1/bli_norm2_zen_int.c
+++ b/kernels/zen/1/bli_norm2_zen_int.c
@@ -1,3067 +1,3039 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021 - 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-#include "immintrin.h"
-#include "blis.h"
-
-// Union data structure to access AVX registers
-// One 256-bit AVX register holds 8 SP elements. 
-typedef union
-{
-    __m256  v;
-    float   f[8] __attribute__( ( aligned( 64 ) ) );
-} v8sf_t;
-
-// Union data structure to access AVX registers
-// One 256-bit AVX register holds 4 DP elements. 
-typedef union
-{
-    __m256d v;
-    double  d[4] __attribute__( ( aligned( 64 ) ) );
-} v4df_t;
-
-// Return a mask which indicates either:
-// v <= t or v >= T
-#define CMP256_sf( v, t, T ) \
-	_mm256_or_ps( _mm256_cmp_ps( v, t, _CMP_LE_OS ), _mm256_cmp_ps( v, T, _CMP_GE_OS ) );
-
-#define CMP256_df( v, t, T ) \
-	_mm256_or_pd( _mm256_cmp_pd( v, t, _CMP_LE_OS ), _mm256_cmp_pd( v, T, _CMP_GE_OS ) );
-
-// Returns true if any of the values in the mask vector a is true, 
-// and false, otherwise.
-// In more detail, __mm256_testz_ps() performs the bitwise (a AND b) operation and returns:
-//    1 if the sign bit of all bitwise operations is 0,
-//    0 if at least one of the sign bits of each bitwise operation is 1.
-// The sign bit of (a AND a) will be 1 iff the sign bit of a is 1, and 0 otherwise.
-// That means that __mm256_testz_ps(a,a) returns:
-//    1 if the sign bit of all elements in a is 0,
-//    0 if at least one of the sign bits of a is 1.
-// Because of the negation, bli_horizontal_or_sf() returns:
-//    0 if the sign bit of all elements in a is 0,
-//    1 if at least one of the sign bits of a is 1. 
-// Since a is the result of a masking operation, bli_horizontal_or_sf() returns:
-//    0 (false) if the mask is false for all elements in a,
-//    1 (true)  if the mask is true for at least one element in a.
-static inline bool bli_horizontal_or_sf( __m256  a ) { return ! _mm256_testz_ps( a, a ); }
-static inline bool bli_horizontal_or_df( __m256d a ) { return ! _mm256_testz_pd( a, a ); }
-
-float horizontal_add_sf(__m256 const a) {
-    __m256 t1 = _mm256_hadd_ps(a, a);
-    __m256 t2 = _mm256_hadd_ps(t1,t1);
-    __m128 t3 = _mm256_extractf128_ps(t2,1);
-    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2),t3);
-    return _mm_cvtss_f32(t4); // sign extend to 32 bits
-}
-
-// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
-void bli_snorm2fv_unb_var1_avx2
-    (
-       dim_t    n,
-       float*   x, inc_t incx,
-       float* norm,
-       cntx_t*  cntx
-    )
-{
-    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
-    
-    float sumsq = 0.0f;
-    dim_t i = 0;
-    dim_t n_remainder = 0;
-    float  *x_buf = x;
-
-    // Memory pool declarations for packing vector X.
-    // Initialize mem pool buffer to NULL and size to 0.
-    // "buf" and "size" fields are assigned once memory
-    // is allocated from the pool in bli_membrk_acquire_m().
-    // This will ensure bli_mem_is_alloc() will be passed on
-    // an allocated memory if created or a NULL.
-    mem_t   mem_bufX = {0};
-    rntm_t  rntm;
-
-    // Packing for non-unit strided vector x.
-    if ( incx != 1 )
-    {
-        // In order to get the buffer from pool via rntm access to memory broker
-        //is needed. Following are initializations for rntm.
-        bli_rntm_init_from_global( &rntm );
-        bli_rntm_set_num_threads_only( 1, &rntm );
-        bli_membrk_rntm_set_membrk( &rntm );
-
-        // Calculate the size required for "n" float elements in vector x.
-        size_t buffer_size = n * sizeof( float );
-
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_snorm2fv_unb_var1_avx2(): get mem pool block\n" );
-        #endif
-
-        // Acquire a Buffer(n*size(float)) from the memory broker
-        // and save the associated mem_t entry to mem_bufX.
-        bli_membrk_acquire_m
-        (
-            &rntm,
-            buffer_size,
-            BLIS_BUFFER_FOR_B_PANEL,
-            &mem_bufX
-        );
-
-        // Continue packing X if buffer memory is allocated.
-        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
-        {
-            x_buf = bli_mem_buffer( &mem_bufX );
-            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
-            for ( dim_t x_index = 0; x_index < n; x_index++ )
-            {
-                if ( incx > 0 )
-                {
-                    *( x_buf + x_index ) = *( x + ( x_index * incx ) );
-                }
-                else
-                {
-                    *( x_buf + x_index ) =  *( x + ( - ( n - x_index - 1 ) * incx ) );
-                }
-            }
-        }
-    }
-
-    float *xt = x_buf;
-
-    // Compute the sum of squares on 3 accumulators to avoid overflow
-    // and underflow, depending on the vector element value.
-    // Accumulator for small values; using scaling to avoid underflow.
-    float sum_sml = 0.0f;
-    // Accumulator for medium values; no scaling required.
-    float sum_med = 0.0f;
-    // Accumulator for big values; using scaling to avoid overflow.
-    float sum_big = 0.0f;
-
-    // Constants chosen to minimize roundoff, according to Blue's algorithm.
-    const float thres_sml = powf( ( float )FLT_RADIX,    ceilf( ( FLT_MIN_EXP - 1 )  * 0.5f ) );
-    const float thres_big = powf( ( float )FLT_RADIX,   floorf( ( FLT_MAX_EXP - 23)  * 0.5f ) );
-    const float scale_sml = powf( ( float )FLT_RADIX, - floorf( ( FLT_MIN_EXP - 24 ) * 0.5f ) );
-    const float scale_big = powf( ( float )FLT_RADIX,  - ceilf( ( FLT_MAX_EXP + 23 ) * 0.5f ) );
-
-    float scale = 1.0f;
-    float abs_chi;
-    bool isbig = false;
-
-    if ( n >= 64 )
-    {
-        // Constants used for comparisons.
-        v8sf_t temp, thres_sml_vec, thres_big_vec, zerov;
-        temp.v = _mm256_set1_ps( -0.0f );
-        thres_sml_vec.v = _mm256_set1_ps( thres_sml );
-        thres_big_vec.v = _mm256_set1_ps( thres_big );
-        v8sf_t x0v, x1v, x2v, x3v;
-        v8sf_t y0v, y1v, y2v, y3v;
-        v8sf_t mask_vec0, mask_vec1, mask_vec2, mask_vec3;
-        zerov.v  = _mm256_setzero_ps();
-
-        // Partial sums used for scaling.
-        v8sf_t sum_sml_vec0, sum_sml_vec1, sum_sml_vec2, sum_sml_vec3;
-        sum_sml_vec0.v = _mm256_setzero_ps();
-        sum_sml_vec1.v = _mm256_setzero_ps();
-        sum_sml_vec2.v = _mm256_setzero_ps();
-        sum_sml_vec3.v = _mm256_setzero_ps();
-
-        v8sf_t sum_med_vec0, sum_med_vec1, sum_med_vec2, sum_med_vec3;
-        sum_med_vec0.v = _mm256_setzero_ps();
-        sum_med_vec1.v = _mm256_setzero_ps();
-        sum_med_vec2.v = _mm256_setzero_ps();
-        sum_med_vec3.v = _mm256_setzero_ps();
-
-        v8sf_t sum_big_vec0, sum_big_vec1, sum_big_vec2, sum_big_vec3;
-        sum_big_vec0.v = _mm256_setzero_ps();
-        sum_big_vec1.v = _mm256_setzero_ps();
-        sum_big_vec2.v = _mm256_setzero_ps();
-        sum_big_vec3.v = _mm256_setzero_ps();
-
-        for (; ( i + 32 ) <= n; i = i + 32)
-        {
-            x0v.v = _mm256_loadu_ps( xt );
-            x1v.v = _mm256_loadu_ps( xt + 8 );
-            x2v.v = _mm256_loadu_ps( xt + 16 );
-            x3v.v = _mm256_loadu_ps( xt + 24 );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
-            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
-            x3v.v = _mm256_andnot_ps( temp.v, x3v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
-            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
-            mask_vec3.v = _mm256_cmp_ps(x3v.v, x3v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec3.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec3.v = CMP256_sf( x3v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-                
-            }
-            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec1.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec2.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {                    
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec3.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec3.v = _mm256_fmadd_ps( x3v.v, x3v.v, sum_med_vec3.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec3.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
-                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
-                    y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
-                    sum_big_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_big_vec3.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
-                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
-                        y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
-                        sum_sml_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_sml_vec3.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            xt += 32;
-        }
-
-        for (; ( i + 24 ) <= n; i = i + 24)
-        {
-            x0v.v = _mm256_loadu_ps( xt );
-            x1v.v = _mm256_loadu_ps( xt + 8 );
-            x2v.v = _mm256_loadu_ps( xt + 16 );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
-            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
-            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec2.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            
-            xt += 24;
-        }
-
-        for (; ( i + 16 ) <= n; i = i + 16)
-        {
-            x0v.v = _mm256_loadu_ps( xt );
-            x1v.v = _mm256_loadu_ps( xt + 8 );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            xt += 16;
-        }
-        
-        // This seems to not be improving performance.
-        #if 0
-        for (; ( i + 8 ) <= n; i = i + 8)
-        {
-            x0v.v = _mm256_loadu_ps( xt );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            xt += 8;
-        }
-        #endif
-
-        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec1.v );
-        sum_sml_vec2.v = _mm256_add_ps( sum_sml_vec2.v, sum_sml_vec3.v );
-        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec2.v ); 
-        sum_sml = horizontal_add_sf(sum_sml_vec0.v);
-
-        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec1.v );
-        sum_med_vec2.v = _mm256_add_ps( sum_med_vec2.v, sum_med_vec3.v );
-        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec2.v );
-        sum_med = horizontal_add_sf(sum_med_vec0.v);
-
-        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec1.v );
-        sum_big_vec2.v = _mm256_add_ps( sum_big_vec2.v, sum_big_vec3.v );
-        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec2.v );
-        sum_big = horizontal_add_sf(sum_big_vec0.v);
-    }
-
-    n_remainder = n - i;
-    bool hasInf = false;
-
-    if ( ( n_remainder > 0 ) )
-    {
-        // Put first the most likely to happen to avoid evaluations on if statements.
-        for (i = 0; i < n_remainder; i++)
-        {
-            abs_chi = bli_fabs( *xt );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-            xt++;
-        }
-    }
-    // Early return if there is an Inf.
-    if ( hasInf ) 
-    {
-        
-        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-        {
-            #ifdef BLIS_ENABLE_MEM_TRACING
-                printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-            #endif
-            // Return the buffer to pool.
-            bli_membrk_release( &rntm , &mem_bufX );
-        }
-
-        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-        return;
-    }
-    
-    // Combine accumulators.
-    if ( isbig )
-    {
-        // Combine sum_big and sum_med if sum_med > 0.
-        if ( sum_med > 0.0f )
-        {
-            sum_big += ( sum_med * scale_big ) * scale_big;
-        }
-        scale = 1.0f / scale_big;
-        sumsq = sum_big;
-    }
-    else if ( sum_sml > 0.0f )
-    {
-        // Combine sum_med and sum_sml if sum_sml>0.
-        if ( sum_med > 0.0f )
-        {
-            sum_med = sqrtf( sum_med );
-            sum_sml = sqrtf( sum_sml ) / scale_sml;
-            float ymin, ymax;
-            if ( sum_sml > sum_med )
-            {
-                ymin = sum_med;
-                ymax = sum_sml;
-            }
-            else
-            {
-                ymin = sum_sml;
-                ymax = sum_med;
-            }
-            scale = 1.0f;
-            sumsq = ymax * ymax * ( 1.0f + ( ymin / ymax ) * ( ymin / ymax ) );
-        }
-        else
-        {
-            scale = 1.0f / scale_sml;
-            sumsq = sum_sml;
-        }
-    }
-    else
-    {
-        // If all values are mid-range:
-        scale = 1.0f;
-        sumsq = sum_med;
-    }
-
-    *norm = scale * sqrtf( sumsq );
-
-    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-    {
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-        #endif
-        // Return the buffer to pool.
-        bli_membrk_release( &rntm , &mem_bufX );
-    }
-
-    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-
-    return;
-}
-
-// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
-void bli_scnorm2fv_unb_var1_avx2
-    (
-       dim_t    n,
-       scomplex*   x, inc_t incx,
-       float* norm,
-       cntx_t*  cntx
-    )
-{
-    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
-
-    float sumsq = 0.0f;
-    dim_t i = 0;
-    dim_t n_remainder = 0;
-    scomplex  *x_buf = x;
-
-    // Memory pool declarations for packing vector X.
-    // Initialize mem pool buffer to NULL and size to 0.
-    // "buf" and "size" fields are assigned once memory
-    // is allocated from the pool in bli_membrk_acquire_m().
-    // This will ensure bli_mem_is_alloc() will be passed on
-    // an allocated memory if created or a NULL.
-    mem_t   mem_bufX = {0};
-    rntm_t  rntm;
-
-    // Packing for non-unit strided vector x.
-    if ( incx != 1 )
-    {
-        // In order to get the buffer from pool via rntm access to memory broker
-        //is needed. Following are initializations for rntm.
-        bli_rntm_init_from_global( &rntm );
-        bli_rntm_set_num_threads_only( 1, &rntm );
-        bli_membrk_rntm_set_membrk( &rntm );
-
-        // Calculate the size required for "n" scomplex elements in vector x.
-        size_t buffer_size = n * sizeof( scomplex );
-
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_scnorm2fv_unb_var1_avx2(): get mem pool block\n" );
-        #endif
-
-        // Acquire a Buffer(n*size(scomplex)) from the memory broker
-        // and save the associated mem_t entry to mem_bufX.
-        bli_membrk_acquire_m
-        (
-            &rntm,
-            buffer_size,
-            BLIS_BUFFER_FOR_B_PANEL,
-            &mem_bufX
-        );
-
-        // Continue packing X if buffer memory is allocated.
-        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
-        {
-            x_buf = bli_mem_buffer( &mem_bufX );
-            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
-            for ( dim_t x_index = 0; x_index < n; x_index++ )
-            {
-                if ( incx > 0 )
-                {
-                    *( x_buf + x_index ) = *( x + ( x_index * incx ) );
-                }
-                else
-                {
-                    *( x_buf + x_index ) =  *( x + ( - ( n - x_index - 1 ) * incx ) );
-                }
-            }
-        }
-    }
-
-    scomplex *xt = x_buf;
-
-    // Compute the sum of squares on 3 accumulators to avoid overflow
-    // and underflow, depending on the vector element value.
-    // Accumulator for small values; using scaling to avoid underflow.
-    float sum_sml = 0.0f;
-   // Accumulator for medium values; no scaling required.
-    float sum_med = 0.0f;
-    // Accumulator for big values; using scaling to avoid overflow.
-    float sum_big = 0.0f;
-
-    // Constants chosen to minimize roundoff, according to Blue's algorithm.
-    const float thres_sml = powf( ( float )FLT_RADIX,    ceilf( ( FLT_MIN_EXP - 1 )  * 0.5f ) );
-    const float thres_big = powf( ( float )FLT_RADIX,   floorf( ( FLT_MAX_EXP - 23)  * 0.5f ) );
-    const float scale_sml = powf( ( float )FLT_RADIX, - floorf( ( FLT_MIN_EXP - 24 ) * 0.5f ) );
-    const float scale_big = powf( ( float )FLT_RADIX,  - ceilf( ( FLT_MAX_EXP + 23 ) * 0.5f ) );
-
-    float scale = 1.0f;
-    float abs_chi;
-    bool isbig = false;
-
-    if ( n >= 64 )
-    {
-        // Constants used for comparisons.
-        v8sf_t temp, thres_sml_vec, thres_big_vec, zerov;
-        temp.v = _mm256_set1_ps( -0.0f );
-        thres_sml_vec.v = _mm256_set1_ps( thres_sml );
-        thres_big_vec.v = _mm256_set1_ps( thres_big );
-        v8sf_t x0v, x1v, x2v, x3v;
-        v8sf_t y0v, y1v, y2v, y3v;
-        v8sf_t mask_vec0, mask_vec1, mask_vec2, mask_vec3;
-        zerov.v  = _mm256_setzero_ps();
-
-        // Partial sums used for scaling.
-        v8sf_t sum_sml_vec0, sum_sml_vec1, sum_sml_vec2, sum_sml_vec3;
-        sum_sml_vec0.v = _mm256_setzero_ps();
-        sum_sml_vec1.v = _mm256_setzero_ps();
-        sum_sml_vec2.v = _mm256_setzero_ps();
-        sum_sml_vec3.v = _mm256_setzero_ps();
-
-        v8sf_t sum_med_vec0, sum_med_vec1, sum_med_vec2, sum_med_vec3;
-        sum_med_vec0.v = _mm256_setzero_ps();
-        sum_med_vec1.v = _mm256_setzero_ps();
-        sum_med_vec2.v = _mm256_setzero_ps();
-        sum_med_vec3.v = _mm256_setzero_ps();
-
-        v8sf_t sum_big_vec0, sum_big_vec1, sum_big_vec2, sum_big_vec3;
-        sum_big_vec0.v = _mm256_setzero_ps();
-        sum_big_vec1.v = _mm256_setzero_ps();
-        sum_big_vec2.v = _mm256_setzero_ps();
-        sum_big_vec3.v = _mm256_setzero_ps();
-
-        for (; ( i + 16 ) <= n; i = i + 16)
-        {
-            x0v.v = _mm256_loadu_ps( (float*) xt );
-            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
-            x2v.v = _mm256_loadu_ps( (float*) (xt + 8) );
-            x3v.v = _mm256_loadu_ps( (float*) (xt + 12) );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
-            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
-            x3v.v = _mm256_andnot_ps( temp.v, x3v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
-            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
-            mask_vec3.v = _mm256_cmp_ps(x3v.v, x3v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec3.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec3.v = CMP256_sf( x3v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-                
-            }
-            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec1.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec2.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {                    
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-
-            if ( !bli_horizontal_or_sf( mask_vec3.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec3.v = _mm256_fmadd_ps( x3v.v, x3v.v, sum_med_vec3.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec3.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
-                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
-                    y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
-                    sum_big_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_big_vec3.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
-                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
-                        y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
-                        sum_sml_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_sml_vec3.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            xt += 16;
-        }
-
-        for (; ( i + 12 ) <= n; i = i + 12)
-        {
-            x0v.v = _mm256_loadu_ps( (float*)xt );
-            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
-            x2v.v = _mm256_loadu_ps( (float*) (xt + 8) );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
-            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
-            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec2.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            
-            xt += 12;
-        }
-        
-        for (; ( i + 8 ) <= n; i = i + 8)
-        {
-            x0v.v = _mm256_loadu_ps( (float*)xt );
-            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            xt += 8;
-        }
-        // This seems to not be improving performance.
-        #if 0
-        for (; ( i + 4 ) <= n; i = i + 4)
-        {
-            x0v.v = _mm256_loadu_ps( (float*)xt );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            xt += 4;
-        }
-        #endif
-
-        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec1.v );
-        sum_sml_vec2.v = _mm256_add_ps( sum_sml_vec2.v, sum_sml_vec3.v );
-        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec2.v ); 
-        sum_sml = horizontal_add_sf(sum_sml_vec0.v);
-
-        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec1.v );
-        sum_med_vec2.v = _mm256_add_ps( sum_med_vec2.v, sum_med_vec3.v );
-        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec2.v );
-        sum_med = horizontal_add_sf(sum_med_vec0.v);
-
-        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec1.v );
-        sum_big_vec2.v = _mm256_add_ps( sum_big_vec2.v, sum_big_vec3.v );
-        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec2.v );
-        sum_big = horizontal_add_sf(sum_big_vec0.v);
-    }
-
-    n_remainder = n - i;
-    bool hasInf = false;
-    double chi_r, chi_i;
-    if ( ( n_remainder > 0 ) )
-    {
-        // Put first the most likely to happen to avoid evaluations on if statements.
-        for (i = 0; i < n_remainder; i++)
-        {
-            // Get real and imaginary component of the vector element.            
-            bli_csgets(*xt, chi_r, chi_i);
-            // Start with accumulating the real component of the vector element.
-            abs_chi = bli_fabs( chi_r );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-            // Accumulate the imaginary component of the vector element.
-            abs_chi = bli_fabs( chi_i );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-
-            xt++;
-        }
-    }
-    // Early return if there is an Inf.
-    if ( hasInf ) 
-    {
-        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-        {
-            #ifdef BLIS_ENABLE_MEM_TRACING
-                printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-            #endif
-            // Return the buffer to pool.
-            bli_membrk_release( &rntm , &mem_bufX );
-        }
-
-        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-        return;
-    }
-    
-    // Combine accumulators.
-    if ( isbig )
-    {
-        // Combine sum_big and sum_med if sum_med > 0.
-        if ( sum_med > 0.0f )
-        {
-            sum_big += ( sum_med * scale_big ) * scale_big;
-        }
-        scale = 1.0f / scale_big;
-        sumsq = sum_big;
-    }
-    else if ( sum_sml > 0.0f )
-    {
-        // Combine sum_med and sum_sml if sum_sml>0.
-        if ( sum_med > 0.0f )
-        {
-            sum_med = sqrtf( sum_med );
-            sum_sml = sqrtf( sum_sml ) / scale_sml;
-            float ymin, ymax;
-            if ( sum_sml > sum_med )
-            {
-                ymin = sum_med;
-                ymax = sum_sml;
-            }
-            else
-            {
-                ymin = sum_sml;
-                ymax = sum_med;
-            }
-            scale = 1.0f;
-            sumsq = ymax * ymax * ( 1.0f + ( ymin / ymax ) * ( ymin / ymax ) );
-        }
-        else
-        {
-            scale = 1.0f / scale_sml;
-            sumsq = sum_sml;
-        }
-    }
-    else
-    {
-        // If all values are mid-range:
-        scale = 1.0f;
-        sumsq = sum_med;
-    }
-
-    *norm = scale * sqrtf( sumsq );
-
-    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-    {
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-        #endif
-        // Return the buffer to pool.
-        bli_membrk_release( &rntm , &mem_bufX );
-    }
-
-    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-
-    return;
-}
-
-// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
-void bli_dnorm2fv_unb_var1_avx2
-    (
-       dim_t    n,
-       double*   x, inc_t incx,
-       double* norm,
-       cntx_t*  cntx
-    )
-{
-    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
-
-    double sumsq = 0;
-    dim_t i = 0;
-    dim_t n_remainder = 0;
-    double  *x_buf = x;
-
-    // Early return if n<=0 or incx=0
-    if ( ( n <= 0) || ( incx == 0 ) )
-    {
-        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-        return;
-    }
-
-    // Memory pool declarations for packing vector X.
-    // Initialize mem pool buffer to NULL and size to 0.
-    // "buf" and "size" fields are assigned once memory
-    // is allocated from the pool in bli_membrk_acquire_m().
-    // This will ensure bli_mem_is_alloc() will be passed on
-    // an allocated memory if created or a NULL.
-    mem_t   mem_bufX = {0};
-    rntm_t  rntm;
-
-    // Packing for non-unit strided vector x.
-    if ( incx != 1 )
-    {
-        // In order to get the buffer from pool via rntm access to memory broker
-        //is needed. Following are initializations for rntm.
-        bli_rntm_init_from_global( &rntm );
-        bli_rntm_set_num_threads_only( 1, &rntm );
-        bli_membrk_rntm_set_membrk( &rntm );
-
-        // Calculate the size required for "n" double elements in vector x.
-        size_t buffer_size = n * sizeof( double );
-
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_dnorm2fv_unb_var1(): get mem pool block\n" );
-        #endif
-
-        // Acquire a Buffer(n*size(double)) from the memory broker
-        // and save the associated mem_t entry to mem_bufX.
-        bli_membrk_acquire_m
-        (
-            &rntm,
-            buffer_size,
-            BLIS_BUFFER_FOR_B_PANEL,
-            &mem_bufX
-        );
-
-        // Continue packing X if buffer memory is allocated.
-        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
-        {
-            x_buf = bli_mem_buffer( &mem_bufX );
-            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
-            for ( dim_t x_index = 0; x_index < n; x_index++ )
-            {
-                if ( incx > 0 )
-                {
-                    *( x_buf + x_index ) = *( x + ( x_index * incx ) );
-                }
-                else
-                {
-                    *( x_buf + x_index ) =  *( x + ( - ( n - x_index - 1 ) * incx ) );
-                }
-            }
-        }
-    }
-
-    double *xt = x_buf;
-
-    // Compute the sum of squares on 3 accumulators to avoid overflow
-    // and underflow, depending on the vector element value.
-    // Accumulator for small values; using scaling to avoid underflow.
-    double sum_sml = 0;
-   // Accumulator for medium values; no scaling required.
-    double sum_med = 0;
-    // Accumulator for big values; using scaling to avoid overflow.
-    double sum_big = 0;
-
-    // Constants chosen to minimize roundoff, according to Blue's algorithm.
-    const double thres_sml = pow( ( double )FLT_RADIX,    ceil( ( DBL_MIN_EXP - 1 )  * 0.5 ) );
-    const double thres_big = pow( ( double )FLT_RADIX,   floor( ( DBL_MAX_EXP - 52)  * 0.5 ) );
-    const double scale_sml = pow( ( double )FLT_RADIX, - floor( ( DBL_MIN_EXP - 53 ) * 0.5 ) );
-    const double scale_big = pow( ( double )FLT_RADIX,  - ceil( ( DBL_MAX_EXP + 52 ) * 0.5 ) );
-
-    double scale;
-    double abs_chi;
-    bool isbig = false;
-
-    if ( n > 4 )
-    {
-        // Constants used for comparisons.
-        v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1;
-        temp.v = _mm256_set1_pd( -0.0 );
-        thres_sml_vec.v = _mm256_set1_pd( thres_sml );
-        thres_big_vec.v = _mm256_set1_pd( thres_big );
-        v4df_t x0v, x1v, mask_vec0, mask_vec1;
-        zerov.v  = _mm256_setzero_pd();
-
-        // Partial sums used for scaling.
-        v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1;
-        sum_med_vec0.v = _mm256_setzero_pd();
-        sum_big_vec0.v = _mm256_setzero_pd();
-        sum_sml_vec0.v = _mm256_setzero_pd();
-        sum_med_vec1.v = _mm256_setzero_pd();
-        sum_big_vec1.v = _mm256_setzero_pd();
-        sum_sml_vec1.v = _mm256_setzero_pd();
-
-        for (; ( i + 8 ) <= n; i = i + 8)
-        {
-            x0v.v = _mm256_loadu_pd( xt );
-            x1v.v = _mm256_loadu_pd( xt + 4 );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_pd( temp.v, x1v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_df( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_df( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_df( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); 
-                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-
-            if ( !bli_horizontal_or_df( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_pd( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_df( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v ); 
-                    ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
-                    sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); 
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
-                        ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
-                        sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-
-            xt += 8;
-        }
-
-        for ( ; ( i + 4 ) <= n; i = i + 4 )
-        {
-            x0v.v = _mm256_loadu_pd( xt );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_df( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-            xt += 4;
-        }
-
-        sum_sml_vec0.v = _mm256_add_pd( sum_sml_vec0.v, sum_sml_vec1.v );
-        sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v );
-        sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v );
-
-        sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1]
-                + sum_sml_vec0.v[2] + sum_sml_vec0.v[3];
-        sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1]
-                + sum_med_vec0.v[2] + sum_med_vec0.v[3];
-        sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1]
-                + sum_big_vec0.v[2] + sum_big_vec0.v[3];
-    }
-
-    n_remainder = n - i;
-    bool hasInf = false;
-    if ( ( n_remainder > 0 ) )
-    {
-        // Put first the most likely to happen to avoid evaluations on if statements.
-        for (i = 0; i < n_remainder; i++)
-        {
-            abs_chi = bli_fabs( *xt );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-            xt++;
-        }
-    }
-
-    // Early return if there is an Inf.
-    if ( hasInf ) 
-    {        
-        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-        {
-            #ifdef BLIS_ENABLE_MEM_TRACING
-                printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-            #endif
-            // Return the buffer to pool.
-            bli_membrk_release( &rntm , &mem_bufX );
-        }
-
-        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-        return;
-    }
-
-    // Combine accumulators.
-    if ( isbig )
-    {
-        // Combine sum_big and sum_med if sum_med > 0.
-        if ( sum_med > 0.0 )
-        {
-            sum_big += ( sum_med * scale_big ) * scale_big;
-        }
-        scale = 1.0 / scale_big;
-        sumsq = sum_big;
-    }
-
-    else if ( sum_sml > 0.0 )
-    {
-        // Combine sum_med and sum_sml if sum_sml>0.
-        if ( sum_med > 0.0 )
-        {
-            sum_med = sqrt( sum_med );
-            sum_sml = sqrt( sum_sml ) / scale_sml;
-            double ymin, ymax;
-            if ( sum_sml > sum_med )
-            {
-                ymin = sum_med;
-                ymax = sum_sml;
-            }
-            else
-            {
-                ymin = sum_sml;
-                ymax = sum_med;
-            }
-            scale = 1.0;
-            sumsq = ymax * ymax * ( 1.0 + ( ymin / ymax ) * ( ymin / ymax ) );
-        }
-        else
-        {
-            scale = 1.0 / scale_sml;
-            sumsq = sum_sml;
-        }
-    }
-    else
-    {
-        // If all values are mid-range:
-        scale = 1.0;
-        sumsq = sum_med;
-    }
-
-    *norm = scale * sqrt( sumsq );
-
-    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-    {
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_dnorm2fv_unb_var1(): releasing mem pool block\n" );
-        #endif
-        // Return the buffer to pool.
-        bli_membrk_release( &rntm , &mem_bufX );
-    }
-
-    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-
-    return;
-}
-
-// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
-void bli_dznorm2fv_unb_var1_avx2
-    (
-       dim_t    n,
-       dcomplex*   x, inc_t incx,
-       double* norm,
-       cntx_t*  cntx
-    )
-{
-    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
-
-    double sumsq = 0;
-    dim_t i = 0;
-    dim_t n_remainder = 0;
-    dcomplex  *x_buf = x;
-
-    // Early return if n<=0 or incx=0
-    if ( ( n <= 0) || ( incx == 0 ) )
-    {
-        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-        return;
-    }
-
-    // Memory pool declarations for packing vector X.
-    // Initialize mem pool buffer to NULL and size to 0.
-    // "buf" and "size" fields are assigned once memory
-    // is allocated from the pool in bli_membrk_acquire_m().
-    // This will ensure bli_mem_is_alloc() will be passed on
-    // an allocated memory if created or a NULL.
-    mem_t   mem_bufX = {0};
-    rntm_t  rntm;
-
-    // Packing for non-unit strided vector x.
-    if ( incx != 1 )
-    {
-        // In order to get the buffer from pool via rntm access to memory broker
-        //is needed. Following are initializations for rntm.
-        bli_rntm_init_from_global( &rntm );
-        bli_rntm_set_num_threads_only( 1, &rntm );
-        bli_membrk_rntm_set_membrk( &rntm );
-
-        // Calculate the size required for "n" dcomplex elements in vector x.
-        size_t buffer_size = n * sizeof( dcomplex );
-
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_dznorm2fv_unb_var1(): get mem pool block\n" );
-        #endif
-
-        // Acquire a Buffer(n*size(dcomplex)) from the memory broker
-        // and save the associated mem_t entry to mem_bufX.
-        bli_membrk_acquire_m
-        (
-            &rntm,
-            buffer_size,
-            BLIS_BUFFER_FOR_B_PANEL,
-            &mem_bufX
-        );
-
-        // Continue packing X if buffer memory is allocated.
-        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
-        {
-            x_buf = bli_mem_buffer( &mem_bufX );
-            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
-            for ( dim_t x_index = 0; x_index < n; x_index++ )
-            {
-                if ( incx > 0 )
-                {
-                    *( x_buf + x_index ) = *( x + ( x_index * incx ) );
-                }
-                else
-                {
-                    *( x_buf + x_index ) =  *( x + ( - ( n - x_index - 1 ) * incx ) );
-                }
-            }
-        }
-    }
-
-    dcomplex *xt = x_buf;
-
-    // Compute the sum of squares on 3 accumulators to avoid overflow
-    // and underflow, depending on the vector element value.
-    // Accumulator for small values; using scaling to avoid underflow.
-    double sum_sml = 0;
-   // Accumulator for medium values; no scaling required.
-    double sum_med = 0;
-    // Accumulator for big values; using scaling to avoid overflow.
-    double sum_big = 0;
-
-    // Constants chosen to minimize roundoff, according to Blue's algorithm.
-    const double thres_sml = pow( ( double )FLT_RADIX,    ceil( ( DBL_MIN_EXP - 1 )  * 0.5 ) );
-    const double thres_big = pow( ( double )FLT_RADIX,   floor( ( DBL_MAX_EXP - 52)  * 0.5 ) );
-    const double scale_sml = pow( ( double )FLT_RADIX, - floor( ( DBL_MIN_EXP - 53 ) * 0.5 ) );
-    const double scale_big = pow( ( double )FLT_RADIX,  - ceil( ( DBL_MAX_EXP + 52 ) * 0.5 ) );
-
-    double scale;
-    double abs_chi;
-    bool isbig = false;
-
-    if ( n > 2 )
-    {
-        // Constants used for comparisons.
-        v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1;
-        temp.v = _mm256_set1_pd( -0.0 );
-        thres_sml_vec.v = _mm256_set1_pd( thres_sml );
-        thres_big_vec.v = _mm256_set1_pd( thres_big );
-        v4df_t x0v, x1v, mask_vec0, mask_vec1;
-        zerov.v  = _mm256_setzero_pd();
-
-        // Partial sums used for scaling.
-        v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1;
-        sum_med_vec0.v = _mm256_setzero_pd();
-        sum_big_vec0.v = _mm256_setzero_pd();
-        sum_sml_vec0.v = _mm256_setzero_pd();
-        sum_med_vec1.v = _mm256_setzero_pd();
-        sum_big_vec1.v = _mm256_setzero_pd();
-        sum_sml_vec1.v = _mm256_setzero_pd();
-
-        for (; ( i + 4 ) <= n; i = i + 4)
-        {
-            x0v.v = _mm256_loadu_pd( (double*) xt );
-            x1v.v = _mm256_loadu_pd( (double*) (xt + 2) );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_pd( temp.v, x1v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_df( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_df( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_df( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-
-            if ( !bli_horizontal_or_df( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_pd( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_df( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
-                    ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
-                    sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); 
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
-                        ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
-                        sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-
-            xt += 4;
-        }
-
-        for ( ; ( i + 2 ) <= n; i = i + 2 )
-        {
-            x0v.v = _mm256_loadu_pd( (double*) xt );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_df( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-            xt += 2;
-        }
-
-        sum_sml_vec0.v = _mm256_add_pd( sum_sml_vec0.v, sum_sml_vec1.v );
-        sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v );
-        sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v );
-
-        sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1]
-                + sum_sml_vec0.v[2] + sum_sml_vec0.v[3];
-        sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1]
-                + sum_med_vec0.v[2] + sum_med_vec0.v[3];
-        sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1]
-                + sum_big_vec0.v[2] + sum_big_vec0.v[3];
-    }
-
-    n_remainder = n - i;
-    bool hasInf = false;
-    double chi_r, chi_i;
-    if ( ( n_remainder > 0 ) )
-    {
-        // Put first the most likely to happen to avoid evaluations on if statements.
-        for (i = 0; i < n_remainder; i++)
-        {
-            // Get real and imaginary component of the vector element.
-            bli_zdgets(*xt, chi_r, chi_i);
-
-            // Start with accumulating the real component of the vector element.
-            abs_chi = bli_fabs( chi_r );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if ( ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-
-            // Accumulate the imaginary component of the vector element.
-            abs_chi = bli_fabs( chi_i );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if ( ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-
-            xt++;
-        }
-    }
-
-    // Early return if there is an Inf.
-    if ( hasInf ) 
-    {        
-        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-        {
-            #ifdef BLIS_ENABLE_MEM_TRACING
-                printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-            #endif
-            // Return the buffer to pool.
-            bli_membrk_release( &rntm , &mem_bufX );
-        }
-
-        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-        return;
-    }
-
-    // Combine accumulators.
-    if ( isbig )
-    {
-        // Combine sum_big and sum_med if sum_med > 0.
-        if ( sum_med > 0.0 )
-        {
-            sum_big += ( sum_med * scale_big ) * scale_big;
-        }
-        scale = 1.0 / scale_big;
-        sumsq = sum_big;
-    }
-
-    else if ( sum_sml > 0.0 )
-    {
-        // Combine sum_med and sum_sml if sum_sml>0.
-        if ( sum_med > 0.0 )
-        {
-            sum_med = sqrt( sum_med );
-            sum_sml = sqrt( sum_sml ) / scale_sml;
-            double ymin, ymax;
-            if ( sum_sml > sum_med )
-            {
-                ymin = sum_med;
-                ymax = sum_sml;
-            }
-            else
-            {
-                ymin = sum_sml;
-                ymax = sum_med;
-            }
-            scale = 1.0;
-            sumsq = ymax * ymax * ( 1.0 + ( ymin / ymax ) * ( ymin / ymax ) );
-        }
-        else
-        {
-            scale = 1.0 / scale_sml;
-            sumsq = sum_sml;
-        }
-    }
-    else
-    {
-        // If all values are mid-range:
-        scale = 1.0;
-        sumsq = sum_med;
-    }
-
-    *norm = scale * sqrt( sumsq );
-
-    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-    {
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_dznorm2fv_unb_var1(): releasing mem pool block\n" );
-        #endif
-        // Return the buffer to pool.
-        bli_membrk_release( &rntm , &mem_bufX );
-    }
-
-    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-
-    return;
-}
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "immintrin.h"
+#include "blis.h"
+
+// Union data structure to access AVX registers
+// One 256-bit AVX register holds 8 SP elements. 
+typedef union
+{
+    __m256  v;
+    float   f[8] __attribute__( ( aligned( 64 ) ) );
+} v8sf_t;
+
+// Union data structure to access AVX registers
+// One 256-bit AVX register holds 4 DP elements. 
+typedef union
+{
+    __m256d v;
+    double  d[4] __attribute__( ( aligned( 64 ) ) );
+} v4df_t;
+
+// Return a mask which indicates either:
+// v <= t or v >= T
+#define CMP256_sf( v, t, T ) \
+	_mm256_or_ps( _mm256_cmp_ps( v, t, _CMP_LE_OS ), _mm256_cmp_ps( v, T, _CMP_GE_OS ) );
+
+#define CMP256_df( v, t, T ) \
+	_mm256_or_pd( _mm256_cmp_pd( v, t, _CMP_LE_OS ), _mm256_cmp_pd( v, T, _CMP_GE_OS ) );
+
+// Returns true if any of the values in the mask vector a is true, 
+// and false, otherwise.
+// In more detail, __mm256_testz_ps() performs the bitwise (a AND b) operation and returns:
+//    1 if the sign bit of all bitwise operations is 0,
+//    0 if at least one of the sign bits of each bitwise operation is 1.
+// The sign bit of (a AND a) will be 1 iff the sign bit of a is 1, and 0 otherwise.
+// That means that __mm256_testz_ps(a,a) returns:
+//    1 if the sign bit of all elements in a is 0,
+//    0 if at least one of the sign bits of a is 1.
+// Because of the negation, bli_horizontal_or_sf() returns:
+//    0 if the sign bit of all elements in a is 0,
+//    1 if at least one of the sign bits of a is 1. 
+// Since a is the result of a masking operation, bli_horizontal_or_sf() returns:
+//    0 (false) if the mask is false for all elements in a,
+//    1 (true)  if the mask is true for at least one element in a.
+static inline bool bli_horizontal_or_sf( __m256  a ) { return ! _mm256_testz_ps( a, a ); }
+static inline bool bli_horizontal_or_df( __m256d a ) { return ! _mm256_testz_pd( a, a ); }
+
+float horizontal_add_sf(__m256 const a) {
+    __m256 t1 = _mm256_hadd_ps(a, a);
+    __m256 t2 = _mm256_hadd_ps(t1,t1);
+    __m128 t3 = _mm256_extractf128_ps(t2,1);
+    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2),t3);
+    return _mm_cvtss_f32(t4); // sign extend to 32 bits
+}
+
+// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
+void bli_snorm2fv_unb_var1_avx2
+    (
+       dim_t    n,
+       float*   x, inc_t incx,
+       float* norm,
+       cntx_t*  cntx
+    )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
+    
+    float sumsq = 0.0f;
+    dim_t i = 0;
+    dim_t n_remainder = 0;
+    float  *x_buf = x;
+
+    // Memory pool declarations for packing vector X.
+    // Initialize mem pool buffer to NULL and size to 0.
+    // "buf" and "size" fields are assigned once memory
+    // is allocated from the pool in bli_membrk_acquire_m().
+    // This will ensure bli_mem_is_alloc() will be passed on
+    // an allocated memory if created or a NULL.
+    mem_t   mem_bufX = {0};
+    rntm_t  rntm;
+
+    // Packing for non-unit strided vector x.
+    if ( incx != 1 )
+    {
+        // In order to get the buffer from pool via rntm access to memory broker
+        //is needed. Following are initializations for rntm.
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Calculate the size required for "n" float elements in vector x.
+        size_t buffer_size = n * sizeof( float );
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_snorm2fv_unb_var1_avx2(): get mem pool block\n" );
+        #endif
+
+        // Acquire a Buffer(n*size(float)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_membrk_acquire_m
+        (
+            &rntm,
+            buffer_size,
+            BLIS_BUFFER_FOR_B_PANEL,
+            &mem_bufX
+        );
+
+        // Continue packing X if buffer memory is allocated.
+        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
+        {
+            x_buf = bli_mem_buffer( &mem_bufX );
+            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
+            for ( dim_t x_index = 0; x_index < n; x_index++ )
+            {
+                *( x_buf + x_index ) = *( x + ( x_index * incx ) );
+            }
+        }
+    }
+
+    float *xt = x_buf;
+
+    // Compute the sum of squares on 3 accumulators to avoid overflow
+    // and underflow, depending on the vector element value.
+    // Accumulator for small values; using scaling to avoid underflow.
+    float sum_sml = 0.0f;
+    // Accumulator for medium values; no scaling required.
+    float sum_med = 0.0f;
+    // Accumulator for big values; using scaling to avoid overflow.
+    float sum_big = 0.0f;
+
+    // Constants chosen to minimize roundoff, according to Blue's algorithm.
+    const float thres_sml = powf( ( float )FLT_RADIX,    ceilf( ( FLT_MIN_EXP - 1 )  * 0.5f ) );
+    const float thres_big = powf( ( float )FLT_RADIX,   floorf( ( FLT_MAX_EXP - 23)  * 0.5f ) );
+    const float scale_sml = powf( ( float )FLT_RADIX, - floorf( ( FLT_MIN_EXP - 24 ) * 0.5f ) );
+    const float scale_big = powf( ( float )FLT_RADIX,  - ceilf( ( FLT_MAX_EXP + 23 ) * 0.5f ) );
+
+    float scale = 1.0f;
+    float abs_chi;
+    bool isbig = false;
+
+    if ( n >= 64 )
+    {
+        // Constants used for comparisons.
+        v8sf_t temp, thres_sml_vec, thres_big_vec, zerov;
+        temp.v = _mm256_set1_ps( -0.0f );
+        thres_sml_vec.v = _mm256_set1_ps( thres_sml );
+        thres_big_vec.v = _mm256_set1_ps( thres_big );
+        v8sf_t x0v, x1v, x2v, x3v;
+        v8sf_t y0v, y1v, y2v, y3v;
+        v8sf_t mask_vec0, mask_vec1, mask_vec2, mask_vec3;
+        zerov.v  = _mm256_setzero_ps();
+
+        // Partial sums used for scaling.
+        v8sf_t sum_sml_vec0, sum_sml_vec1, sum_sml_vec2, sum_sml_vec3;
+        sum_sml_vec0.v = _mm256_setzero_ps();
+        sum_sml_vec1.v = _mm256_setzero_ps();
+        sum_sml_vec2.v = _mm256_setzero_ps();
+        sum_sml_vec3.v = _mm256_setzero_ps();
+
+        v8sf_t sum_med_vec0, sum_med_vec1, sum_med_vec2, sum_med_vec3;
+        sum_med_vec0.v = _mm256_setzero_ps();
+        sum_med_vec1.v = _mm256_setzero_ps();
+        sum_med_vec2.v = _mm256_setzero_ps();
+        sum_med_vec3.v = _mm256_setzero_ps();
+
+        v8sf_t sum_big_vec0, sum_big_vec1, sum_big_vec2, sum_big_vec3;
+        sum_big_vec0.v = _mm256_setzero_ps();
+        sum_big_vec1.v = _mm256_setzero_ps();
+        sum_big_vec2.v = _mm256_setzero_ps();
+        sum_big_vec3.v = _mm256_setzero_ps();
+
+        for (; ( i + 32 ) <= n; i = i + 32)
+        {
+            x0v.v = _mm256_loadu_ps( xt );
+            x1v.v = _mm256_loadu_ps( xt + 8 );
+            x2v.v = _mm256_loadu_ps( xt + 16 );
+            x3v.v = _mm256_loadu_ps( xt + 24 );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
+            x3v.v = _mm256_andnot_ps( temp.v, x3v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
+            mask_vec3.v = _mm256_cmp_ps(x3v.v, x3v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec3.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec3.v = CMP256_sf( x3v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+                
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec2.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {                    
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec3.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec3.v = _mm256_fmadd_ps( x3v.v, x3v.v, sum_med_vec3.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec3.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
+                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
+                    y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
+                    sum_big_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_big_vec3.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
+                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
+                        y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
+                        sum_sml_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_sml_vec3.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 32;
+        }
+
+        for (; ( i + 24 ) <= n; i = i + 24)
+        {
+            x0v.v = _mm256_loadu_ps( xt );
+            x1v.v = _mm256_loadu_ps( xt + 8 );
+            x2v.v = _mm256_loadu_ps( xt + 16 );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec2.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            
+            xt += 24;
+        }
+
+        for (; ( i + 16 ) <= n; i = i + 16)
+        {
+            x0v.v = _mm256_loadu_ps( xt );
+            x1v.v = _mm256_loadu_ps( xt + 8 );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 16;
+        }
+        
+        // This seems to not be improving performance.
+        #if 0
+        for (; ( i + 8 ) <= n; i = i + 8)
+        {
+            x0v.v = _mm256_loadu_ps( xt );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 8;
+        }
+        #endif
+
+        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec1.v );
+        sum_sml_vec2.v = _mm256_add_ps( sum_sml_vec2.v, sum_sml_vec3.v );
+        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec2.v ); 
+        sum_sml = horizontal_add_sf(sum_sml_vec0.v);
+
+        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec1.v );
+        sum_med_vec2.v = _mm256_add_ps( sum_med_vec2.v, sum_med_vec3.v );
+        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec2.v );
+        sum_med = horizontal_add_sf(sum_med_vec0.v);
+
+        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec1.v );
+        sum_big_vec2.v = _mm256_add_ps( sum_big_vec2.v, sum_big_vec3.v );
+        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec2.v );
+        sum_big = horizontal_add_sf(sum_big_vec0.v);
+    }
+
+    n_remainder = n - i;
+    bool hasInf = false;
+
+    if ( ( n_remainder > 0 ) )
+    {
+        // Put first the most likely to happen to avoid evaluations on if statements.
+        for (i = 0; i < n_remainder; i++)
+        {
+            abs_chi = bli_fabs( *xt );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+            xt++;
+        }
+    }
+    // Early return if there is an Inf.
+    if ( hasInf ) 
+    {
+        
+        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+        {
+            #ifdef BLIS_ENABLE_MEM_TRACING
+                printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+            #endif
+            // Return the buffer to pool.
+            bli_membrk_release( &rntm , &mem_bufX );
+        }
+
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+    
+    // Combine accumulators.
+    if ( isbig )
+    {
+        // Combine sum_big and sum_med if sum_med > 0.
+        if ( sum_med > 0.0f )
+        {
+            sum_big += ( sum_med * scale_big ) * scale_big;
+        }
+        scale = 1.0f / scale_big;
+        sumsq = sum_big;
+    }
+    else if ( sum_sml > 0.0f )
+    {
+        // Combine sum_med and sum_sml if sum_sml>0.
+        if ( sum_med > 0.0f )
+        {
+            sum_med = sqrtf( sum_med );
+            sum_sml = sqrtf( sum_sml ) / scale_sml;
+            float ymin, ymax;
+            if ( sum_sml > sum_med )
+            {
+                ymin = sum_med;
+                ymax = sum_sml;
+            }
+            else
+            {
+                ymin = sum_sml;
+                ymax = sum_med;
+            }
+            scale = 1.0f;
+            sumsq = ymax * ymax * ( 1.0f + ( ymin / ymax ) * ( ymin / ymax ) );
+        }
+        else
+        {
+            scale = 1.0f / scale_sml;
+            sumsq = sum_sml;
+        }
+    }
+    else
+    {
+        // If all values are mid-range:
+        scale = 1.0f;
+        sumsq = sum_med;
+    }
+
+    *norm = scale * sqrtf( sumsq );
+
+    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+    {
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+        #endif
+        // Return the buffer to pool.
+        bli_membrk_release( &rntm , &mem_bufX );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+
+    return;
+}
+
+// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
+void bli_scnorm2fv_unb_var1_avx2
+    (
+       dim_t    n,
+       scomplex*   x, inc_t incx,
+       float* norm,
+       cntx_t*  cntx
+    )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
+
+    float sumsq = 0.0f;
+    dim_t i = 0;
+    dim_t n_remainder = 0;
+    scomplex  *x_buf = x;
+
+    // Memory pool declarations for packing vector X.
+    // Initialize mem pool buffer to NULL and size to 0.
+    // "buf" and "size" fields are assigned once memory
+    // is allocated from the pool in bli_membrk_acquire_m().
+    // This will ensure bli_mem_is_alloc() will be passed on
+    // an allocated memory if created or a NULL.
+    mem_t   mem_bufX = {0};
+    rntm_t  rntm;
+
+    // Packing for non-unit strided vector x.
+    if ( incx != 1 )
+    {
+        // In order to get the buffer from pool via rntm access to memory broker
+        //is needed. Following are initializations for rntm.
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Calculate the size required for "n" scomplex elements in vector x.
+        size_t buffer_size = n * sizeof( scomplex );
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_scnorm2fv_unb_var1_avx2(): get mem pool block\n" );
+        #endif
+
+        // Acquire a Buffer(n*size(scomplex)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_membrk_acquire_m
+        (
+            &rntm,
+            buffer_size,
+            BLIS_BUFFER_FOR_B_PANEL,
+            &mem_bufX
+        );
+
+        // Continue packing X if buffer memory is allocated.
+        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
+        {
+            x_buf = bli_mem_buffer( &mem_bufX );
+            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
+            for ( dim_t x_index = 0; x_index < n; x_index++ )
+            {
+                *( x_buf + x_index ) = *( x + ( x_index * incx ) );
+            }
+        }
+    }
+
+    scomplex *xt = x_buf;
+
+    // Compute the sum of squares on 3 accumulators to avoid overflow
+    // and underflow, depending on the vector element value.
+    // Accumulator for small values; using scaling to avoid underflow.
+    float sum_sml = 0.0f;
+   // Accumulator for medium values; no scaling required.
+    float sum_med = 0.0f;
+    // Accumulator for big values; using scaling to avoid overflow.
+    float sum_big = 0.0f;
+
+    // Constants chosen to minimize roundoff, according to Blue's algorithm.
+    const float thres_sml = powf( ( float )FLT_RADIX,    ceilf( ( FLT_MIN_EXP - 1 )  * 0.5f ) );
+    const float thres_big = powf( ( float )FLT_RADIX,   floorf( ( FLT_MAX_EXP - 23)  * 0.5f ) );
+    const float scale_sml = powf( ( float )FLT_RADIX, - floorf( ( FLT_MIN_EXP - 24 ) * 0.5f ) );
+    const float scale_big = powf( ( float )FLT_RADIX,  - ceilf( ( FLT_MAX_EXP + 23 ) * 0.5f ) );
+
+    float scale = 1.0f;
+    float abs_chi;
+    bool isbig = false;
+
+    if ( n >= 64 )
+    {
+        // Constants used for comparisons.
+        v8sf_t temp, thres_sml_vec, thres_big_vec, zerov;
+        temp.v = _mm256_set1_ps( -0.0f );
+        thres_sml_vec.v = _mm256_set1_ps( thres_sml );
+        thres_big_vec.v = _mm256_set1_ps( thres_big );
+        v8sf_t x0v, x1v, x2v, x3v;
+        v8sf_t y0v, y1v, y2v, y3v;
+        v8sf_t mask_vec0, mask_vec1, mask_vec2, mask_vec3;
+        zerov.v  = _mm256_setzero_ps();
+
+        // Partial sums used for scaling.
+        v8sf_t sum_sml_vec0, sum_sml_vec1, sum_sml_vec2, sum_sml_vec3;
+        sum_sml_vec0.v = _mm256_setzero_ps();
+        sum_sml_vec1.v = _mm256_setzero_ps();
+        sum_sml_vec2.v = _mm256_setzero_ps();
+        sum_sml_vec3.v = _mm256_setzero_ps();
+
+        v8sf_t sum_med_vec0, sum_med_vec1, sum_med_vec2, sum_med_vec3;
+        sum_med_vec0.v = _mm256_setzero_ps();
+        sum_med_vec1.v = _mm256_setzero_ps();
+        sum_med_vec2.v = _mm256_setzero_ps();
+        sum_med_vec3.v = _mm256_setzero_ps();
+
+        v8sf_t sum_big_vec0, sum_big_vec1, sum_big_vec2, sum_big_vec3;
+        sum_big_vec0.v = _mm256_setzero_ps();
+        sum_big_vec1.v = _mm256_setzero_ps();
+        sum_big_vec2.v = _mm256_setzero_ps();
+        sum_big_vec3.v = _mm256_setzero_ps();
+
+        for (; ( i + 16 ) <= n; i = i + 16)
+        {
+            x0v.v = _mm256_loadu_ps( (float*) xt );
+            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
+            x2v.v = _mm256_loadu_ps( (float*) (xt + 8) );
+            x3v.v = _mm256_loadu_ps( (float*) (xt + 12) );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
+            x3v.v = _mm256_andnot_ps( temp.v, x3v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
+            mask_vec3.v = _mm256_cmp_ps(x3v.v, x3v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec3.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec3.v = CMP256_sf( x3v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+                
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec2.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {                    
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+
+            if ( !bli_horizontal_or_sf( mask_vec3.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec3.v = _mm256_fmadd_ps( x3v.v, x3v.v, sum_med_vec3.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec3.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
+                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
+                    y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
+                    sum_big_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_big_vec3.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
+                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
+                        y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
+                        sum_sml_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_sml_vec3.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 16;
+        }
+
+        for (; ( i + 12 ) <= n; i = i + 12)
+        {
+            x0v.v = _mm256_loadu_ps( (float*)xt );
+            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
+            x2v.v = _mm256_loadu_ps( (float*) (xt + 8) );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec2.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            
+            xt += 12;
+        }
+        
+        for (; ( i + 8 ) <= n; i = i + 8)
+        {
+            x0v.v = _mm256_loadu_ps( (float*)xt );
+            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 8;
+        }
+        // This seems to not be improving performance.
+        #if 0
+        for (; ( i + 4 ) <= n; i = i + 4)
+        {
+            x0v.v = _mm256_loadu_ps( (float*)xt );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 4;
+        }
+        #endif
+
+        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec1.v );
+        sum_sml_vec2.v = _mm256_add_ps( sum_sml_vec2.v, sum_sml_vec3.v );
+        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec2.v ); 
+        sum_sml = horizontal_add_sf(sum_sml_vec0.v);
+
+        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec1.v );
+        sum_med_vec2.v = _mm256_add_ps( sum_med_vec2.v, sum_med_vec3.v );
+        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec2.v );
+        sum_med = horizontal_add_sf(sum_med_vec0.v);
+
+        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec1.v );
+        sum_big_vec2.v = _mm256_add_ps( sum_big_vec2.v, sum_big_vec3.v );
+        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec2.v );
+        sum_big = horizontal_add_sf(sum_big_vec0.v);
+    }
+
+    n_remainder = n - i;
+    bool hasInf = false;
+    double chi_r, chi_i;
+    if ( ( n_remainder > 0 ) )
+    {
+        // Put first the most likely to happen to avoid evaluations on if statements.
+        for (i = 0; i < n_remainder; i++)
+        {
+            // Get real and imaginary component of the vector element.            
+            bli_csgets(*xt, chi_r, chi_i);
+            // Start with accumulating the real component of the vector element.
+            abs_chi = bli_fabs( chi_r );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+            // Accumulate the imaginary component of the vector element.
+            abs_chi = bli_fabs( chi_i );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+
+            xt++;
+        }
+    }
+    // Early return if there is an Inf.
+    if ( hasInf ) 
+    {
+        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+        {
+            #ifdef BLIS_ENABLE_MEM_TRACING
+                printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+            #endif
+            // Return the buffer to pool.
+            bli_membrk_release( &rntm , &mem_bufX );
+        }
+
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+    
+    // Combine accumulators.
+    if ( isbig )
+    {
+        // Combine sum_big and sum_med if sum_med > 0.
+        if ( sum_med > 0.0f )
+        {
+            sum_big += ( sum_med * scale_big ) * scale_big;
+        }
+        scale = 1.0f / scale_big;
+        sumsq = sum_big;
+    }
+    else if ( sum_sml > 0.0f )
+    {
+        // Combine sum_med and sum_sml if sum_sml>0.
+        if ( sum_med > 0.0f )
+        {
+            sum_med = sqrtf( sum_med );
+            sum_sml = sqrtf( sum_sml ) / scale_sml;
+            float ymin, ymax;
+            if ( sum_sml > sum_med )
+            {
+                ymin = sum_med;
+                ymax = sum_sml;
+            }
+            else
+            {
+                ymin = sum_sml;
+                ymax = sum_med;
+            }
+            scale = 1.0f;
+            sumsq = ymax * ymax * ( 1.0f + ( ymin / ymax ) * ( ymin / ymax ) );
+        }
+        else
+        {
+            scale = 1.0f / scale_sml;
+            sumsq = sum_sml;
+        }
+    }
+    else
+    {
+        // If all values are mid-range:
+        scale = 1.0f;
+        sumsq = sum_med;
+    }
+
+    *norm = scale * sqrtf( sumsq );
+
+    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+    {
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+        #endif
+        // Return the buffer to pool.
+        bli_membrk_release( &rntm , &mem_bufX );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+
+    return;
+}
+
+// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
+void bli_dnorm2fv_unb_var1_avx2
+    (
+       dim_t    n,
+       double*   x, inc_t incx,
+       double* norm,
+       cntx_t*  cntx
+    )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
+
+    double sumsq = 0;
+    dim_t i = 0;
+    dim_t n_remainder = 0;
+    double  *x_buf = x;
+
+    // Early return if n<=0 or incx=0
+    if ( ( n <= 0) || ( incx == 0 ) )
+    {
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+
+    // Memory pool declarations for packing vector X.
+    // Initialize mem pool buffer to NULL and size to 0.
+    // "buf" and "size" fields are assigned once memory
+    // is allocated from the pool in bli_membrk_acquire_m().
+    // This will ensure bli_mem_is_alloc() will be passed on
+    // an allocated memory if created or a NULL.
+    mem_t   mem_bufX = {0};
+    rntm_t  rntm;
+
+    // Packing for non-unit strided vector x.
+    if ( incx != 1 )
+    {
+        // In order to get the buffer from pool via rntm access to memory broker
+        //is needed. Following are initializations for rntm.
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Calculate the size required for "n" double elements in vector x.
+        size_t buffer_size = n * sizeof( double );
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dnorm2fv_unb_var1(): get mem pool block\n" );
+        #endif
+
+        // Acquire a Buffer(n*size(double)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_membrk_acquire_m
+        (
+            &rntm,
+            buffer_size,
+            BLIS_BUFFER_FOR_B_PANEL,
+            &mem_bufX
+        );
+
+        // Continue packing X if buffer memory is allocated.
+        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
+        {
+            x_buf = bli_mem_buffer( &mem_bufX );
+            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
+            for ( dim_t x_index = 0; x_index < n; x_index++ )
+            {
+                *( x_buf + x_index ) = *( x + ( x_index * incx ) );
+            }
+        }
+    }
+
+    double *xt = x_buf;
+
+    // Compute the sum of squares on 3 accumulators to avoid overflow
+    // and underflow, depending on the vector element value.
+    // Accumulator for small values; using scaling to avoid underflow.
+    double sum_sml = 0;
+   // Accumulator for medium values; no scaling required.
+    double sum_med = 0;
+    // Accumulator for big values; using scaling to avoid overflow.
+    double sum_big = 0;
+
+    // Constants chosen to minimize roundoff, according to Blue's algorithm.
+    const double thres_sml = pow( ( double )FLT_RADIX,    ceil( ( DBL_MIN_EXP - 1 )  * 0.5 ) );
+    const double thres_big = pow( ( double )FLT_RADIX,   floor( ( DBL_MAX_EXP - 52)  * 0.5 ) );
+    const double scale_sml = pow( ( double )FLT_RADIX, - floor( ( DBL_MIN_EXP - 53 ) * 0.5 ) );
+    const double scale_big = pow( ( double )FLT_RADIX,  - ceil( ( DBL_MAX_EXP + 52 ) * 0.5 ) );
+
+    double scale;
+    double abs_chi;
+    bool isbig = false;
+
+    if ( n > 4 )
+    {
+        // Constants used for comparisons.
+        v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1;
+        temp.v = _mm256_set1_pd( -0.0 );
+        thres_sml_vec.v = _mm256_set1_pd( thres_sml );
+        thres_big_vec.v = _mm256_set1_pd( thres_big );
+        v4df_t x0v, x1v, mask_vec0, mask_vec1;
+        zerov.v  = _mm256_setzero_pd();
+
+        // Partial sums used for scaling.
+        v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1;
+        sum_med_vec0.v = _mm256_setzero_pd();
+        sum_big_vec0.v = _mm256_setzero_pd();
+        sum_sml_vec0.v = _mm256_setzero_pd();
+        sum_med_vec1.v = _mm256_setzero_pd();
+        sum_big_vec1.v = _mm256_setzero_pd();
+        sum_sml_vec1.v = _mm256_setzero_pd();
+
+        for (; ( i + 8 ) <= n; i = i + 8)
+        {
+            x0v.v = _mm256_loadu_pd( xt );
+            x1v.v = _mm256_loadu_pd( xt + 4 );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_pd( temp.v, x1v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_df( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_df( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); 
+                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+
+            if ( !bli_horizontal_or_df( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_pd( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v ); 
+                    ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
+                    sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); 
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
+                        ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
+                        sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+
+            xt += 8;
+        }
+
+        for ( ; ( i + 4 ) <= n; i = i + 4 )
+        {
+            x0v.v = _mm256_loadu_pd( xt );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+            xt += 4;
+        }
+
+        sum_sml_vec0.v = _mm256_add_pd( sum_sml_vec0.v, sum_sml_vec1.v );
+        sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v );
+        sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v );
+
+        sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1]
+                + sum_sml_vec0.v[2] + sum_sml_vec0.v[3];
+        sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1]
+                + sum_med_vec0.v[2] + sum_med_vec0.v[3];
+        sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1]
+                + sum_big_vec0.v[2] + sum_big_vec0.v[3];
+    }
+
+    n_remainder = n - i;
+    bool hasInf = false;
+    if ( ( n_remainder > 0 ) )
+    {
+        // Put first the most likely to happen to avoid evaluations on if statements.
+        for (i = 0; i < n_remainder; i++)
+        {
+            abs_chi = bli_fabs( *xt );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+            xt++;
+        }
+    }
+
+    // Early return if there is an Inf.
+    if ( hasInf ) 
+    {        
+        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+        {
+            #ifdef BLIS_ENABLE_MEM_TRACING
+                printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+            #endif
+            // Return the buffer to pool.
+            bli_membrk_release( &rntm , &mem_bufX );
+        }
+
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+
+    // Combine accumulators.
+    if ( isbig )
+    {
+        // Combine sum_big and sum_med if sum_med > 0.
+        if ( sum_med > 0.0 )
+        {
+            sum_big += ( sum_med * scale_big ) * scale_big;
+        }
+        scale = 1.0 / scale_big;
+        sumsq = sum_big;
+    }
+
+    else if ( sum_sml > 0.0 )
+    {
+        // Combine sum_med and sum_sml if sum_sml>0.
+        if ( sum_med > 0.0 )
+        {
+            sum_med = sqrt( sum_med );
+            sum_sml = sqrt( sum_sml ) / scale_sml;
+            double ymin, ymax;
+            if ( sum_sml > sum_med )
+            {
+                ymin = sum_med;
+                ymax = sum_sml;
+            }
+            else
+            {
+                ymin = sum_sml;
+                ymax = sum_med;
+            }
+            scale = 1.0;
+            sumsq = ymax * ymax * ( 1.0 + ( ymin / ymax ) * ( ymin / ymax ) );
+        }
+        else
+        {
+            scale = 1.0 / scale_sml;
+            sumsq = sum_sml;
+        }
+    }
+    else
+    {
+        // If all values are mid-range:
+        scale = 1.0;
+        sumsq = sum_med;
+    }
+
+    *norm = scale * sqrt( sumsq );
+
+    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+    {
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dnorm2fv_unb_var1(): releasing mem pool block\n" );
+        #endif
+        // Return the buffer to pool.
+        bli_membrk_release( &rntm , &mem_bufX );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+
+    return;
+}
+
+// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
+void bli_dznorm2fv_unb_var1_avx2
+    (
+       dim_t    n,
+       dcomplex*   x, inc_t incx,
+       double* norm,
+       cntx_t*  cntx
+    )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
+
+    double sumsq = 0;
+    dim_t i = 0;
+    dim_t n_remainder = 0;
+    dcomplex  *x_buf = x;
+
+    // Early return if n<=0 or incx=0
+    if ( ( n <= 0) || ( incx == 0 ) )
+    {
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+
+    // Memory pool declarations for packing vector X.
+    // Initialize mem pool buffer to NULL and size to 0.
+    // "buf" and "size" fields are assigned once memory
+    // is allocated from the pool in bli_membrk_acquire_m().
+    // This will ensure bli_mem_is_alloc() will be passed on
+    // an allocated memory if created or a NULL.
+    mem_t   mem_bufX = {0};
+    rntm_t  rntm;
+
+    // Packing for non-unit strided vector x.
+    if ( incx != 1 )
+    {
+        // In order to get the buffer from pool via rntm access to memory broker
+        //is needed. Following are initializations for rntm.
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Calculate the size required for "n" dcomplex elements in vector x.
+        size_t buffer_size = n * sizeof( dcomplex );
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dznorm2fv_unb_var1(): get mem pool block\n" );
+        #endif
+
+        // Acquire a Buffer(n*size(dcomplex)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_membrk_acquire_m
+        (
+            &rntm,
+            buffer_size,
+            BLIS_BUFFER_FOR_B_PANEL,
+            &mem_bufX
+        );
+
+        // Continue packing X if buffer memory is allocated.
+        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
+        {
+            x_buf = bli_mem_buffer( &mem_bufX );
+            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
+            for ( dim_t x_index = 0; x_index < n; x_index++ )
+            {
+                *( x_buf + x_index ) = *( x + ( x_index * incx ) );
+            }
+        }
+    }
+
+    dcomplex *xt = x_buf;
+
+    // Compute the sum of squares on 3 accumulators to avoid overflow
+    // and underflow, depending on the vector element value.
+    // Accumulator for small values; using scaling to avoid underflow.
+    double sum_sml = 0;
+   // Accumulator for medium values; no scaling required.
+    double sum_med = 0;
+    // Accumulator for big values; using scaling to avoid overflow.
+    double sum_big = 0;
+
+    // Constants chosen to minimize roundoff, according to Blue's algorithm.
+    const double thres_sml = pow( ( double )FLT_RADIX,    ceil( ( DBL_MIN_EXP - 1 )  * 0.5 ) );
+    const double thres_big = pow( ( double )FLT_RADIX,   floor( ( DBL_MAX_EXP - 52)  * 0.5 ) );
+    const double scale_sml = pow( ( double )FLT_RADIX, - floor( ( DBL_MIN_EXP - 53 ) * 0.5 ) );
+    const double scale_big = pow( ( double )FLT_RADIX,  - ceil( ( DBL_MAX_EXP + 52 ) * 0.5 ) );
+
+    double scale;
+    double abs_chi;
+    bool isbig = false;
+
+    if ( n > 2 )
+    {
+        // Constants used for comparisons.
+        v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1;
+        temp.v = _mm256_set1_pd( -0.0 );
+        thres_sml_vec.v = _mm256_set1_pd( thres_sml );
+        thres_big_vec.v = _mm256_set1_pd( thres_big );
+        v4df_t x0v, x1v, mask_vec0, mask_vec1;
+        zerov.v  = _mm256_setzero_pd();
+
+        // Partial sums used for scaling.
+        v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1;
+        sum_med_vec0.v = _mm256_setzero_pd();
+        sum_big_vec0.v = _mm256_setzero_pd();
+        sum_sml_vec0.v = _mm256_setzero_pd();
+        sum_med_vec1.v = _mm256_setzero_pd();
+        sum_big_vec1.v = _mm256_setzero_pd();
+        sum_sml_vec1.v = _mm256_setzero_pd();
+
+        for (; ( i + 4 ) <= n; i = i + 4)
+        {
+            x0v.v = _mm256_loadu_pd( (double*) xt );
+            x1v.v = _mm256_loadu_pd( (double*) (xt + 2) );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_pd( temp.v, x1v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_df( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_df( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+
+            if ( !bli_horizontal_or_df( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_pd( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
+                    ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
+                    sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); 
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
+                        ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
+                        sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+
+            xt += 4;
+        }
+
+        for ( ; ( i + 2 ) <= n; i = i + 2 )
+        {
+            x0v.v = _mm256_loadu_pd( (double*) xt );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+            xt += 2;
+        }
+
+        sum_sml_vec0.v = _mm256_add_pd( sum_sml_vec0.v, sum_sml_vec1.v );
+        sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v );
+        sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v );
+
+        sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1]
+                + sum_sml_vec0.v[2] + sum_sml_vec0.v[3];
+        sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1]
+                + sum_med_vec0.v[2] + sum_med_vec0.v[3];
+        sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1]
+                + sum_big_vec0.v[2] + sum_big_vec0.v[3];
+    }
+
+    n_remainder = n - i;
+    bool hasInf = false;
+    double chi_r, chi_i;
+    if ( ( n_remainder > 0 ) )
+    {
+        // Put first the most likely to happen to avoid evaluations on if statements.
+        for (i = 0; i < n_remainder; i++)
+        {
+            // Get real and imaginary component of the vector element.
+            bli_zdgets(*xt, chi_r, chi_i);
+
+            // Start with accumulating the real component of the vector element.
+            abs_chi = bli_fabs( chi_r );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if ( ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+
+            // Accumulate the imaginary component of the vector element.
+            abs_chi = bli_fabs( chi_i );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if ( ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+
+            xt++;
+        }
+    }
+
+    // Early return if there is an Inf.
+    if ( hasInf ) 
+    {        
+        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+        {
+            #ifdef BLIS_ENABLE_MEM_TRACING
+                printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+            #endif
+            // Return the buffer to pool.
+            bli_membrk_release( &rntm , &mem_bufX );
+        }
+
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+
+    // Combine accumulators.
+    if ( isbig )
+    {
+        // Combine sum_big and sum_med if sum_med > 0.
+        if ( sum_med > 0.0 )
+        {
+            sum_big += ( sum_med * scale_big ) * scale_big;
+        }
+        scale = 1.0 / scale_big;
+        sumsq = sum_big;
+    }
+
+    else if ( sum_sml > 0.0 )
+    {
+        // Combine sum_med and sum_sml if sum_sml>0.
+        if ( sum_med > 0.0 )
+        {
+            sum_med = sqrt( sum_med );
+            sum_sml = sqrt( sum_sml ) / scale_sml;
+            double ymin, ymax;
+            if ( sum_sml > sum_med )
+            {
+                ymin = sum_med;
+                ymax = sum_sml;
+            }
+            else
+            {
+                ymin = sum_sml;
+                ymax = sum_med;
+            }
+            scale = 1.0;
+            sumsq = ymax * ymax * ( 1.0 + ( ymin / ymax ) * ( ymin / ymax ) );
+        }
+        else
+        {
+            scale = 1.0 / scale_sml;
+            sumsq = sum_sml;
+        }
+    }
+    else
+    {
+        // If all values are mid-range:
+        scale = 1.0;
+        sumsq = sum_med;
+    }
+
+    *norm = scale * sqrt( sumsq );
+
+    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+    {
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dznorm2fv_unb_var1(): releasing mem pool block\n" );
+        #endif
+        // Return the buffer to pool.
+        bli_membrk_release( &rntm , &mem_bufX );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+
+    return;
+}