diff --git a/kernels/zen/1/bli_norm2_zen_int.c b/kernels/zen/1/bli_norm2_zen_int.c
index ad61f8c3b..90fc9aee2 100644
--- a/kernels/zen/1/bli_norm2_zen_int.c
+++ b/kernels/zen/1/bli_norm2_zen_int.c
@@ -1,3067 +1,3039 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021 - 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-#include "immintrin.h"
-#include "blis.h"
-
-// Union data structure to access AVX registers
-// One 256-bit AVX register holds 8 SP elements. 
-typedef union
-{
-    __m256  v;
-    float   f[8] __attribute__( ( aligned( 64 ) ) );
-} v8sf_t;
-
-// Union data structure to access AVX registers
-// One 256-bit AVX register holds 4 DP elements. 
-typedef union
-{
-    __m256d v;
-    double  d[4] __attribute__( ( aligned( 64 ) ) );
-} v4df_t;
-
-// Return a mask which indicates either:
-// v <= t or v >= T
-#define CMP256_sf( v, t, T ) \
-	_mm256_or_ps( _mm256_cmp_ps( v, t, _CMP_LE_OS ), _mm256_cmp_ps( v, T, _CMP_GE_OS ) );
-
-#define CMP256_df( v, t, T ) \
-	_mm256_or_pd( _mm256_cmp_pd( v, t, _CMP_LE_OS ), _mm256_cmp_pd( v, T, _CMP_GE_OS ) );
-
-// Returns true if any of the values in the mask vector a is true, 
-// and false, otherwise.
-// In more detail, __mm256_testz_ps() performs the bitwise (a AND b) operation and returns:
-//    1 if the sign bit of all bitwise operations is 0,
-//    0 if at least one of the sign bits of each bitwise operation is 1.
-// The sign bit of (a AND a) will be 1 iff the sign bit of a is 1, and 0 otherwise.
-// That means that __mm256_testz_ps(a,a) returns:
-//    1 if the sign bit of all elements in a is 0,
-//    0 if at least one of the sign bits of a is 1.
-// Because of the negation, bli_horizontal_or_sf() returns:
-//    0 if the sign bit of all elements in a is 0,
-//    1 if at least one of the sign bits of a is 1. 
-// Since a is the result of a masking operation, bli_horizontal_or_sf() returns:
-//    0 (false) if the mask is false for all elements in a,
-//    1 (true)  if the mask is true for at least one element in a.
-static inline bool bli_horizontal_or_sf( __m256  a ) { return ! _mm256_testz_ps( a, a ); }
-static inline bool bli_horizontal_or_df( __m256d a ) { return ! _mm256_testz_pd( a, a ); }
-
-float horizontal_add_sf(__m256 const a) {
-    __m256 t1 = _mm256_hadd_ps(a, a);
-    __m256 t2 = _mm256_hadd_ps(t1,t1);
-    __m128 t3 = _mm256_extractf128_ps(t2,1);
-    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2),t3);
-    return _mm_cvtss_f32(t4); // sign extend to 32 bits
-}
-
-// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
-void bli_snorm2fv_unb_var1_avx2
-    (
-       dim_t    n,
-       float*   x, inc_t incx,
-       float* norm,
-       cntx_t*  cntx
-    )
-{
-    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
-    
-    float sumsq = 0.0f;
-    dim_t i = 0;
-    dim_t n_remainder = 0;
-    float  *x_buf = x;
-
-    // Memory pool declarations for packing vector X.
-    // Initialize mem pool buffer to NULL and size to 0.
-    // "buf" and "size" fields are assigned once memory
-    // is allocated from the pool in bli_membrk_acquire_m().
-    // This will ensure bli_mem_is_alloc() will be passed on
-    // an allocated memory if created or a NULL.
-    mem_t   mem_bufX = {0};
-    rntm_t  rntm;
-
-    // Packing for non-unit strided vector x.
-    if ( incx != 1 )
-    {
-        // In order to get the buffer from pool via rntm access to memory broker
-        //is needed. Following are initializations for rntm.
-        bli_rntm_init_from_global( &rntm );
-        bli_rntm_set_num_threads_only( 1, &rntm );
-        bli_membrk_rntm_set_membrk( &rntm );
-
-        // Calculate the size required for "n" float elements in vector x.
-        size_t buffer_size = n * sizeof( float );
-
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_snorm2fv_unb_var1_avx2(): get mem pool block\n" );
-        #endif
-
-        // Acquire a Buffer(n*size(float)) from the memory broker
-        // and save the associated mem_t entry to mem_bufX.
-        bli_membrk_acquire_m
-        (
-            &rntm,
-            buffer_size,
-            BLIS_BUFFER_FOR_B_PANEL,
-            &mem_bufX
-        );
-
-        // Continue packing X if buffer memory is allocated.
-        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
-        {
-            x_buf = bli_mem_buffer( &mem_bufX );
-            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
-            for ( dim_t x_index = 0; x_index < n; x_index++ )
-            {
-                if ( incx > 0 )
-                {
-                    *( x_buf + x_index ) = *( x + ( x_index * incx ) );
-                }
-                else
-                {
-                    *( x_buf + x_index ) =  *( x + ( - ( n - x_index - 1 ) * incx ) );
-                }
-            }
-        }
-    }
-
-    float *xt = x_buf;
-
-    // Compute the sum of squares on 3 accumulators to avoid overflow
-    // and underflow, depending on the vector element value.
-    // Accumulator for small values; using scaling to avoid underflow.
-    float sum_sml = 0.0f;
-    // Accumulator for medium values; no scaling required.
-    float sum_med = 0.0f;
-    // Accumulator for big values; using scaling to avoid overflow.
-    float sum_big = 0.0f;
-
-    // Constants chosen to minimize roundoff, according to Blue's algorithm.
-    const float thres_sml = powf( ( float )FLT_RADIX,    ceilf( ( FLT_MIN_EXP - 1 )  * 0.5f ) );
-    const float thres_big = powf( ( float )FLT_RADIX,   floorf( ( FLT_MAX_EXP - 23)  * 0.5f ) );
-    const float scale_sml = powf( ( float )FLT_RADIX, - floorf( ( FLT_MIN_EXP - 24 ) * 0.5f ) );
-    const float scale_big = powf( ( float )FLT_RADIX,  - ceilf( ( FLT_MAX_EXP + 23 ) * 0.5f ) );
-
-    float scale = 1.0f;
-    float abs_chi;
-    bool isbig = false;
-
-    if ( n >= 64 )
-    {
-        // Constants used for comparisons.
-        v8sf_t temp, thres_sml_vec, thres_big_vec, zerov;
-        temp.v = _mm256_set1_ps( -0.0f );
-        thres_sml_vec.v = _mm256_set1_ps( thres_sml );
-        thres_big_vec.v = _mm256_set1_ps( thres_big );
-        v8sf_t x0v, x1v, x2v, x3v;
-        v8sf_t y0v, y1v, y2v, y3v;
-        v8sf_t mask_vec0, mask_vec1, mask_vec2, mask_vec3;
-        zerov.v  = _mm256_setzero_ps();
-
-        // Partial sums used for scaling.
-        v8sf_t sum_sml_vec0, sum_sml_vec1, sum_sml_vec2, sum_sml_vec3;
-        sum_sml_vec0.v = _mm256_setzero_ps();
-        sum_sml_vec1.v = _mm256_setzero_ps();
-        sum_sml_vec2.v = _mm256_setzero_ps();
-        sum_sml_vec3.v = _mm256_setzero_ps();
-
-        v8sf_t sum_med_vec0, sum_med_vec1, sum_med_vec2, sum_med_vec3;
-        sum_med_vec0.v = _mm256_setzero_ps();
-        sum_med_vec1.v = _mm256_setzero_ps();
-        sum_med_vec2.v = _mm256_setzero_ps();
-        sum_med_vec3.v = _mm256_setzero_ps();
-
-        v8sf_t sum_big_vec0, sum_big_vec1, sum_big_vec2, sum_big_vec3;
-        sum_big_vec0.v = _mm256_setzero_ps();
-        sum_big_vec1.v = _mm256_setzero_ps();
-        sum_big_vec2.v = _mm256_setzero_ps();
-        sum_big_vec3.v = _mm256_setzero_ps();
-
-        for (; ( i + 32 ) <= n; i = i + 32)
-        {
-            x0v.v = _mm256_loadu_ps( xt );
-            x1v.v = _mm256_loadu_ps( xt + 8 );
-            x2v.v = _mm256_loadu_ps( xt + 16 );
-            x3v.v = _mm256_loadu_ps( xt + 24 );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
-            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
-            x3v.v = _mm256_andnot_ps( temp.v, x3v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
-            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
-            mask_vec3.v = _mm256_cmp_ps(x3v.v, x3v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec3.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec3.v = CMP256_sf( x3v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-                
-            }
-            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec1.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec2.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {                    
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec3.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec3.v = _mm256_fmadd_ps( x3v.v, x3v.v, sum_med_vec3.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec3.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
-                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
-                    y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
-                    sum_big_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_big_vec3.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
-                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
-                        y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
-                        sum_sml_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_sml_vec3.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            xt += 32;
-        }
-
-        for (; ( i + 24 ) <= n; i = i + 24)
-        {
-            x0v.v = _mm256_loadu_ps( xt );
-            x1v.v = _mm256_loadu_ps( xt + 8 );
-            x2v.v = _mm256_loadu_ps( xt + 16 );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
-            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
-            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec2.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            
-            xt += 24;
-        }
-
-        for (; ( i + 16 ) <= n; i = i + 16)
-        {
-            x0v.v = _mm256_loadu_ps( xt );
-            x1v.v = _mm256_loadu_ps( xt + 8 );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            xt += 16;
-        }
-        
-        // This seems to not be improving performance.
-        #if 0
-        for (; ( i + 8 ) <= n; i = i + 8)
-        {
-            x0v.v = _mm256_loadu_ps( xt );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            xt += 8;
-        }
-        #endif
-
-        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec1.v );
-        sum_sml_vec2.v = _mm256_add_ps( sum_sml_vec2.v, sum_sml_vec3.v );
-        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec2.v ); 
-        sum_sml = horizontal_add_sf(sum_sml_vec0.v);
-
-        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec1.v );
-        sum_med_vec2.v = _mm256_add_ps( sum_med_vec2.v, sum_med_vec3.v );
-        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec2.v );
-        sum_med = horizontal_add_sf(sum_med_vec0.v);
-
-        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec1.v );
-        sum_big_vec2.v = _mm256_add_ps( sum_big_vec2.v, sum_big_vec3.v );
-        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec2.v );
-        sum_big = horizontal_add_sf(sum_big_vec0.v);
-    }
-
-    n_remainder = n - i;
-    bool hasInf = false;
-
-    if ( ( n_remainder > 0 ) )
-    {
-        // Put first the most likely to happen to avoid evaluations on if statements.
-        for (i = 0; i < n_remainder; i++)
-        {
-            abs_chi = bli_fabs( *xt );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-            xt++;
-        }
-    }
-    // Early return if there is an Inf.
-    if ( hasInf ) 
-    {
-        
-        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-        {
-            #ifdef BLIS_ENABLE_MEM_TRACING
-                printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-            #endif
-            // Return the buffer to pool.
-            bli_membrk_release( &rntm , &mem_bufX );
-        }
-
-        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-        return;
-    }
-    
-    // Combine accumulators.
-    if ( isbig )
-    {
-        // Combine sum_big and sum_med if sum_med > 0.
-        if ( sum_med > 0.0f )
-        {
-            sum_big += ( sum_med * scale_big ) * scale_big;
-        }
-        scale = 1.0f / scale_big;
-        sumsq = sum_big;
-    }
-    else if ( sum_sml > 0.0f )
-    {
-        // Combine sum_med and sum_sml if sum_sml>0.
-        if ( sum_med > 0.0f )
-        {
-            sum_med = sqrtf( sum_med );
-            sum_sml = sqrtf( sum_sml ) / scale_sml;
-            float ymin, ymax;
-            if ( sum_sml > sum_med )
-            {
-                ymin = sum_med;
-                ymax = sum_sml;
-            }
-            else
-            {
-                ymin = sum_sml;
-                ymax = sum_med;
-            }
-            scale = 1.0f;
-            sumsq = ymax * ymax * ( 1.0f + ( ymin / ymax ) * ( ymin / ymax ) );
-        }
-        else
-        {
-            scale = 1.0f / scale_sml;
-            sumsq = sum_sml;
-        }
-    }
-    else
-    {
-        // If all values are mid-range:
-        scale = 1.0f;
-        sumsq = sum_med;
-    }
-
-    *norm = scale * sqrtf( sumsq );
-
-    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-    {
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-        #endif
-        // Return the buffer to pool.
-        bli_membrk_release( &rntm , &mem_bufX );
-    }
-
-    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-
-    return;
-}
-
-// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
-void bli_scnorm2fv_unb_var1_avx2
-    (
-       dim_t    n,
-       scomplex*   x, inc_t incx,
-       float* norm,
-       cntx_t*  cntx
-    )
-{
-    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
-
-    float sumsq = 0.0f;
-    dim_t i = 0;
-    dim_t n_remainder = 0;
-    scomplex  *x_buf = x;
-
-    // Memory pool declarations for packing vector X.
-    // Initialize mem pool buffer to NULL and size to 0.
-    // "buf" and "size" fields are assigned once memory
-    // is allocated from the pool in bli_membrk_acquire_m().
-    // This will ensure bli_mem_is_alloc() will be passed on
-    // an allocated memory if created or a NULL.
-    mem_t   mem_bufX = {0};
-    rntm_t  rntm;
-
-    // Packing for non-unit strided vector x.
-    if ( incx != 1 )
-    {
-        // In order to get the buffer from pool via rntm access to memory broker
-        //is needed. Following are initializations for rntm.
-        bli_rntm_init_from_global( &rntm );
-        bli_rntm_set_num_threads_only( 1, &rntm );
-        bli_membrk_rntm_set_membrk( &rntm );
-
-        // Calculate the size required for "n" scomplex elements in vector x.
-        size_t buffer_size = n * sizeof( scomplex );
-
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_scnorm2fv_unb_var1_avx2(): get mem pool block\n" );
-        #endif
-
-        // Acquire a Buffer(n*size(scomplex)) from the memory broker
-        // and save the associated mem_t entry to mem_bufX.
-        bli_membrk_acquire_m
-        (
-            &rntm,
-            buffer_size,
-            BLIS_BUFFER_FOR_B_PANEL,
-            &mem_bufX
-        );
-
-        // Continue packing X if buffer memory is allocated.
-        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
-        {
-            x_buf = bli_mem_buffer( &mem_bufX );
-            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
-            for ( dim_t x_index = 0; x_index < n; x_index++ )
-            {
-                if ( incx > 0 )
-                {
-                    *( x_buf + x_index ) = *( x + ( x_index * incx ) );
-                }
-                else
-                {
-                    *( x_buf + x_index ) =  *( x + ( - ( n - x_index - 1 ) * incx ) );
-                }
-            }
-        }
-    }
-
-    scomplex *xt = x_buf;
-
-    // Compute the sum of squares on 3 accumulators to avoid overflow
-    // and underflow, depending on the vector element value.
-    // Accumulator for small values; using scaling to avoid underflow.
-    float sum_sml = 0.0f;
-   // Accumulator for medium values; no scaling required.
-    float sum_med = 0.0f;
-    // Accumulator for big values; using scaling to avoid overflow.
-    float sum_big = 0.0f;
-
-    // Constants chosen to minimize roundoff, according to Blue's algorithm.
-    const float thres_sml = powf( ( float )FLT_RADIX,    ceilf( ( FLT_MIN_EXP - 1 )  * 0.5f ) );
-    const float thres_big = powf( ( float )FLT_RADIX,   floorf( ( FLT_MAX_EXP - 23)  * 0.5f ) );
-    const float scale_sml = powf( ( float )FLT_RADIX, - floorf( ( FLT_MIN_EXP - 24 ) * 0.5f ) );
-    const float scale_big = powf( ( float )FLT_RADIX,  - ceilf( ( FLT_MAX_EXP + 23 ) * 0.5f ) );
-
-    float scale = 1.0f;
-    float abs_chi;
-    bool isbig = false;
-
-    if ( n >= 64 )
-    {
-        // Constants used for comparisons.
-        v8sf_t temp, thres_sml_vec, thres_big_vec, zerov;
-        temp.v = _mm256_set1_ps( -0.0f );
-        thres_sml_vec.v = _mm256_set1_ps( thres_sml );
-        thres_big_vec.v = _mm256_set1_ps( thres_big );
-        v8sf_t x0v, x1v, x2v, x3v;
-        v8sf_t y0v, y1v, y2v, y3v;
-        v8sf_t mask_vec0, mask_vec1, mask_vec2, mask_vec3;
-        zerov.v  = _mm256_setzero_ps();
-
-        // Partial sums used for scaling.
-        v8sf_t sum_sml_vec0, sum_sml_vec1, sum_sml_vec2, sum_sml_vec3;
-        sum_sml_vec0.v = _mm256_setzero_ps();
-        sum_sml_vec1.v = _mm256_setzero_ps();
-        sum_sml_vec2.v = _mm256_setzero_ps();
-        sum_sml_vec3.v = _mm256_setzero_ps();
-
-        v8sf_t sum_med_vec0, sum_med_vec1, sum_med_vec2, sum_med_vec3;
-        sum_med_vec0.v = _mm256_setzero_ps();
-        sum_med_vec1.v = _mm256_setzero_ps();
-        sum_med_vec2.v = _mm256_setzero_ps();
-        sum_med_vec3.v = _mm256_setzero_ps();
-
-        v8sf_t sum_big_vec0, sum_big_vec1, sum_big_vec2, sum_big_vec3;
-        sum_big_vec0.v = _mm256_setzero_ps();
-        sum_big_vec1.v = _mm256_setzero_ps();
-        sum_big_vec2.v = _mm256_setzero_ps();
-        sum_big_vec3.v = _mm256_setzero_ps();
-
-        for (; ( i + 16 ) <= n; i = i + 16)
-        {
-            x0v.v = _mm256_loadu_ps( (float*) xt );
-            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
-            x2v.v = _mm256_loadu_ps( (float*) (xt + 8) );
-            x3v.v = _mm256_loadu_ps( (float*) (xt + 12) );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
-            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
-            x3v.v = _mm256_andnot_ps( temp.v, x3v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
-            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
-            mask_vec3.v = _mm256_cmp_ps(x3v.v, x3v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec3.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec3.v = CMP256_sf( x3v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-                
-            }
-            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec1.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec2.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {                    
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-
-            if ( !bli_horizontal_or_sf( mask_vec3.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec3.v = _mm256_fmadd_ps( x3v.v, x3v.v, sum_med_vec3.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec3.v ) )
-                {
-                    isbig = true;
-                    // Fill sum_med vector without scaling.
-                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
-                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
-                    y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
-                    sum_big_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_big_vec3.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
-                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
-                        y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
-                        sum_sml_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_sml_vec3.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            xt += 16;
-        }
-
-        for (; ( i + 12 ) <= n; i = i + 12)
-        {
-            x0v.v = _mm256_loadu_ps( (float*)xt );
-            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
-            x2v.v = _mm256_loadu_ps( (float*) (xt + 8) );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
-            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
-            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec2.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
-                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
-                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
-                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            
-            xt += 12;
-        }
-        
-        for (; ( i + 8 ) <= n; i = i + 8)
-        {
-            x0v.v = _mm256_loadu_ps( (float*)xt );
-            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
-                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
-                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            xt += 8;
-        }
-        // This seems to not be improving performance.
-        #if 0
-        for (; ( i + 4 ) <= n; i = i + 4)
-        {
-            x0v.v = _mm256_loadu_ps( (float*)xt );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_sf( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_ps( scale_big );
-                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_ps( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_ps( scale_sml );
-                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
-                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
-                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_ps( -0.0 );
-                    }
-                }
-            }
-            xt += 4;
-        }
-        #endif
-
-        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec1.v );
-        sum_sml_vec2.v = _mm256_add_ps( sum_sml_vec2.v, sum_sml_vec3.v );
-        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec2.v ); 
-        sum_sml = horizontal_add_sf(sum_sml_vec0.v);
-
-        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec1.v );
-        sum_med_vec2.v = _mm256_add_ps( sum_med_vec2.v, sum_med_vec3.v );
-        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec2.v );
-        sum_med = horizontal_add_sf(sum_med_vec0.v);
-
-        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec1.v );
-        sum_big_vec2.v = _mm256_add_ps( sum_big_vec2.v, sum_big_vec3.v );
-        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec2.v );
-        sum_big = horizontal_add_sf(sum_big_vec0.v);
-    }
-
-    n_remainder = n - i;
-    bool hasInf = false;
-    double chi_r, chi_i;
-    if ( ( n_remainder > 0 ) )
-    {
-        // Put first the most likely to happen to avoid evaluations on if statements.
-        for (i = 0; i < n_remainder; i++)
-        {
-            // Get real and imaginary component of the vector element.            
-            bli_csgets(*xt, chi_r, chi_i);
-            // Start with accumulating the real component of the vector element.
-            abs_chi = bli_fabs( chi_r );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-            // Accumulate the imaginary component of the vector element.
-            abs_chi = bli_fabs( chi_i );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-
-            xt++;
-        }
-    }
-    // Early return if there is an Inf.
-    if ( hasInf ) 
-    {
-        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-        {
-            #ifdef BLIS_ENABLE_MEM_TRACING
-                printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-            #endif
-            // Return the buffer to pool.
-            bli_membrk_release( &rntm , &mem_bufX );
-        }
-
-        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-        return;
-    }
-    
-    // Combine accumulators.
-    if ( isbig )
-    {
-        // Combine sum_big and sum_med if sum_med > 0.
-        if ( sum_med > 0.0f )
-        {
-            sum_big += ( sum_med * scale_big ) * scale_big;
-        }
-        scale = 1.0f / scale_big;
-        sumsq = sum_big;
-    }
-    else if ( sum_sml > 0.0f )
-    {
-        // Combine sum_med and sum_sml if sum_sml>0.
-        if ( sum_med > 0.0f )
-        {
-            sum_med = sqrtf( sum_med );
-            sum_sml = sqrtf( sum_sml ) / scale_sml;
-            float ymin, ymax;
-            if ( sum_sml > sum_med )
-            {
-                ymin = sum_med;
-                ymax = sum_sml;
-            }
-            else
-            {
-                ymin = sum_sml;
-                ymax = sum_med;
-            }
-            scale = 1.0f;
-            sumsq = ymax * ymax * ( 1.0f + ( ymin / ymax ) * ( ymin / ymax ) );
-        }
-        else
-        {
-            scale = 1.0f / scale_sml;
-            sumsq = sum_sml;
-        }
-    }
-    else
-    {
-        // If all values are mid-range:
-        scale = 1.0f;
-        sumsq = sum_med;
-    }
-
-    *norm = scale * sqrtf( sumsq );
-
-    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-    {
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-        #endif
-        // Return the buffer to pool.
-        bli_membrk_release( &rntm , &mem_bufX );
-    }
-
-    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-
-    return;
-}
-
-// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
-void bli_dnorm2fv_unb_var1_avx2
-    (
-       dim_t    n,
-       double*   x, inc_t incx,
-       double* norm,
-       cntx_t*  cntx
-    )
-{
-    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
-
-    double sumsq = 0;
-    dim_t i = 0;
-    dim_t n_remainder = 0;
-    double  *x_buf = x;
-
-    // Early return if n<=0 or incx=0
-    if ( ( n <= 0) || ( incx == 0 ) )
-    {
-        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-        return;
-    }
-
-    // Memory pool declarations for packing vector X.
-    // Initialize mem pool buffer to NULL and size to 0.
-    // "buf" and "size" fields are assigned once memory
-    // is allocated from the pool in bli_membrk_acquire_m().
-    // This will ensure bli_mem_is_alloc() will be passed on
-    // an allocated memory if created or a NULL.
-    mem_t   mem_bufX = {0};
-    rntm_t  rntm;
-
-    // Packing for non-unit strided vector x.
-    if ( incx != 1 )
-    {
-        // In order to get the buffer from pool via rntm access to memory broker
-        //is needed. Following are initializations for rntm.
-        bli_rntm_init_from_global( &rntm );
-        bli_rntm_set_num_threads_only( 1, &rntm );
-        bli_membrk_rntm_set_membrk( &rntm );
-
-        // Calculate the size required for "n" double elements in vector x.
-        size_t buffer_size = n * sizeof( double );
-
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_dnorm2fv_unb_var1(): get mem pool block\n" );
-        #endif
-
-        // Acquire a Buffer(n*size(double)) from the memory broker
-        // and save the associated mem_t entry to mem_bufX.
-        bli_membrk_acquire_m
-        (
-            &rntm,
-            buffer_size,
-            BLIS_BUFFER_FOR_B_PANEL,
-            &mem_bufX
-        );
-
-        // Continue packing X if buffer memory is allocated.
-        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
-        {
-            x_buf = bli_mem_buffer( &mem_bufX );
-            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
-            for ( dim_t x_index = 0; x_index < n; x_index++ )
-            {
-                if ( incx > 0 )
-                {
-                    *( x_buf + x_index ) = *( x + ( x_index * incx ) );
-                }
-                else
-                {
-                    *( x_buf + x_index ) =  *( x + ( - ( n - x_index - 1 ) * incx ) );
-                }
-            }
-        }
-    }
-
-    double *xt = x_buf;
-
-    // Compute the sum of squares on 3 accumulators to avoid overflow
-    // and underflow, depending on the vector element value.
-    // Accumulator for small values; using scaling to avoid underflow.
-    double sum_sml = 0;
-   // Accumulator for medium values; no scaling required.
-    double sum_med = 0;
-    // Accumulator for big values; using scaling to avoid overflow.
-    double sum_big = 0;
-
-    // Constants chosen to minimize roundoff, according to Blue's algorithm.
-    const double thres_sml = pow( ( double )FLT_RADIX,    ceil( ( DBL_MIN_EXP - 1 )  * 0.5 ) );
-    const double thres_big = pow( ( double )FLT_RADIX,   floor( ( DBL_MAX_EXP - 52)  * 0.5 ) );
-    const double scale_sml = pow( ( double )FLT_RADIX, - floor( ( DBL_MIN_EXP - 53 ) * 0.5 ) );
-    const double scale_big = pow( ( double )FLT_RADIX,  - ceil( ( DBL_MAX_EXP + 52 ) * 0.5 ) );
-
-    double scale;
-    double abs_chi;
-    bool isbig = false;
-
-    if ( n > 4 )
-    {
-        // Constants used for comparisons.
-        v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1;
-        temp.v = _mm256_set1_pd( -0.0 );
-        thres_sml_vec.v = _mm256_set1_pd( thres_sml );
-        thres_big_vec.v = _mm256_set1_pd( thres_big );
-        v4df_t x0v, x1v, mask_vec0, mask_vec1;
-        zerov.v  = _mm256_setzero_pd();
-
-        // Partial sums used for scaling.
-        v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1;
-        sum_med_vec0.v = _mm256_setzero_pd();
-        sum_big_vec0.v = _mm256_setzero_pd();
-        sum_sml_vec0.v = _mm256_setzero_pd();
-        sum_med_vec1.v = _mm256_setzero_pd();
-        sum_big_vec1.v = _mm256_setzero_pd();
-        sum_sml_vec1.v = _mm256_setzero_pd();
-
-        for (; ( i + 8 ) <= n; i = i + 8)
-        {
-            x0v.v = _mm256_loadu_pd( xt );
-            x1v.v = _mm256_loadu_pd( xt + 4 );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_pd( temp.v, x1v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_df( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_df( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_df( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); 
-                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-
-            if ( !bli_horizontal_or_df( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_pd( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_df( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v ); 
-                    ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
-                    sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); 
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
-                        ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
-                        sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-
-            xt += 8;
-        }
-
-        for ( ; ( i + 4 ) <= n; i = i + 4 )
-        {
-            x0v.v = _mm256_loadu_pd( xt );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_df( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-            xt += 4;
-        }
-
-        sum_sml_vec0.v = _mm256_add_pd( sum_sml_vec0.v, sum_sml_vec1.v );
-        sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v );
-        sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v );
-
-        sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1]
-                + sum_sml_vec0.v[2] + sum_sml_vec0.v[3];
-        sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1]
-                + sum_med_vec0.v[2] + sum_med_vec0.v[3];
-        sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1]
-                + sum_big_vec0.v[2] + sum_big_vec0.v[3];
-    }
-
-    n_remainder = n - i;
-    bool hasInf = false;
-    if ( ( n_remainder > 0 ) )
-    {
-        // Put first the most likely to happen to avoid evaluations on if statements.
-        for (i = 0; i < n_remainder; i++)
-        {
-            abs_chi = bli_fabs( *xt );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-            xt++;
-        }
-    }
-
-    // Early return if there is an Inf.
-    if ( hasInf ) 
-    {        
-        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-        {
-            #ifdef BLIS_ENABLE_MEM_TRACING
-                printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-            #endif
-            // Return the buffer to pool.
-            bli_membrk_release( &rntm , &mem_bufX );
-        }
-
-        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-        return;
-    }
-
-    // Combine accumulators.
-    if ( isbig )
-    {
-        // Combine sum_big and sum_med if sum_med > 0.
-        if ( sum_med > 0.0 )
-        {
-            sum_big += ( sum_med * scale_big ) * scale_big;
-        }
-        scale = 1.0 / scale_big;
-        sumsq = sum_big;
-    }
-
-    else if ( sum_sml > 0.0 )
-    {
-        // Combine sum_med and sum_sml if sum_sml>0.
-        if ( sum_med > 0.0 )
-        {
-            sum_med = sqrt( sum_med );
-            sum_sml = sqrt( sum_sml ) / scale_sml;
-            double ymin, ymax;
-            if ( sum_sml > sum_med )
-            {
-                ymin = sum_med;
-                ymax = sum_sml;
-            }
-            else
-            {
-                ymin = sum_sml;
-                ymax = sum_med;
-            }
-            scale = 1.0;
-            sumsq = ymax * ymax * ( 1.0 + ( ymin / ymax ) * ( ymin / ymax ) );
-        }
-        else
-        {
-            scale = 1.0 / scale_sml;
-            sumsq = sum_sml;
-        }
-    }
-    else
-    {
-        // If all values are mid-range:
-        scale = 1.0;
-        sumsq = sum_med;
-    }
-
-    *norm = scale * sqrt( sumsq );
-
-    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-    {
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_dnorm2fv_unb_var1(): releasing mem pool block\n" );
-        #endif
-        // Return the buffer to pool.
-        bli_membrk_release( &rntm , &mem_bufX );
-    }
-
-    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-
-    return;
-}
-
-// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
-void bli_dznorm2fv_unb_var1_avx2
-    (
-       dim_t    n,
-       dcomplex*   x, inc_t incx,
-       double* norm,
-       cntx_t*  cntx
-    )
-{
-    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
-
-    double sumsq = 0;
-    dim_t i = 0;
-    dim_t n_remainder = 0;
-    dcomplex  *x_buf = x;
-
-    // Early return if n<=0 or incx=0
-    if ( ( n <= 0) || ( incx == 0 ) )
-    {
-        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-        return;
-    }
-
-    // Memory pool declarations for packing vector X.
-    // Initialize mem pool buffer to NULL and size to 0.
-    // "buf" and "size" fields are assigned once memory
-    // is allocated from the pool in bli_membrk_acquire_m().
-    // This will ensure bli_mem_is_alloc() will be passed on
-    // an allocated memory if created or a NULL.
-    mem_t   mem_bufX = {0};
-    rntm_t  rntm;
-
-    // Packing for non-unit strided vector x.
-    if ( incx != 1 )
-    {
-        // In order to get the buffer from pool via rntm access to memory broker
-        //is needed. Following are initializations for rntm.
-        bli_rntm_init_from_global( &rntm );
-        bli_rntm_set_num_threads_only( 1, &rntm );
-        bli_membrk_rntm_set_membrk( &rntm );
-
-        // Calculate the size required for "n" dcomplex elements in vector x.
-        size_t buffer_size = n * sizeof( dcomplex );
-
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_dznorm2fv_unb_var1(): get mem pool block\n" );
-        #endif
-
-        // Acquire a Buffer(n*size(dcomplex)) from the memory broker
-        // and save the associated mem_t entry to mem_bufX.
-        bli_membrk_acquire_m
-        (
-            &rntm,
-            buffer_size,
-            BLIS_BUFFER_FOR_B_PANEL,
-            &mem_bufX
-        );
-
-        // Continue packing X if buffer memory is allocated.
-        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
-        {
-            x_buf = bli_mem_buffer( &mem_bufX );
-            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
-            for ( dim_t x_index = 0; x_index < n; x_index++ )
-            {
-                if ( incx > 0 )
-                {
-                    *( x_buf + x_index ) = *( x + ( x_index * incx ) );
-                }
-                else
-                {
-                    *( x_buf + x_index ) =  *( x + ( - ( n - x_index - 1 ) * incx ) );
-                }
-            }
-        }
-    }
-
-    dcomplex *xt = x_buf;
-
-    // Compute the sum of squares on 3 accumulators to avoid overflow
-    // and underflow, depending on the vector element value.
-    // Accumulator for small values; using scaling to avoid underflow.
-    double sum_sml = 0;
-   // Accumulator for medium values; no scaling required.
-    double sum_med = 0;
-    // Accumulator for big values; using scaling to avoid overflow.
-    double sum_big = 0;
-
-    // Constants chosen to minimize roundoff, according to Blue's algorithm.
-    const double thres_sml = pow( ( double )FLT_RADIX,    ceil( ( DBL_MIN_EXP - 1 )  * 0.5 ) );
-    const double thres_big = pow( ( double )FLT_RADIX,   floor( ( DBL_MAX_EXP - 52)  * 0.5 ) );
-    const double scale_sml = pow( ( double )FLT_RADIX, - floor( ( DBL_MIN_EXP - 53 ) * 0.5 ) );
-    const double scale_big = pow( ( double )FLT_RADIX,  - ceil( ( DBL_MAX_EXP + 52 ) * 0.5 ) );
-
-    double scale;
-    double abs_chi;
-    bool isbig = false;
-
-    if ( n > 2 )
-    {
-        // Constants used for comparisons.
-        v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1;
-        temp.v = _mm256_set1_pd( -0.0 );
-        thres_sml_vec.v = _mm256_set1_pd( thres_sml );
-        thres_big_vec.v = _mm256_set1_pd( thres_big );
-        v4df_t x0v, x1v, mask_vec0, mask_vec1;
-        zerov.v  = _mm256_setzero_pd();
-
-        // Partial sums used for scaling.
-        v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1;
-        sum_med_vec0.v = _mm256_setzero_pd();
-        sum_big_vec0.v = _mm256_setzero_pd();
-        sum_sml_vec0.v = _mm256_setzero_pd();
-        sum_med_vec1.v = _mm256_setzero_pd();
-        sum_big_vec1.v = _mm256_setzero_pd();
-        sum_sml_vec1.v = _mm256_setzero_pd();
-
-        for (; ( i + 4 ) <= n; i = i + 4)
-        {
-            x0v.v = _mm256_loadu_pd( (double*) xt );
-            x1v.v = _mm256_loadu_pd( (double*) (xt + 2) );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_pd( temp.v, x1v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            if ( bli_horizontal_or_df( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256_df( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_df( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-
-            if ( !bli_horizontal_or_df( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_pd( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_df( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
-                    ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
-                    sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); 
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
-                        ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
-                        sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-
-            xt += 4;
-        }
-
-        for ( ; ( i + 2 ) <= n; i = i + 2 )
-        {
-            x0v.v = _mm256_loadu_pd( (double*) xt );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or_df( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or_df( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-            xt += 2;
-        }
-
-        sum_sml_vec0.v = _mm256_add_pd( sum_sml_vec0.v, sum_sml_vec1.v );
-        sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v );
-        sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v );
-
-        sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1]
-                + sum_sml_vec0.v[2] + sum_sml_vec0.v[3];
-        sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1]
-                + sum_med_vec0.v[2] + sum_med_vec0.v[3];
-        sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1]
-                + sum_big_vec0.v[2] + sum_big_vec0.v[3];
-    }
-
-    n_remainder = n - i;
-    bool hasInf = false;
-    double chi_r, chi_i;
-    if ( ( n_remainder > 0 ) )
-    {
-        // Put first the most likely to happen to avoid evaluations on if statements.
-        for (i = 0; i < n_remainder; i++)
-        {
-            // Get real and imaginary component of the vector element.
-            bli_zdgets(*xt, chi_r, chi_i);
-
-            // Start with accumulating the real component of the vector element.
-            abs_chi = bli_fabs( chi_r );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if ( ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-
-            // Accumulate the imaginary component of the vector element.
-            abs_chi = bli_fabs( chi_i );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-                {
-                    #ifdef BLIS_ENABLE_MEM_TRACING
-                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-                    #endif
-                    // Return the buffer to pool.
-                    bli_membrk_release( &rntm , &mem_bufX );
-                }
-
-                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if ( ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-
-            xt++;
-        }
-    }
-
-    // Early return if there is an Inf.
-    if ( hasInf ) 
-    {        
-        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-        {
-            #ifdef BLIS_ENABLE_MEM_TRACING
-                printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
-            #endif
-            // Return the buffer to pool.
-            bli_membrk_release( &rntm , &mem_bufX );
-        }
-
-        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-        return;
-    }
-
-    // Combine accumulators.
-    if ( isbig )
-    {
-        // Combine sum_big and sum_med if sum_med > 0.
-        if ( sum_med > 0.0 )
-        {
-            sum_big += ( sum_med * scale_big ) * scale_big;
-        }
-        scale = 1.0 / scale_big;
-        sumsq = sum_big;
-    }
-
-    else if ( sum_sml > 0.0 )
-    {
-        // Combine sum_med and sum_sml if sum_sml>0.
-        if ( sum_med > 0.0 )
-        {
-            sum_med = sqrt( sum_med );
-            sum_sml = sqrt( sum_sml ) / scale_sml;
-            double ymin, ymax;
-            if ( sum_sml > sum_med )
-            {
-                ymin = sum_med;
-                ymax = sum_sml;
-            }
-            else
-            {
-                ymin = sum_sml;
-                ymax = sum_med;
-            }
-            scale = 1.0;
-            sumsq = ymax * ymax * ( 1.0 + ( ymin / ymax ) * ( ymin / ymax ) );
-        }
-        else
-        {
-            scale = 1.0 / scale_sml;
-            sumsq = sum_sml;
-        }
-    }
-    else
-    {
-        // If all values are mid-range:
-        scale = 1.0;
-        sumsq = sum_med;
-    }
-
-    *norm = scale * sqrt( sumsq );
-
-    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-    {
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_dznorm2fv_unb_var1(): releasing mem pool block\n" );
-        #endif
-        // Return the buffer to pool.
-        bli_membrk_release( &rntm , &mem_bufX );
-    }
-
-    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-
-    return;
-}
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "immintrin.h"
+#include "blis.h"
+
+// Union data structure to access AVX registers
+// One 256-bit AVX register holds 8 SP elements. 
+typedef union
+{
+    __m256  v;
+    float   f[8] __attribute__( ( aligned( 64 ) ) );
+} v8sf_t;
+
+// Union data structure to access AVX registers
+// One 256-bit AVX register holds 4 DP elements. 
+typedef union
+{
+    __m256d v;
+    double  d[4] __attribute__( ( aligned( 64 ) ) );
+} v4df_t;
+
+// Return a mask which indicates either:
+// v <= t or v >= T
+#define CMP256_sf( v, t, T ) \
+	_mm256_or_ps( _mm256_cmp_ps( v, t, _CMP_LE_OS ), _mm256_cmp_ps( v, T, _CMP_GE_OS ) );
+
+#define CMP256_df( v, t, T ) \
+	_mm256_or_pd( _mm256_cmp_pd( v, t, _CMP_LE_OS ), _mm256_cmp_pd( v, T, _CMP_GE_OS ) );
+
+// Returns true if any of the values in the mask vector a is true, 
+// and false, otherwise.
+// In more detail, __mm256_testz_ps() performs the bitwise (a AND b) operation and returns:
+//    1 if the sign bit of all bitwise operations is 0,
+//    0 if at least one of the sign bits of each bitwise operation is 1.
+// The sign bit of (a AND a) will be 1 iff the sign bit of a is 1, and 0 otherwise.
+// That means that __mm256_testz_ps(a,a) returns:
+//    1 if the sign bit of all elements in a is 0,
+//    0 if at least one of the sign bits of a is 1.
+// Because of the negation, bli_horizontal_or_sf() returns:
+//    0 if the sign bit of all elements in a is 0,
+//    1 if at least one of the sign bits of a is 1. 
+// Since a is the result of a masking operation, bli_horizontal_or_sf() returns:
+//    0 (false) if the mask is false for all elements in a,
+//    1 (true)  if the mask is true for at least one element in a.
+static inline bool bli_horizontal_or_sf( __m256  a ) { return ! _mm256_testz_ps( a, a ); }
+static inline bool bli_horizontal_or_df( __m256d a ) { return ! _mm256_testz_pd( a, a ); }
+
+float horizontal_add_sf(__m256 const a) {
+    __m256 t1 = _mm256_hadd_ps(a, a);
+    __m256 t2 = _mm256_hadd_ps(t1,t1);
+    __m128 t3 = _mm256_extractf128_ps(t2,1);
+    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2),t3);
+    return _mm_cvtss_f32(t4); // sign extend to 32 bits
+}
+
+// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
+void bli_snorm2fv_unb_var1_avx2
+    (
+       dim_t    n,
+       float*   x, inc_t incx,
+       float* norm,
+       cntx_t*  cntx
+    )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
+    
+    float sumsq = 0.0f;
+    dim_t i = 0;
+    dim_t n_remainder = 0;
+    float  *x_buf = x;
+
+    // Memory pool declarations for packing vector X.
+    // Initialize mem pool buffer to NULL and size to 0.
+    // "buf" and "size" fields are assigned once memory
+    // is allocated from the pool in bli_membrk_acquire_m().
+    // This will ensure bli_mem_is_alloc() will be passed on
+    // an allocated memory if created or a NULL.
+    mem_t   mem_bufX = {0};
+    rntm_t  rntm;
+
+    // Packing for non-unit strided vector x.
+    if ( incx != 1 )
+    {
+        // In order to get the buffer from pool via rntm access to memory broker
+        //is needed. Following are initializations for rntm.
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Calculate the size required for "n" float elements in vector x.
+        size_t buffer_size = n * sizeof( float );
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_snorm2fv_unb_var1_avx2(): get mem pool block\n" );
+        #endif
+
+        // Acquire a Buffer(n*size(float)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_membrk_acquire_m
+        (
+            &rntm,
+            buffer_size,
+            BLIS_BUFFER_FOR_B_PANEL,
+            &mem_bufX
+        );
+
+        // Continue packing X if buffer memory is allocated.
+        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
+        {
+            x_buf = bli_mem_buffer( &mem_bufX );
+            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
+            for ( dim_t x_index = 0; x_index < n; x_index++ )
+            {
+                *( x_buf + x_index ) = *( x + ( x_index * incx ) );
+            }
+        }
+    }
+
+    float *xt = x_buf;
+
+    // Compute the sum of squares on 3 accumulators to avoid overflow
+    // and underflow, depending on the vector element value.
+    // Accumulator for small values; using scaling to avoid underflow.
+    float sum_sml = 0.0f;
+    // Accumulator for medium values; no scaling required.
+    float sum_med = 0.0f;
+    // Accumulator for big values; using scaling to avoid overflow.
+    float sum_big = 0.0f;
+
+    // Constants chosen to minimize roundoff, according to Blue's algorithm.
+    const float thres_sml = powf( ( float )FLT_RADIX,    ceilf( ( FLT_MIN_EXP - 1 )  * 0.5f ) );
+    const float thres_big = powf( ( float )FLT_RADIX,   floorf( ( FLT_MAX_EXP - 23)  * 0.5f ) );
+    const float scale_sml = powf( ( float )FLT_RADIX, - floorf( ( FLT_MIN_EXP - 24 ) * 0.5f ) );
+    const float scale_big = powf( ( float )FLT_RADIX,  - ceilf( ( FLT_MAX_EXP + 23 ) * 0.5f ) );
+
+    float scale = 1.0f;
+    float abs_chi;
+    bool isbig = false;
+
+    if ( n >= 64 )
+    {
+        // Constants used for comparisons.
+        v8sf_t temp, thres_sml_vec, thres_big_vec, zerov;
+        temp.v = _mm256_set1_ps( -0.0f );
+        thres_sml_vec.v = _mm256_set1_ps( thres_sml );
+        thres_big_vec.v = _mm256_set1_ps( thres_big );
+        v8sf_t x0v, x1v, x2v, x3v;
+        v8sf_t y0v, y1v, y2v, y3v;
+        v8sf_t mask_vec0, mask_vec1, mask_vec2, mask_vec3;
+        zerov.v  = _mm256_setzero_ps();
+
+        // Partial sums used for scaling.
+        v8sf_t sum_sml_vec0, sum_sml_vec1, sum_sml_vec2, sum_sml_vec3;
+        sum_sml_vec0.v = _mm256_setzero_ps();
+        sum_sml_vec1.v = _mm256_setzero_ps();
+        sum_sml_vec2.v = _mm256_setzero_ps();
+        sum_sml_vec3.v = _mm256_setzero_ps();
+
+        v8sf_t sum_med_vec0, sum_med_vec1, sum_med_vec2, sum_med_vec3;
+        sum_med_vec0.v = _mm256_setzero_ps();
+        sum_med_vec1.v = _mm256_setzero_ps();
+        sum_med_vec2.v = _mm256_setzero_ps();
+        sum_med_vec3.v = _mm256_setzero_ps();
+
+        v8sf_t sum_big_vec0, sum_big_vec1, sum_big_vec2, sum_big_vec3;
+        sum_big_vec0.v = _mm256_setzero_ps();
+        sum_big_vec1.v = _mm256_setzero_ps();
+        sum_big_vec2.v = _mm256_setzero_ps();
+        sum_big_vec3.v = _mm256_setzero_ps();
+
+        for (; ( i + 32 ) <= n; i = i + 32)
+        {
+            x0v.v = _mm256_loadu_ps( xt );
+            x1v.v = _mm256_loadu_ps( xt + 8 );
+            x2v.v = _mm256_loadu_ps( xt + 16 );
+            x3v.v = _mm256_loadu_ps( xt + 24 );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
+            x3v.v = _mm256_andnot_ps( temp.v, x3v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
+            mask_vec3.v = _mm256_cmp_ps(x3v.v, x3v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec3.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec3.v = CMP256_sf( x3v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+                
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec2.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {                    
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec3.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec3.v = _mm256_fmadd_ps( x3v.v, x3v.v, sum_med_vec3.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec3.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
+                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
+                    y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
+                    sum_big_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_big_vec3.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
+                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
+                        y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
+                        sum_sml_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_sml_vec3.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 32;
+        }
+
+        for (; ( i + 24 ) <= n; i = i + 24)
+        {
+            x0v.v = _mm256_loadu_ps( xt );
+            x1v.v = _mm256_loadu_ps( xt + 8 );
+            x2v.v = _mm256_loadu_ps( xt + 16 );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec2.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            
+            xt += 24;
+        }
+
+        for (; ( i + 16 ) <= n; i = i + 16)
+        {
+            x0v.v = _mm256_loadu_ps( xt );
+            x1v.v = _mm256_loadu_ps( xt + 8 );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 16;
+        }
+        
+        // This seems to not be improving performance.
+        #if 0
+        for (; ( i + 8 ) <= n; i = i + 8)
+        {
+            x0v.v = _mm256_loadu_ps( xt );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 8;
+        }
+        #endif
+
+        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec1.v );
+        sum_sml_vec2.v = _mm256_add_ps( sum_sml_vec2.v, sum_sml_vec3.v );
+        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec2.v ); 
+        sum_sml = horizontal_add_sf(sum_sml_vec0.v);
+
+        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec1.v );
+        sum_med_vec2.v = _mm256_add_ps( sum_med_vec2.v, sum_med_vec3.v );
+        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec2.v );
+        sum_med = horizontal_add_sf(sum_med_vec0.v);
+
+        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec1.v );
+        sum_big_vec2.v = _mm256_add_ps( sum_big_vec2.v, sum_big_vec3.v );
+        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec2.v );
+        sum_big = horizontal_add_sf(sum_big_vec0.v);
+    }
+
+    n_remainder = n - i;
+    bool hasInf = false;
+
+    if ( ( n_remainder > 0 ) )
+    {
+        // Put first the most likely to happen to avoid evaluations on if statements.
+        for (i = 0; i < n_remainder; i++)
+        {
+            abs_chi = bli_fabs( *xt );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+            xt++;
+        }
+    }
+    // Early return if there is an Inf.
+    if ( hasInf ) 
+    {
+        
+        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+        {
+            #ifdef BLIS_ENABLE_MEM_TRACING
+                printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+            #endif
+            // Return the buffer to pool.
+            bli_membrk_release( &rntm , &mem_bufX );
+        }
+
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+    
+    // Combine accumulators.
+    if ( isbig )
+    {
+        // Combine sum_big and sum_med if sum_med > 0.
+        if ( sum_med > 0.0f )
+        {
+            sum_big += ( sum_med * scale_big ) * scale_big;
+        }
+        scale = 1.0f / scale_big;
+        sumsq = sum_big;
+    }
+    else if ( sum_sml > 0.0f )
+    {
+        // Combine sum_med and sum_sml if sum_sml>0.
+        if ( sum_med > 0.0f )
+        {
+            sum_med = sqrtf( sum_med );
+            sum_sml = sqrtf( sum_sml ) / scale_sml;
+            float ymin, ymax;
+            if ( sum_sml > sum_med )
+            {
+                ymin = sum_med;
+                ymax = sum_sml;
+            }
+            else
+            {
+                ymin = sum_sml;
+                ymax = sum_med;
+            }
+            scale = 1.0f;
+            sumsq = ymax * ymax * ( 1.0f + ( ymin / ymax ) * ( ymin / ymax ) );
+        }
+        else
+        {
+            scale = 1.0f / scale_sml;
+            sumsq = sum_sml;
+        }
+    }
+    else
+    {
+        // If all values are mid-range:
+        scale = 1.0f;
+        sumsq = sum_med;
+    }
+
+    *norm = scale * sqrtf( sumsq );
+
+    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+    {
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+        #endif
+        // Return the buffer to pool.
+        bli_membrk_release( &rntm , &mem_bufX );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+
+    return;
+}
+
+// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
+void bli_scnorm2fv_unb_var1_avx2
+    (
+       dim_t    n,
+       scomplex*   x, inc_t incx,
+       float* norm,
+       cntx_t*  cntx
+    )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
+
+    float sumsq = 0.0f;
+    dim_t i = 0;
+    dim_t n_remainder = 0;
+    scomplex  *x_buf = x;
+
+    // Memory pool declarations for packing vector X.
+    // Initialize mem pool buffer to NULL and size to 0.
+    // "buf" and "size" fields are assigned once memory
+    // is allocated from the pool in bli_membrk_acquire_m().
+    // This will ensure bli_mem_is_alloc() will be passed on
+    // an allocated memory if created or a NULL.
+    mem_t   mem_bufX = {0};
+    rntm_t  rntm;
+
+    // Packing for non-unit strided vector x.
+    if ( incx != 1 )
+    {
+        // In order to get the buffer from pool via rntm access to memory broker
+        //is needed. Following are initializations for rntm.
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Calculate the size required for "n" scomplex elements in vector x.
+        size_t buffer_size = n * sizeof( scomplex );
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_scnorm2fv_unb_var1_avx2(): get mem pool block\n" );
+        #endif
+
+        // Acquire a Buffer(n*size(scomplex)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_membrk_acquire_m
+        (
+            &rntm,
+            buffer_size,
+            BLIS_BUFFER_FOR_B_PANEL,
+            &mem_bufX
+        );
+
+        // Continue packing X if buffer memory is allocated.
+        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
+        {
+            x_buf = bli_mem_buffer( &mem_bufX );
+            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
+            for ( dim_t x_index = 0; x_index < n; x_index++ )
+            {
+                *( x_buf + x_index ) = *( x + ( x_index * incx ) );
+            }
+        }
+    }
+
+    scomplex *xt = x_buf;
+
+    // Compute the sum of squares on 3 accumulators to avoid overflow
+    // and underflow, depending on the vector element value.
+    // Accumulator for small values; using scaling to avoid underflow.
+    float sum_sml = 0.0f;
+   // Accumulator for medium values; no scaling required.
+    float sum_med = 0.0f;
+    // Accumulator for big values; using scaling to avoid overflow.
+    float sum_big = 0.0f;
+
+    // Constants chosen to minimize roundoff, according to Blue's algorithm.
+    const float thres_sml = powf( ( float )FLT_RADIX,    ceilf( ( FLT_MIN_EXP - 1 )  * 0.5f ) );
+    const float thres_big = powf( ( float )FLT_RADIX,   floorf( ( FLT_MAX_EXP - 23)  * 0.5f ) );
+    const float scale_sml = powf( ( float )FLT_RADIX, - floorf( ( FLT_MIN_EXP - 24 ) * 0.5f ) );
+    const float scale_big = powf( ( float )FLT_RADIX,  - ceilf( ( FLT_MAX_EXP + 23 ) * 0.5f ) );
+
+    float scale = 1.0f;
+    float abs_chi;
+    bool isbig = false;
+
+    if ( n >= 64 )
+    {
+        // Constants used for comparisons.
+        v8sf_t temp, thres_sml_vec, thres_big_vec, zerov;
+        temp.v = _mm256_set1_ps( -0.0f );
+        thres_sml_vec.v = _mm256_set1_ps( thres_sml );
+        thres_big_vec.v = _mm256_set1_ps( thres_big );
+        v8sf_t x0v, x1v, x2v, x3v;
+        v8sf_t y0v, y1v, y2v, y3v;
+        v8sf_t mask_vec0, mask_vec1, mask_vec2, mask_vec3;
+        zerov.v  = _mm256_setzero_ps();
+
+        // Partial sums used for scaling.
+        v8sf_t sum_sml_vec0, sum_sml_vec1, sum_sml_vec2, sum_sml_vec3;
+        sum_sml_vec0.v = _mm256_setzero_ps();
+        sum_sml_vec1.v = _mm256_setzero_ps();
+        sum_sml_vec2.v = _mm256_setzero_ps();
+        sum_sml_vec3.v = _mm256_setzero_ps();
+
+        v8sf_t sum_med_vec0, sum_med_vec1, sum_med_vec2, sum_med_vec3;
+        sum_med_vec0.v = _mm256_setzero_ps();
+        sum_med_vec1.v = _mm256_setzero_ps();
+        sum_med_vec2.v = _mm256_setzero_ps();
+        sum_med_vec3.v = _mm256_setzero_ps();
+
+        v8sf_t sum_big_vec0, sum_big_vec1, sum_big_vec2, sum_big_vec3;
+        sum_big_vec0.v = _mm256_setzero_ps();
+        sum_big_vec1.v = _mm256_setzero_ps();
+        sum_big_vec2.v = _mm256_setzero_ps();
+        sum_big_vec3.v = _mm256_setzero_ps();
+
+        for (; ( i + 16 ) <= n; i = i + 16)
+        {
+            x0v.v = _mm256_loadu_ps( (float*) xt );
+            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
+            x2v.v = _mm256_loadu_ps( (float*) (xt + 8) );
+            x3v.v = _mm256_loadu_ps( (float*) (xt + 12) );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
+            x3v.v = _mm256_andnot_ps( temp.v, x3v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
+            mask_vec3.v = _mm256_cmp_ps(x3v.v, x3v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec3.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec3.v = CMP256_sf( x3v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+                
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec2.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {                    
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+
+            if ( !bli_horizontal_or_sf( mask_vec3.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec3.v = _mm256_fmadd_ps( x3v.v, x3v.v, sum_med_vec3.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec3.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
+                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
+                    y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
+                    sum_big_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_big_vec3.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
+                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
+                        y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
+                        sum_sml_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_sml_vec3.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 16;
+        }
+
+        for (; ( i + 12 ) <= n; i = i + 12)
+        {
+            x0v.v = _mm256_loadu_ps( (float*)xt );
+            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
+            x2v.v = _mm256_loadu_ps( (float*) (xt + 8) );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec2.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            
+            xt += 12;
+        }
+        
+        for (; ( i + 8 ) <= n; i = i + 8)
+        {
+            x0v.v = _mm256_loadu_ps( (float*)xt );
+            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 8;
+        }
+        // This seems to not be improving performance.
+        #if 0
+        for (; ( i + 4 ) <= n; i = i + 4)
+        {
+            x0v.v = _mm256_loadu_ps( (float*)xt );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 4;
+        }
+        #endif
+
+        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec1.v );
+        sum_sml_vec2.v = _mm256_add_ps( sum_sml_vec2.v, sum_sml_vec3.v );
+        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec2.v ); 
+        sum_sml = horizontal_add_sf(sum_sml_vec0.v);
+
+        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec1.v );
+        sum_med_vec2.v = _mm256_add_ps( sum_med_vec2.v, sum_med_vec3.v );
+        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec2.v );
+        sum_med = horizontal_add_sf(sum_med_vec0.v);
+
+        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec1.v );
+        sum_big_vec2.v = _mm256_add_ps( sum_big_vec2.v, sum_big_vec3.v );
+        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec2.v );
+        sum_big = horizontal_add_sf(sum_big_vec0.v);
+    }
+
+    n_remainder = n - i;
+    bool hasInf = false;
+    double chi_r, chi_i;
+    if ( ( n_remainder > 0 ) )
+    {
+        // Put first the most likely to happen to avoid evaluations on if statements.
+        for (i = 0; i < n_remainder; i++)
+        {
+            // Get real and imaginary component of the vector element.            
+            bli_csgets(*xt, chi_r, chi_i);
+            // Start with accumulating the real component of the vector element.
+            abs_chi = bli_fabs( chi_r );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+            // Accumulate the imaginary component of the vector element.
+            abs_chi = bli_fabs( chi_i );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+
+            xt++;
+        }
+    }
+    // Early return if there is an Inf.
+    if ( hasInf ) 
+    {
+        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+        {
+            #ifdef BLIS_ENABLE_MEM_TRACING
+                printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+            #endif
+            // Return the buffer to pool.
+            bli_membrk_release( &rntm , &mem_bufX );
+        }
+
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+    
+    // Combine accumulators.
+    if ( isbig )
+    {
+        // Combine sum_big and sum_med if sum_med > 0.
+        if ( sum_med > 0.0f )
+        {
+            sum_big += ( sum_med * scale_big ) * scale_big;
+        }
+        scale = 1.0f / scale_big;
+        sumsq = sum_big;
+    }
+    else if ( sum_sml > 0.0f )
+    {
+        // Combine sum_med and sum_sml if sum_sml>0.
+        if ( sum_med > 0.0f )
+        {
+            sum_med = sqrtf( sum_med );
+            sum_sml = sqrtf( sum_sml ) / scale_sml;
+            float ymin, ymax;
+            if ( sum_sml > sum_med )
+            {
+                ymin = sum_med;
+                ymax = sum_sml;
+            }
+            else
+            {
+                ymin = sum_sml;
+                ymax = sum_med;
+            }
+            scale = 1.0f;
+            sumsq = ymax * ymax * ( 1.0f + ( ymin / ymax ) * ( ymin / ymax ) );
+        }
+        else
+        {
+            scale = 1.0f / scale_sml;
+            sumsq = sum_sml;
+        }
+    }
+    else
+    {
+        // If all values are mid-range:
+        scale = 1.0f;
+        sumsq = sum_med;
+    }
+
+    *norm = scale * sqrtf( sumsq );
+
+    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+    {
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+        #endif
+        // Return the buffer to pool.
+        bli_membrk_release( &rntm , &mem_bufX );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+
+    return;
+}
+
+// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
+void bli_dnorm2fv_unb_var1_avx2
+    (
+       dim_t    n,
+       double*   x, inc_t incx,
+       double* norm,
+       cntx_t*  cntx
+    )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
+
+    double sumsq = 0;
+    dim_t i = 0;
+    dim_t n_remainder = 0;
+    double  *x_buf = x;
+
+    // Early return if n<=0 or incx=0
+    if ( ( n <= 0) || ( incx == 0 ) )
+    {
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+
+    // Memory pool declarations for packing vector X.
+    // Initialize mem pool buffer to NULL and size to 0.
+    // "buf" and "size" fields are assigned once memory
+    // is allocated from the pool in bli_membrk_acquire_m().
+    // This will ensure bli_mem_is_alloc() will be passed on
+    // an allocated memory if created or a NULL.
+    mem_t   mem_bufX = {0};
+    rntm_t  rntm;
+
+    // Packing for non-unit strided vector x.
+    if ( incx != 1 )
+    {
+        // In order to get the buffer from pool via rntm access to memory broker
+        //is needed. Following are initializations for rntm.
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Calculate the size required for "n" double elements in vector x.
+        size_t buffer_size = n * sizeof( double );
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dnorm2fv_unb_var1(): get mem pool block\n" );
+        #endif
+
+        // Acquire a Buffer(n*size(double)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_membrk_acquire_m
+        (
+            &rntm,
+            buffer_size,
+            BLIS_BUFFER_FOR_B_PANEL,
+            &mem_bufX
+        );
+
+        // Continue packing X if buffer memory is allocated.
+        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
+        {
+            x_buf = bli_mem_buffer( &mem_bufX );
+            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
+            for ( dim_t x_index = 0; x_index < n; x_index++ )
+            {
+                *( x_buf + x_index ) = *( x + ( x_index * incx ) );
+            }
+        }
+    }
+
+    double *xt = x_buf;
+
+    // Compute the sum of squares on 3 accumulators to avoid overflow
+    // and underflow, depending on the vector element value.
+    // Accumulator for small values; using scaling to avoid underflow.
+    double sum_sml = 0;
+   // Accumulator for medium values; no scaling required.
+    double sum_med = 0;
+    // Accumulator for big values; using scaling to avoid overflow.
+    double sum_big = 0;
+
+    // Constants chosen to minimize roundoff, according to Blue's algorithm.
+    const double thres_sml = pow( ( double )FLT_RADIX,    ceil( ( DBL_MIN_EXP - 1 )  * 0.5 ) );
+    const double thres_big = pow( ( double )FLT_RADIX,   floor( ( DBL_MAX_EXP - 52)  * 0.5 ) );
+    const double scale_sml = pow( ( double )FLT_RADIX, - floor( ( DBL_MIN_EXP - 53 ) * 0.5 ) );
+    const double scale_big = pow( ( double )FLT_RADIX,  - ceil( ( DBL_MAX_EXP + 52 ) * 0.5 ) );
+
+    double scale;
+    double abs_chi;
+    bool isbig = false;
+
+    if ( n > 4 )
+    {
+        // Constants used for comparisons.
+        v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1;
+        temp.v = _mm256_set1_pd( -0.0 );
+        thres_sml_vec.v = _mm256_set1_pd( thres_sml );
+        thres_big_vec.v = _mm256_set1_pd( thres_big );
+        v4df_t x0v, x1v, mask_vec0, mask_vec1;
+        zerov.v  = _mm256_setzero_pd();
+
+        // Partial sums used for scaling.
+        v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1;
+        sum_med_vec0.v = _mm256_setzero_pd();
+        sum_big_vec0.v = _mm256_setzero_pd();
+        sum_sml_vec0.v = _mm256_setzero_pd();
+        sum_med_vec1.v = _mm256_setzero_pd();
+        sum_big_vec1.v = _mm256_setzero_pd();
+        sum_sml_vec1.v = _mm256_setzero_pd();
+
+        for (; ( i + 8 ) <= n; i = i + 8)
+        {
+            x0v.v = _mm256_loadu_pd( xt );
+            x1v.v = _mm256_loadu_pd( xt + 4 );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_pd( temp.v, x1v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_df( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_df( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); 
+                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+
+            if ( !bli_horizontal_or_df( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_pd( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v ); 
+                    ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
+                    sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); 
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
+                        ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
+                        sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+
+            xt += 8;
+        }
+
+        for ( ; ( i + 4 ) <= n; i = i + 4 )
+        {
+            x0v.v = _mm256_loadu_pd( xt );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+            xt += 4;
+        }
+
+        sum_sml_vec0.v = _mm256_add_pd( sum_sml_vec0.v, sum_sml_vec1.v );
+        sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v );
+        sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v );
+
+        sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1]
+                + sum_sml_vec0.v[2] + sum_sml_vec0.v[3];
+        sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1]
+                + sum_med_vec0.v[2] + sum_med_vec0.v[3];
+        sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1]
+                + sum_big_vec0.v[2] + sum_big_vec0.v[3];
+    }
+
+    n_remainder = n - i;
+    bool hasInf = false;
+    if ( ( n_remainder > 0 ) )
+    {
+        // Put first the most likely to happen to avoid evaluations on if statements.
+        for (i = 0; i < n_remainder; i++)
+        {
+            abs_chi = bli_fabs( *xt );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+            xt++;
+        }
+    }
+
+    // Early return if there is an Inf.
+    if ( hasInf ) 
+    {        
+        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+        {
+            #ifdef BLIS_ENABLE_MEM_TRACING
+                printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+            #endif
+            // Return the buffer to pool.
+            bli_membrk_release( &rntm , &mem_bufX );
+        }
+
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+
+    // Combine accumulators.
+    if ( isbig )
+    {
+        // Combine sum_big and sum_med if sum_med > 0.
+        if ( sum_med > 0.0 )
+        {
+            sum_big += ( sum_med * scale_big ) * scale_big;
+        }
+        scale = 1.0 / scale_big;
+        sumsq = sum_big;
+    }
+
+    else if ( sum_sml > 0.0 )
+    {
+        // Combine sum_med and sum_sml if sum_sml>0.
+        if ( sum_med > 0.0 )
+        {
+            sum_med = sqrt( sum_med );
+            sum_sml = sqrt( sum_sml ) / scale_sml;
+            double ymin, ymax;
+            if ( sum_sml > sum_med )
+            {
+                ymin = sum_med;
+                ymax = sum_sml;
+            }
+            else
+            {
+                ymin = sum_sml;
+                ymax = sum_med;
+            }
+            scale = 1.0;
+            sumsq = ymax * ymax * ( 1.0 + ( ymin / ymax ) * ( ymin / ymax ) );
+        }
+        else
+        {
+            scale = 1.0 / scale_sml;
+            sumsq = sum_sml;
+        }
+    }
+    else
+    {
+        // If all values are mid-range:
+        scale = 1.0;
+        sumsq = sum_med;
+    }
+
+    *norm = scale * sqrt( sumsq );
+
+    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+    {
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dnorm2fv_unb_var1(): releasing mem pool block\n" );
+        #endif
+        // Return the buffer to pool.
+        bli_membrk_release( &rntm , &mem_bufX );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+
+    return;
+}
+
+// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
+void bli_dznorm2fv_unb_var1_avx2
+    (
+       dim_t    n,
+       dcomplex*   x, inc_t incx,
+       double* norm,
+       cntx_t*  cntx
+    )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
+
+    double sumsq = 0;
+    dim_t i = 0;
+    dim_t n_remainder = 0;
+    dcomplex  *x_buf = x;
+
+    // Early return if n<=0 or incx=0
+    if ( ( n <= 0) || ( incx == 0 ) )
+    {
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+
+    // Memory pool declarations for packing vector X.
+    // Initialize mem pool buffer to NULL and size to 0.
+    // "buf" and "size" fields are assigned once memory
+    // is allocated from the pool in bli_membrk_acquire_m().
+    // This will ensure bli_mem_is_alloc() will be passed on
+    // an allocated memory if created or a NULL.
+    mem_t   mem_bufX = {0};
+    rntm_t  rntm;
+
+    // Packing for non-unit strided vector x.
+    if ( incx != 1 )
+    {
+        // In order to get the buffer from pool via rntm access to memory broker
+        //is needed. Following are initializations for rntm.
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Calculate the size required for "n" dcomplex elements in vector x.
+        size_t buffer_size = n * sizeof( dcomplex );
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dznorm2fv_unb_var1(): get mem pool block\n" );
+        #endif
+
+        // Acquire a Buffer(n*size(dcomplex)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_membrk_acquire_m
+        (
+            &rntm,
+            buffer_size,
+            BLIS_BUFFER_FOR_B_PANEL,
+            &mem_bufX
+        );
+
+        // Continue packing X if buffer memory is allocated.
+        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
+        {
+            x_buf = bli_mem_buffer( &mem_bufX );
+            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
+            for ( dim_t x_index = 0; x_index < n; x_index++ )
+            {
+                *( x_buf + x_index ) = *( x + ( x_index * incx ) );
+            }
+        }
+    }
+
+    dcomplex *xt = x_buf;
+
+    // Compute the sum of squares on 3 accumulators to avoid overflow
+    // and underflow, depending on the vector element value.
+    // Accumulator for small values; using scaling to avoid underflow.
+    double sum_sml = 0;
+   // Accumulator for medium values; no scaling required.
+    double sum_med = 0;
+    // Accumulator for big values; using scaling to avoid overflow.
+    double sum_big = 0;
+
+    // Constants chosen to minimize roundoff, according to Blue's algorithm.
+    const double thres_sml = pow( ( double )FLT_RADIX,    ceil( ( DBL_MIN_EXP - 1 )  * 0.5 ) );
+    const double thres_big = pow( ( double )FLT_RADIX,   floor( ( DBL_MAX_EXP - 52)  * 0.5 ) );
+    const double scale_sml = pow( ( double )FLT_RADIX, - floor( ( DBL_MIN_EXP - 53 ) * 0.5 ) );
+    const double scale_big = pow( ( double )FLT_RADIX,  - ceil( ( DBL_MAX_EXP + 52 ) * 0.5 ) );
+
+    double scale;
+    double abs_chi;
+    bool isbig = false;
+
+    if ( n > 2 )
+    {
+        // Constants used for comparisons.
+        v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1;
+        temp.v = _mm256_set1_pd( -0.0 );
+        thres_sml_vec.v = _mm256_set1_pd( thres_sml );
+        thres_big_vec.v = _mm256_set1_pd( thres_big );
+        v4df_t x0v, x1v, mask_vec0, mask_vec1;
+        zerov.v  = _mm256_setzero_pd();
+
+        // Partial sums used for scaling.
+        v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1;
+        sum_med_vec0.v = _mm256_setzero_pd();
+        sum_big_vec0.v = _mm256_setzero_pd();
+        sum_sml_vec0.v = _mm256_setzero_pd();
+        sum_med_vec1.v = _mm256_setzero_pd();
+        sum_big_vec1.v = _mm256_setzero_pd();
+        sum_sml_vec1.v = _mm256_setzero_pd();
+
+        for (; ( i + 4 ) <= n; i = i + 4)
+        {
+            x0v.v = _mm256_loadu_pd( (double*) xt );
+            x1v.v = _mm256_loadu_pd( (double*) (xt + 2) );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_pd( temp.v, x1v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_df( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_df( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+
+            if ( !bli_horizontal_or_df( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_pd( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
+                    ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
+                    sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); 
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
+                        ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
+                        sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+
+            xt += 4;
+        }
+
+        for ( ; ( i + 2 ) <= n; i = i + 2 )
+        {
+            x0v.v = _mm256_loadu_pd( (double*) xt );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+            xt += 2;
+        }
+
+        sum_sml_vec0.v = _mm256_add_pd( sum_sml_vec0.v, sum_sml_vec1.v );
+        sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v );
+        sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v );
+
+        sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1]
+                + sum_sml_vec0.v[2] + sum_sml_vec0.v[3];
+        sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1]
+                + sum_med_vec0.v[2] + sum_med_vec0.v[3];
+        sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1]
+                + sum_big_vec0.v[2] + sum_big_vec0.v[3];
+    }
+
+    n_remainder = n - i;
+    bool hasInf = false;
+    double chi_r, chi_i;
+    if ( ( n_remainder > 0 ) )
+    {
+        // Put first the most likely to happen to avoid evaluations on if statements.
+        for (i = 0; i < n_remainder; i++)
+        {
+            // Get real and imaginary component of the vector element.
+            bli_zdgets(*xt, chi_r, chi_i);
+
+            // Start with accumulating the real component of the vector element.
+            abs_chi = bli_fabs( chi_r );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if ( ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+
+            // Accumulate the imaginary component of the vector element.
+            abs_chi = bli_fabs( chi_i );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if ( ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+
+            xt++;
+        }
+    }
+
+    // Early return if there is an Inf.
+    if ( hasInf ) 
+    {        
+        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+        {
+            #ifdef BLIS_ENABLE_MEM_TRACING
+                printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+            #endif
+            // Return the buffer to pool.
+            bli_membrk_release( &rntm , &mem_bufX );
+        }
+
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+
+    // Combine accumulators.
+    if ( isbig )
+    {
+        // Combine sum_big and sum_med if sum_med > 0.
+        if ( sum_med > 0.0 )
+        {
+            sum_big += ( sum_med * scale_big ) * scale_big;
+        }
+        scale = 1.0 / scale_big;
+        sumsq = sum_big;
+    }
+
+    else if ( sum_sml > 0.0 )
+    {
+        // Combine sum_med and sum_sml if sum_sml>0.
+        if ( sum_med > 0.0 )
+        {
+            sum_med = sqrt( sum_med );
+            sum_sml = sqrt( sum_sml ) / scale_sml;
+            double ymin, ymax;
+            if ( sum_sml > sum_med )
+            {
+                ymin = sum_med;
+                ymax = sum_sml;
+            }
+            else
+            {
+                ymin = sum_sml;
+                ymax = sum_med;
+            }
+            scale = 1.0;
+            sumsq = ymax * ymax * ( 1.0 + ( ymin / ymax ) * ( ymin / ymax ) );
+        }
+        else
+        {
+            scale = 1.0 / scale_sml;
+            sumsq = sum_sml;
+        }
+    }
+    else
+    {
+        // If all values are mid-range:
+        scale = 1.0;
+        sumsq = sum_med;
+    }
+
+    *norm = scale * sqrt( sumsq );
+
+    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+    {
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dznorm2fv_unb_var1(): releasing mem pool block\n" );
+        #endif
+        // Return the buffer to pool.
+        bli_membrk_release( &rntm , &mem_bufX );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+
+    return;
+}