Improved fringe case handling for AXPYV kernel (#229)

- The current kernel uses masked AVX512 instructions to handle fringe cases.
- These instructions are slow on Genoa.
- To handle sizes less than 8, AVX2 and SSE code paths have been added.
- The existing masked AVX512 code performs better when n > 8, so it is still kept for handling larger sizes where n % 8 != 0.

AMD-Internal: [CPUPL-7467]
(cherry picked from commit 1dca574a9d)
2026-04-19 23:28:52 +00:00 · 2025-10-13 11:05:30 +05:30
parent 665ad619f0
commit d9d86bbf46
1 changed files with 43 additions and 0 deletions
--- a/kernels/zen4/1/bli_axpyv_zen4_int.c
+++ b/kernels/zen4/1/bli_axpyv_zen4_int.c
@@ -302,6 +302,49 @@ BLIS_EXPORT_BLIS void bli_daxpyv_zen4_int

    if (incx == 1 && incy == 1)
    {
+        if ( n < 8 )
+        {
+            // At this point we are sure that N is not more than 7
+            // If N is at least 4, use a full AVX2 FMA
+            if ( ( i + 3 ) < n )
+            {
+                __m256d y_vec = _mm256_loadu_pd(y0);
+                y_vec = _mm256_fmadd_pd(_mm256_loadu_pd(x0), _mm256_set1_pd(*alpha), y_vec);
+                _mm256_storeu_pd(y0, y_vec);
+
+                x0 += 4;
+                y0 += 4;
+                i += 4;
+            }
+
+            // At this point remainder is not more than 3
+            // If remainder is at least 2, we use full SSE FMA
+            if ( ( i + 1 ) < n )
+            {
+                __m128d y_vec = _mm_loadu_pd(y0);
+                y_vec = _mm_fmadd_pd(_mm_loadu_pd(x0), _mm_set1_pd(*alpha), y_vec);
+                _mm_storeu_pd(y0, y_vec);
+
+                x0 += 2;
+                y0 += 2;
+                i += 2;
+            }
+
+            // At this point remainder is either 0 or 1
+            // If remainder is 1, we use SSE FMA
+            // Note: Using C code instead of SSE results in MUL+ADD which is
+            // bad for accuracy.
+            if ( i < n )
+            {
+                __m128d y_vec = _mm_load1_pd(y0);
+                y_vec = _mm_fmadd_pd(_mm_load1_pd(x0), _mm_set1_pd(*alpha), y_vec);
+                _mm_storel_pd(y0, y_vec);
+            }
+
+            AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
+            return;
+        }
+        
        __m512d xv[8], yv[8], alphav;

        // Broadcast the alpha scalar to all elements of a vector register.