Modified AXPY kernel to ensure consistency of numerical results (#188)

The current DAXPY kernel uses C code to handle the cases where n % 8 != 0. This results in the compiled code using separate SSE MUL and ADD instructions for those elements instead of an FMA instruction. Because an FMA rounds once while a separate multiply and add round twice, this causes inconsistency in the numerical results. To fix this, the AVX2 and C code are replaced with masked AVX512 instructions to compute the fringe cases. AMD-Internal: [CPUPL-7315]
2026-04-20 07:38:53 +00:00 · 2025-09-18 11:42:31 +05:30
parent 740fbdf50d
commit 10b2e59782
1 changed files with 12 additions and 8 deletions
--- a/kernels/zen4/1/bli_axpyv_zen4_int.c
+++ b/kernels/zen4/1/bli_axpyv_zen4_int.c
@@ -414,19 +414,23 @@ BLIS_EXPORT_BLIS void bli_daxpyv_zen4_int
            y0 += n_elem_per_reg;
        }

-        // This loop uses AVX2 instructions
-        for (; (i + 3) < n; i += 4)
+        // compute the remainder with masked operations
+        if ( i < n )
        {
-            __m256d x_vec = _mm256_loadu_pd(x0);
+            dim_t n_remainder = ( n - i );
+            __mmask8 mask_ = ( 1 <<  n_remainder ) - 1;

-            __m256d y_vec = _mm256_loadu_pd(y0);
+            xv[0] = _mm512_maskz_loadu_pd( mask_, x0 );

-            y_vec = _mm256_fmadd_pd(x_vec, _mm256_set1_pd(*alpha), y_vec);
+            yv[0] = _mm512_maskz_loadu_pd( mask_, y0 );

-            _mm256_storeu_pd(y0, y_vec);
+            yv[0] = _mm512_fmadd_pd(xv[0], alphav, yv[0]);

-            x0 += 4;
-            y0 += 4;
+            _mm512_mask_storeu_pd( y0, mask_, yv[0] );
+
+            x0 += n_remainder;
+            y0 += n_remainder;
+            i  += n_remainder;
        }
    }