Fix GCC 12+ instruction scheduling issue in complex scalv kernel (#149)

Replace fused multiply-add (FMA) intrinsics with explicit multiply and add/subtract operations in bli_cscalv_zen_int to resolve incorrect results with GCC 12 and later compilers.

The original code used a register-reuse pattern with _mm256_fmaddsub_ps() that causes the instruction scheduler in GCC 12 and later to generate assembly with corrupted intermediate values due to register allocation conflicts. GCC 11 and earlier handled the same pattern correctly.

Changes:
- Replace _mm256_fmaddsub_ps() with _mm256_mul_ps() + _mm256_addsub_ps()
- Eliminate temp register reuse to fix instruction scheduling conflicts

AMD-Internal: [CPUPL-6445]
Author: S, Hari Govind
Date: 2025-08-22 14:23:43 +05:30
Committed by: GitHub
Parent: 36c37585de
Commit: d29f3f0b5e
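For context, the kernel scales a vector of interleaved single-precision complex values by alpha, computing (xr*ar - xi*ai) for the real lane and (xr*ai + xi*ar) for the imaginary lane of each element. The standalone sketch below (illustrative only; made-up test values, built with e.g. gcc -mavx -mfma) shows that the old _mm256_fmaddsub_ps() sequence and the new _mm256_mul_ps() + _mm256_addsub_ps() sequence compute the same result, up to the one extra rounding step in the unfused version, so the fix trades the fused operation for a sequence the affected compilers schedule correctly:

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    // x = {1+2i, 3+4i, 5+6i, 7+8i}, stored interleaved as (re, im) pairs
    __m256 x          = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
    __m256 alpha_real = _mm256_set1_ps(0.5f);   // broadcast ar
    __m256 alpha_imag = _mm256_set1_ps(0.25f);  // broadcast ai

    // Common prefix: t = x * ai, then 0xB1 swaps each (re, im) pair,
    // giving (xi*ai, xr*ai) per element
    __m256 t    = _mm256_mul_ps(x, alpha_imag);
    __m256 perm = _mm256_permute_ps(t, 0xB1);

    // Old sequence: fused multiply with alternating subtract (even lanes)
    // and add (odd lanes): (xr*ar - xi*ai, xi*ar + xr*ai)
    __m256 old_res = _mm256_fmaddsub_ps(x, alpha_real, perm);

    // New sequence: plain multiply, then the same alternating subtract/add
    __m256 new_res = _mm256_addsub_ps(_mm256_mul_ps(x, alpha_real), perm);

    float o[8], n[8];
    _mm256_storeu_ps(o, old_res);
    _mm256_storeu_ps(n, new_res);
    for (int i = 0; i < 8; i++)
        printf("lane %d: fmaddsub = %f, mul+addsub = %f\n", i, o[i], n[i]);
    return 0;
}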


@@ -924,25 +924,35 @@ void bli_cscalv_zen_int
x_vec_ymm[2] = _mm256_loadu_ps(x0 + 2 * n_elem_per_reg);
x_vec_ymm[3] = _mm256_loadu_ps(x0 + 3 * n_elem_per_reg);
// Compute x * alpha_imag for all vectors
temp_ymm[0] = _mm256_mul_ps(x_vec_ymm[0], alpha_imag_ymm);
temp_ymm[1] = _mm256_mul_ps(x_vec_ymm[1], alpha_imag_ymm);
temp_ymm[2] = _mm256_mul_ps(x_vec_ymm[2], alpha_imag_ymm);
temp_ymm[3] = _mm256_mul_ps(x_vec_ymm[3], alpha_imag_ymm);
// Permute the imaginary results
temp_ymm[4] = _mm256_permute_ps(temp_ymm[0], 0xB1);
temp_ymm[5] = _mm256_permute_ps(temp_ymm[1], 0xB1);
temp_ymm[6] = _mm256_permute_ps(temp_ymm[2], 0xB1);
temp_ymm[7] = _mm256_permute_ps(temp_ymm[3], 0xB1);
-temp_ymm[0] = _mm256_fmaddsub_ps(x_vec_ymm[0], alpha_real_ymm, temp_ymm[4]);
-temp_ymm[1] = _mm256_fmaddsub_ps(x_vec_ymm[1], alpha_real_ymm, temp_ymm[5]);
-temp_ymm[2] = _mm256_fmaddsub_ps(x_vec_ymm[2], alpha_real_ymm, temp_ymm[6]);
-temp_ymm[3] = _mm256_fmaddsub_ps(x_vec_ymm[3], alpha_real_ymm, temp_ymm[7]);
+// Compute x * alpha_real first, then add/sub the permuted imaginary part
+// This ensures the correct operand order for the FMA operation
+temp_ymm[0] = _mm256_mul_ps(x_vec_ymm[0], alpha_real_ymm);
+temp_ymm[1] = _mm256_mul_ps(x_vec_ymm[1], alpha_real_ymm);
+temp_ymm[2] = _mm256_mul_ps(x_vec_ymm[2], alpha_real_ymm);
+temp_ymm[3] = _mm256_mul_ps(x_vec_ymm[3], alpha_real_ymm);
-_mm256_storeu_ps(x0, temp_ymm[0]);
-_mm256_storeu_ps(x0 + n_elem_per_reg, temp_ymm[1]);
-_mm256_storeu_ps(x0 + 2 * n_elem_per_reg, temp_ymm[2]);
-_mm256_storeu_ps(x0 + 3 * n_elem_per_reg, temp_ymm[3]);
+// Now add/subtract the permuted imaginary parts
+x_vec_ymm[0] = _mm256_addsub_ps(temp_ymm[0], temp_ymm[4]);
+x_vec_ymm[1] = _mm256_addsub_ps(temp_ymm[1], temp_ymm[5]);
+x_vec_ymm[2] = _mm256_addsub_ps(temp_ymm[2], temp_ymm[6]);
+x_vec_ymm[3] = _mm256_addsub_ps(temp_ymm[3], temp_ymm[7]);
+_mm256_storeu_ps(x0, x_vec_ymm[0]);
+_mm256_storeu_ps(x0 + n_elem_per_reg, x_vec_ymm[1]);
+_mm256_storeu_ps(x0 + 2 * n_elem_per_reg, x_vec_ymm[2]);
+_mm256_storeu_ps(x0 + 3 * n_elem_per_reg, x_vec_ymm[3]);
x0 += 4 * n_elem_per_reg;
}
@@ -952,17 +962,24 @@ void bli_cscalv_zen_int
x_vec_ymm[0] = _mm256_loadu_ps(x0);
x_vec_ymm[1] = _mm256_loadu_ps(x0 + n_elem_per_reg);
// Compute x * alpha_imag for both vectors
temp_ymm[0] = _mm256_mul_ps(x_vec_ymm[0], alpha_imag_ymm);
temp_ymm[1] = _mm256_mul_ps(x_vec_ymm[1], alpha_imag_ymm);
// Permute the imaginary results
temp_ymm[2] = _mm256_permute_ps(temp_ymm[0], 0xB1);
temp_ymm[3] = _mm256_permute_ps(temp_ymm[1], 0xB1);
-temp_ymm[0] = _mm256_fmaddsub_ps(x_vec_ymm[0], alpha_real_ymm, temp_ymm[2]);
-temp_ymm[1] = _mm256_fmaddsub_ps(x_vec_ymm[1], alpha_real_ymm, temp_ymm[3]);
+// Compute x * alpha_real first, then add/sub the permuted imaginary part
+temp_ymm[0] = _mm256_mul_ps(x_vec_ymm[0], alpha_real_ymm);
+temp_ymm[1] = _mm256_mul_ps(x_vec_ymm[1], alpha_real_ymm);
-_mm256_storeu_ps(x0, temp_ymm[0]);
-_mm256_storeu_ps(x0 + n_elem_per_reg, temp_ymm[1]);
+// Now add/subtract the permuted imaginary parts
+x_vec_ymm[0] = _mm256_addsub_ps(temp_ymm[0], temp_ymm[2]);
+x_vec_ymm[1] = _mm256_addsub_ps(temp_ymm[1], temp_ymm[3]);
+_mm256_storeu_ps(x0, x_vec_ymm[0]);
+_mm256_storeu_ps(x0 + n_elem_per_reg, x_vec_ymm[1]);
x0 += 2 * n_elem_per_reg;
}
@@ -971,13 +988,19 @@ void bli_cscalv_zen_int
{
x_vec_ymm[0] = _mm256_loadu_ps(x0);
// Compute x * alpha_imag
temp_ymm[0] = _mm256_mul_ps(x_vec_ymm[0], alpha_imag_ymm);
// Permute the imaginary result
temp_ymm[1] = _mm256_permute_ps(temp_ymm[0], 0xB1);
-temp_ymm[0] = _mm256_fmaddsub_ps(x_vec_ymm[0], alpha_real_ymm, temp_ymm[1]);
+// Compute x * alpha_real first, then add/sub the permuted imaginary part
+temp_ymm[0] = _mm256_mul_ps(x_vec_ymm[0], alpha_real_ymm);
-_mm256_storeu_ps(x0, temp_ymm[0]);
+// Now add/subtract the permuted imaginary part
+x_vec_ymm[0] = _mm256_addsub_ps(temp_ymm[0], temp_ymm[1]);
+_mm256_storeu_ps(x0, x_vec_ymm[0]);
x0 += n_elem_per_reg;
}
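For reference, the per-element arithmetic that all three loop bodies implement can be written as a scalar sketch (hypothetical code, not part of the commit; assumes unit stride and interleaved (re, im) storage as in the vectorized kernel):

// Scalar reference for x[i] *= alpha over n interleaved complex floats
static void cscalv_scalar_ref(float *x, int n, float alpha_r, float alpha_i)
{
    for (int i = 0; i < n; i++)
    {
        float xr = x[2 * i];
        float xi = x[2 * i + 1];
        x[2 * i]     = xr * alpha_r - xi * alpha_i;  // real part
        x[2 * i + 1] = xr * alpha_i + xi * alpha_r;  // imaginary part
    }
}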