diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c index 96bc92749..c309c8c0c 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c +++ b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c @@ -3,7 +3,7 @@ An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020-2022, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -516,7 +516,8 @@ void bli_sgemmsup_rd_zen_asm_1x16 je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) - vfmadd231ps(mem(rcx), ymm3, ymm4) + vmovups(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) vmovups(xmm4, mem(rcx)) jmp(.SDONE) // jump to end. diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c index 752a0a01c..507ff5a71 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c +++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020-2022, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -8048,15 +8048,18 @@ void bli_sgemmsup_rv_zen_asm_3x2 label(.SROWSTORED) - vfmadd231ps(mem(rcx), xmm3, xmm4) + vmovsd(mem(rcx), xmm0) // load 2 floats (64 bits), same width as the store below + vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm6) + vmovsd(mem(rcx), xmm0) // load 2 floats (64 bits), same width as the store below + vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm8) + vmovsd(mem(rcx), xmm0) // load 2 floats (64 bits), same width as the store below + vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) jmp(.SDONE) // jump to end. @@ -8329,11 +8332,13 @@ void bli_sgemmsup_rv_zen_asm_2x2 label(.SROWSTORED) - vfmadd231ps(mem(rcx), xmm3, xmm4) + vmovsd(mem(rcx), xmm0) // load 2 floats (64 bits), same width as the store below + vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm6) + vmovsd(mem(rcx), xmm0) // load 2 floats (64 bits), same width as the store below + vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) jmp(.SDONE) // jump to end. @@ -8577,7 +8582,8 @@ void bli_sgemmsup_rv_zen_asm_1x2 label(.SROWSTORED) - vfmadd231ps(mem(rcx), xmm3, xmm4) + vmovsd(mem(rcx), xmm0) // load 2 floats (64 bits), same width as the store below + vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) jmp(.SDONE) // jump to end. diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c index 41dbbd699..e6ecd47f4 100644 --- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c +++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020-2022, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2231,22 +2231,28 @@ void bli_sgemmsup_rv_zen_asm_6x2m label(.SROWSTORED) - vfmadd231ps(mem(rcx), xmm3, xmm4) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) vmovlpd(xmm4, mem(rcx)) add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm6) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm6) vmovlpd(xmm6, mem(rcx)) add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm8) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm8) vmovlpd(xmm8, mem(rcx)) add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm10) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm10) vmovlpd(xmm10, mem(rcx)) add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm12) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm12) vmovlpd(xmm12, mem(rcx)) add(rdi, rcx) - vfmadd231ps(mem(rcx), xmm3, xmm14) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm14) vmovlpd(xmm14, mem(rcx)) jmp(.SDONE) // jump to end.