From b43dae9a5d2f078c9bbe07079031d6c00a68b7de Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 1 Dec 2020 16:44:38 -0600 Subject: [PATCH] Fixed copy-paste bugs in edge-case sup kernels. Details: - Fixed bugs in two sup kernels, bli_dgemmsup_rv_haswell_asm_1x6() and bli_dgemmsup_rd_haswell_asm_1x4(), which involved extraneous assembly instructions that were left over from when the kernels were first written. These instructions would cause segmentation faults in some situations where extra memory was not allocated beyond the end of the matrix buffers. Thanks to Kiran Varaganti for reporting these bugs and to Bhaskar Nallani for identifying the cause and solution. --- kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c | 1 - kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c | 5 ----- 2 files changed, 6 deletions(-) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c index a3b56cb12..4c6094b1c 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c @@ -1297,7 +1297,6 @@ void bli_dgemmsup_rd_haswell_asm_1x4 // which would destory intermediate results. vmovsd(mem(rax ), xmm0) - vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c index 8022bf065..9da1e7b83 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c @@ -2828,7 +2828,6 @@ void bli_dgemmsup_rv_haswell_asm_1x6 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) @@ -2845,7 +2844,6 @@ void bli_dgemmsup_rv_haswell_asm_1x6 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) @@ -2862,7 +2860,6 @@ void bli_dgemmsup_rv_haswell_asm_1x6 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) @@ -2879,7 +2876,6 @@ void bli_dgemmsup_rv_haswell_asm_1x6 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) @@ -2914,7 +2910,6 @@ void bli_dgemmsup_rv_haswell_asm_1x6 add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) - vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5)