Fixed copy-paste bugs in edge-case sup kernels.

Details:
- Fixed bugs in two sup kernels, bli_dgemmsup_rv_haswell_asm_1x6() and
  bli_dgemmsup_rd_haswell_asm_1x4(), which involved extraneous assembly
  instructions that were left over from when the kernels were first
  written. These instructions would cause segmentation faults in some
  situations where extra memory was not allocated beyond the end of
  the matrix buffers. Thanks to Kiran Varaganti for reporting these
  bugs and to Bhaskar Nallani for identifying the cause and solution.
This commit is contained in:
Field G. Van Zee
2020-12-01 16:44:38 -06:00
parent 6d3bafacd7
commit b43dae9a5d
2 changed files with 0 additions and 6 deletions

View File

@@ -1297,7 +1297,6 @@ void bli_dgemmsup_rd_haswell_asm_1x4
// which would destroy intermediate results.
vmovsd(mem(rax ), xmm0)
vmovsd(mem(rax, r8, 1), xmm1)
add(imm(1*8), rax) // a += 1*cs_a = 1*8;
vmovsd(mem(rbx ), xmm3)

View File

@@ -2828,7 +2828,6 @@ void bli_dgemmsup_rv_haswell_asm_1x6
add(r10, rbx) // b += rs_b;
vbroadcastsd(mem(rax ), ymm2)
vbroadcastsd(mem(rax, r8, 1), ymm3)
add(r9, rax) // a += cs_a;
vfmadd231pd(ymm0, ymm2, ymm4)
vfmadd231pd(ymm1, ymm2, ymm5)
@@ -2845,7 +2844,6 @@ void bli_dgemmsup_rv_haswell_asm_1x6
add(r10, rbx) // b += rs_b;
vbroadcastsd(mem(rax ), ymm2)
vbroadcastsd(mem(rax, r8, 1), ymm3)
add(r9, rax) // a += cs_a;
vfmadd231pd(ymm0, ymm2, ymm4)
vfmadd231pd(ymm1, ymm2, ymm5)
@@ -2862,7 +2860,6 @@ void bli_dgemmsup_rv_haswell_asm_1x6
add(r10, rbx) // b += rs_b;
vbroadcastsd(mem(rax ), ymm2)
vbroadcastsd(mem(rax, r8, 1), ymm3)
add(r9, rax) // a += cs_a;
vfmadd231pd(ymm0, ymm2, ymm4)
vfmadd231pd(ymm1, ymm2, ymm5)
@@ -2879,7 +2876,6 @@ void bli_dgemmsup_rv_haswell_asm_1x6
add(r10, rbx) // b += rs_b;
vbroadcastsd(mem(rax ), ymm2)
vbroadcastsd(mem(rax, r8, 1), ymm3)
add(r9, rax) // a += cs_a;
vfmadd231pd(ymm0, ymm2, ymm4)
vfmadd231pd(ymm1, ymm2, ymm5)
@@ -2914,7 +2910,6 @@ void bli_dgemmsup_rv_haswell_asm_1x6
add(r10, rbx) // b += rs_b;
vbroadcastsd(mem(rax ), ymm2)
vbroadcastsd(mem(rax, r8, 1), ymm3)
add(r9, rax) // a += cs_a;
vfmadd231pd(ymm0, ymm2, ymm4)
vfmadd231pd(ymm1, ymm2, ymm5)