From dafca7a0c2c72aaf15cb588b2bef6f246abb1905 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 25 Jun 2018 16:20:10 -0500 Subject: [PATCH] Fix botched memory addressing in Penryn kernel (no effect for GAS output). --- kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c | 10 +++++----- kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c index 84a998019..7a09454ab 100644 --- a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c @@ -109,7 +109,7 @@ void bli_sgemm_penryn_asm_8x4 label(.SLOOPKITER) // MAIN LOOP - prefetch(0, mem(4*35+1)*8(rax)) + prefetch(0, mem(rax, (4*35+1)*8)) addps(xmm6, xmm10) // iteration 0 addps(xmm3, xmm14) @@ -917,8 +917,8 @@ void bli_dgemm_penryn_asm_4x4 label(.DLOOPKITER) // MAIN LOOP - prefetch(0, mem(4*35+1)*8(rax)) - //prefetch(0, mem(8*97+4)*8(rax)) + prefetch(0, mem(rax, (4*35+1)*8)) + //prefetch(0, mem(rax, (8*97+4)*8)) //prefetch(0, mem(r11, 67*4*8)) // prefetch a_next[0] @@ -985,8 +985,8 @@ void bli_dgemm_penryn_asm_4x4 movaps(mem(rax, -3*16), xmm1) - prefetch(0, mem(4*37+1)*8(rax)) - //prefetch(0, mem(8*97+12)*8(rax)) + prefetch(0, mem(rax, (4*37+1)*8)) + //prefetch(0, mem(rax, (8*97+12)*8)) //prefetch(0, mem(r11, 69*4*8)) // prefetch a_next[8] //sub(imm(-4*4*8), r11) // a_next += 4*4 (unroll x mr) diff --git a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c index ac8659396..f07edbe0c 100644 --- a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c @@ -125,7 +125,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 label(.LOOPKITER) // MAIN LOOP //prefetch(0, mem(rax, 1264)) - prefetch(0, mem(4*35+1)*8(rax)) + prefetch(0, mem(rax, (4*35+1)*8)) addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) @@ -189,7 +189,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rax, -3*16), xmm1) //prefetch(0, mem(rax, 1328)) - prefetch(0, mem(4*37+1)*8(rax)) + prefetch(0, mem(rax, (4*37+1)*8)) addpd(xmm3, xmm11) // iteration 2 movaps(mem(rbx, -3*16), xmm3)