From 681eec913d7c2ebcff637cec5c1627ced9a92b99 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 26 May 2017 12:28:09 -0500 Subject: [PATCH 1/3] Change PACKDIM_MR (double) for haswell to 8. --- config/haswell/bli_kernel.h | 1 + kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 40 ++++++++++---------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index ce18dc266..9ed530d68 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -102,6 +102,7 @@ #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 +#define BLIS_PACKDIM_MR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index bee1df996..3679b5773 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -734,22 +734,22 @@ void bli_dgemm_asm_6x8 "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 - "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" @@ -761,22 +761,22 @@ void bli_dgemm_asm_6x8 " \n\t" // iteration 2 "prefetcht0 76 * 8(%%rax) \n\t" " \n\t" - "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" @@ -786,28 +786,28 @@ void bli_dgemm_asm_6x8 "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 3 - "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 24 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 25 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 26 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 27 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 28 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 29 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" - "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr) + "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr_packdim) "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" @@ -855,7 +855,7 @@ void bli_dgemm_asm_6x8 "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" - "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr) + "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr_packdim) "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" From d87614af3f3d9187be94d6e77984b282bf890928 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 26 May 2017 14:47:36 -0400 Subject: [PATCH 2/3] Revert "Change PACKDIM_MR (double) for haswell to 8." This reverts commit 681eec913d7c2ebcff637cec5c1627ced9a92b99. --- config/haswell/bli_kernel.h | 1 - kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 40 ++++++++++---------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index 9ed530d68..ce18dc266 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -102,7 +102,6 @@ #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 -#define BLIS_PACKDIM_MR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index 3679b5773..bee1df996 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -734,22 +734,22 @@ void bli_dgemm_asm_6x8 "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 - "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" @@ -761,22 +761,22 @@ void bli_dgemm_asm_6x8 " \n\t" // iteration 2 "prefetcht0 76 * 8(%%rax) \n\t" " \n\t" - "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" @@ -786,28 +786,28 @@ void bli_dgemm_asm_6x8 "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 3 - "vbroadcastsd 24 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 25 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 26 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 27 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 28 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 29 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" - "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr_packdim) + "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr) "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" @@ -855,7 +855,7 @@ void bli_dgemm_asm_6x8 "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" - "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr_packdim) + "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr) "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" From 7f41bb0a0becde6a7de7df0f99668d7b4686c3b0 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 26 May 2017 14:49:31 -0400 Subject: [PATCH 3/3] PACKDIM_MR=8 didn't work out, but messing with the prefetching helps 2%. --- kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index bee1df996..5bd2d92e5 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -734,6 +734,8 @@ void bli_dgemm_asm_6x8 "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 + "prefetcht0 72 * 8(%%rax) \n\t" + " \n\t" "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" @@ -759,7 +761,7 @@ void bli_dgemm_asm_6x8 "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 2 - "prefetcht0 76 * 8(%%rax) \n\t" + "prefetcht0 80 * 8(%%rax) \n\t" " \n\t" "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t"