From 9f52a587dee855daa73c194e41b6951416544e9a Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 3 Aug 2016 16:03:53 -0500 Subject: [PATCH] Try prefetchw[t1] instead of regular prefetch for C. --- kernels/x86_64/knl/3/bli_avx512_macros.h | 2 + kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c | 98 +++++++++++------------ 2 files changed, 51 insertions(+), 49 deletions(-) diff --git a/kernels/x86_64/knl/3/bli_avx512_macros.h b/kernels/x86_64/knl/3/bli_avx512_macros.h index 7578b2502..aef869209 100644 --- a/kernels/x86_64/knl/3/bli_avx512_macros.h +++ b/kernels/x86_64/knl/3/bli_avx512_macros.h @@ -139,6 +139,8 @@ #define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0) #define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0) #define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS) +#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS) +#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS) #define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS) #define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS) #define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS) diff --git a/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c b/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c index 32596a227..475820366 100644 --- a/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c +++ b/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c @@ -265,53 +265,53 @@ void bli_dgemm_opt_24x8( VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1)) VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2)) #else - PREFETCH(1, MEM(RCX )) + PREFETCHW1(MEM(RCX )) SUBITER( 0,1,0,RAX ) - PREFETCH(1, MEM(RCX,R12,1)) + PREFETCHW1(MEM(RCX,R12,1)) SUBITER( 1,0,1,RAX ) - PREFETCH(1, MEM(RCX,R12,2)) + PREFETCHW1(MEM(RCX,R12,2)) SUBITER( 2,1,0,RAX ) - PREFETCH(1, MEM(RCX,R13,1)) + PREFETCHW1(MEM(RCX,R13,1)) SUBITER( 3,0,1,RAX ) - PREFETCH(1, MEM(RCX,R12,4)) + PREFETCHW1(MEM(RCX,R12,4)) SUBITER( 4,1,0,RAX,R8, 1) - PREFETCH(1, MEM(RCX,R14,1)) + PREFETCHW1(MEM(RCX,R14,1)) SUBITER( 5,0,1,RAX,R8, 1) - PREFETCH(1, MEM(RCX,R13,2)) + PREFETCHW1(MEM(RCX,R13,2)) SUBITER( 6,1,0,RAX,R8, 1) - PREFETCH(1, MEM(RCX,R15,1)) + PREFETCHW1(MEM(RCX,R15,1)) SUBITER( 7,0,1,RAX,R8, 1) - PREFETCH(1, MEM(RDX )) + PREFETCHW1(MEM(RDX )) SUBITER( 8,1,0,RAX,R8, 2) - PREFETCH(1, MEM(RDX,R12,1)) + PREFETCHW1(MEM(RDX,R12,1)) SUBITER( 9,0,1,RAX,R8, 2) - PREFETCH(1, MEM(RDX,R12,2)) + PREFETCHW1(MEM(RDX,R12,2)) SUBITER(10,1,0,RAX,R8, 2) - PREFETCH(1, MEM(RDX,R13,1)) + PREFETCHW1(MEM(RDX,R13,1)) SUBITER(11,0,1,RAX,R8, 2) - PREFETCH(1, MEM(RDX,R12,4)) + PREFETCHW1(MEM(RDX,R12,4)) SUBITER(12,1,0,RAX,R9, 1) - PREFETCH(1, MEM(RDX,R14,1)) + PREFETCHW1(MEM(RDX,R14,1)) SUBITER(13,0,1,RAX,R9, 1) - PREFETCH(1, MEM(RDX,R13,2)) + PREFETCHW1(MEM(RDX,R13,2)) SUBITER(14,1,0,RAX,R9, 1) - PREFETCH(1, MEM(RDX,R15,1)) + PREFETCHW1(MEM(RDX,R15,1)) SUBITER(15,0,1,RAX,R9, 1) - PREFETCH(1, MEM(RDI )) + PREFETCHW1(MEM(RDI )) SUBITER(16,1,0,RAX,R8, 4) - PREFETCH(1, MEM(RDI,R12,1)) + PREFETCHW1(MEM(RDI,R12,1)) SUBITER(17,0,1,RAX,R8, 4) - PREFETCH(1, MEM(RDI,R12,2)) + PREFETCHW1(MEM(RDI,R12,2)) SUBITER(18,1,0,RAX,R8, 4) - PREFETCH(1, MEM(RDI,R13,1)) + PREFETCHW1(MEM(RDI,R13,1)) SUBITER(19,0,1,RAX,R8, 4) - PREFETCH(1, MEM(RDI,R12,4)) + PREFETCHW1(MEM(RDI,R12,4)) SUBITER(20,1,0,RAX,R10,1) - PREFETCH(1, MEM(RDI,R14,1)) + PREFETCHW1(MEM(RDI,R14,1)) SUBITER(21,0,1,RAX,R10,1) - PREFETCH(1, MEM(RDI,R13,2)) + PREFETCHW1(MEM(RDI,R13,2)) SUBITER(22,1,0,RAX,R10,1) - PREFETCH(1, MEM(RDI,R15,1)) + PREFETCHW1(MEM(RDI,R15,1)) SUBITER(23,0,1,RAX,R10,1) ADD(RAX, IMM(24*24*8)) @@ -454,58 +454,58 @@ void bli_dgemm_opt_24x8( #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 -#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RCX )) -#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RCX,R12,1)) -#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RCX,R12,2)) +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX )) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,1)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R12,2)) SUBITER(0,1,0,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 -#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RCX,R13,1)) -#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RCX,R12,4)) -#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RCX,R14,1)) +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,1)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,4)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R14,1)) SUBITER(1,0,1,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 -#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RCX,R13,2)) -#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RCX,R15,1)) -#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDX )) +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,2)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R15,1)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX )) SUBITER(2,1,0,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 -#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDX,R12,1)) -#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDX,R12,2)) -#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDX,R13,1)) +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,1)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,2)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,1)) SUBITER(3,0,1,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 -#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDX,R12,4)) -#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDX,R14,1)) -#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDX,R13,2)) +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,2)) SUBITER(4,1,0,RAX,R8,1) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 -#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDX,R15,1)) -#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDI )) -#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDI,R12,1)) +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R15,1)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI )) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,1)) SUBITER(5,0,1,RAX,R8,1) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 -#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDI,R12,2)) -#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDI,R13,1)) -#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDI,R12,4)) +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R12,2)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,1)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,4)) SUBITER(6,1,0,RAX,R8,1) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 -#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDI,R14,1)) -#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDI,R13,2)) -#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDI,R15,1)) +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R14,1)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,2)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R15,1)) SUBITER(7,0,1,RAX,R8,1) #endif @@ -518,7 +518,7 @@ void bli_dgemm_opt_24x8( LABEL(TAIL_LOOP) - PREFETCH(0, MEM(RDX)) + PREFETCHW0(MEM(RDX)) ADD(RDX, R12) SUBITER(0,1,0,RAX)