Merge remote-tracking branch 'origin/knl' into knl

# Conflicts:
#	kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c
This commit is contained in:
Devin Matthews
2016-08-03 16:09:51 -05:00
2 changed files with 51 additions and 49 deletions

View File

@@ -140,6 +140,8 @@
// Instruction wrappers built on ASM(), which is defined outside this excerpt.
// These two forward their operands to ASM() in the reverse of the order in
// which they are written at the call site (_2, _1, _0 rather than _0, _1, _2).
#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0)
#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0)
// LEVEL is token-pasted onto the mnemonic, so PREFETCH(0, addr) emits
// prefetcht0 and PREFETCH(1, addr) emits prefetcht1.
#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS)
// Fixed-mnemonic variants with no LEVEL argument: the 0/1 suffix in the macro
// name selects prefetchw or prefetchwt1 respectively. Per the Intel ISA
// reference, both are prefetches issued in anticipation of a write.
#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS)
#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS)
// Gather/scatter prefetch wrappers: LEVEL is pasted into the middle of the
// mnemonic, e.g. VGATHERPFDPS(1, addr) emits vgatherpf1dps.
#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS)
#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS)
#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS)

View File

@@ -265,59 +265,59 @@ void bli_dgemm_opt_24x8(
VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1))
VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2))
#else
PREFETCH(1, MEM(RCX ))
PREFETCHW1(MEM(RCX ))
SUBITER( 0,1,0,RAX )
PREFETCH(1, MEM(RCX,R12,1))
PREFETCHW1(MEM(RCX,R12,1))
SUBITER( 1,0,1,RAX )
PREFETCH(1, MEM(RCX,R12,2))
PREFETCHW1(MEM(RCX,R12,2))
SUBITER( 2,1,0,RAX )
PREFETCH(1, MEM(RCX,R13,1))
PREFETCHW1(MEM(RCX,R13,1))
SUBITER( 3,0,1,RAX )
PREFETCH(1, MEM(RCX,R12,4))
PREFETCHW1(MEM(RCX,R12,4))
SUBITER( 4,1,0,RAX,R8, 1)
PREFETCH(1, MEM(RCX,R14,1))
PREFETCHW1(MEM(RCX,R14,1))
SUBITER( 5,0,1,RAX,R8, 1)
PREFETCH(1, MEM(RCX,R13,2))
PREFETCHW1(MEM(RCX,R13,2))
SUBITER( 6,1,0,RAX,R8, 1)
PREFETCH(1, MEM(RCX,R15,1))
PREFETCHW1(MEM(RCX,R15,1))
SUBITER( 7,0,1,RAX,R8, 1)
LEA(RDX, MEM(RCX,R12,8))
PREFETCH(1, MEM(RDX ))
PREFETCHW1(MEM(RDX ))
SUBITER( 8,1,0,RAX,R8, 2)
PREFETCH(1, MEM(RDX,R12,1))
PREFETCHW1(MEM(RDX,R12,1))
SUBITER( 9,0,1,RAX,R8, 2)
PREFETCH(1, MEM(RDX,R12,2))
PREFETCHW1(MEM(RDX,R12,2))
SUBITER(10,1,0,RAX,R8, 2)
PREFETCH(1, MEM(RDX,R13,1))
PREFETCHW1(MEM(RDX,R13,1))
SUBITER(11,0,1,RAX,R8, 2)
PREFETCH(1, MEM(RDX,R12,4))
PREFETCHW1(MEM(RDX,R12,4))
SUBITER(12,1,0,RAX,R9, 1)
PREFETCH(1, MEM(RDX,R14,1))
PREFETCHW1(MEM(RDX,R14,1))
SUBITER(13,0,1,RAX,R9, 1)
PREFETCH(1, MEM(RDX,R13,2))
PREFETCHW1(MEM(RDX,R13,2))
SUBITER(14,1,0,RAX,R9, 1)
PREFETCH(1, MEM(RDX,R15,1))
PREFETCHW1(MEM(RDX,R15,1))
SUBITER(15,0,1,RAX,R9, 1)
LEA(RDX, MEM(RDX,R12,8))
PREFETCH(1, MEM(RDI ))
PREFETCHW1(MEM(RDI ))
SUBITER(16,1,0,RAX,R8, 4)
PREFETCH(1, MEM(RDI,R12,1))
PREFETCHW1(MEM(RDI,R12,1))
SUBITER(17,0,1,RAX,R8, 4)
PREFETCH(1, MEM(RDI,R12,2))
PREFETCHW1(MEM(RDI,R12,2))
SUBITER(18,1,0,RAX,R8, 4)
PREFETCH(1, MEM(RDI,R13,1))
PREFETCHW1(MEM(RDI,R13,1))
SUBITER(19,0,1,RAX,R8, 4)
PREFETCH(1, MEM(RDI,R12,4))
PREFETCHW1(MEM(RDI,R12,4))
SUBITER(20,1,0,RAX,R10,1)
PREFETCH(1, MEM(RDI,R14,1))
PREFETCHW1(MEM(RDI,R14,1))
SUBITER(21,0,1,RAX,R10,1)
PREFETCH(1, MEM(RDI,R13,2))
PREFETCHW1(MEM(RDI,R13,2))
SUBITER(22,1,0,RAX,R10,1)
PREFETCH(1, MEM(RDI,R15,1))
PREFETCHW1(MEM(RDI,R15,1))
SUBITER(23,0,1,RAX,R10,1)
ADD(RAX, IMM(24*24*8))
@@ -464,58 +464,58 @@ void bli_dgemm_opt_24x8(
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RCX ))
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RCX,R12,1))
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RCX,R12,2))
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX ))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,1))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R12,2))
SUBITER(0,1,0,RAX )
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RCX,R13,1))
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RCX,R12,4))
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RCX,R14,1))
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,1))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,4))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R14,1))
SUBITER(1,0,1,RAX )
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RCX,R13,2))
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RCX,R15,1))
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDX ))
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,2))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R15,1))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX ))
SUBITER(2,1,0,RAX )
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDX,R12,1))
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDX,R12,2))
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDX,R13,1))
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,1))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,2))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,1))
SUBITER(3,0,1,RAX )
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDX,R12,4))
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDX,R14,1))
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDX,R13,2))
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,2))
SUBITER(4,1,0,RAX,R8,1)
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDX,R15,1))
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDI ))
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDI,R12,1))
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R15,1))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI ))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,1))
SUBITER(5,0,1,RAX,R8,1)
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDI,R12,2))
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDI,R13,1))
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDI,R12,4))
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R12,2))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,1))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,4))
SUBITER(6,1,0,RAX,R8,1)
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDI,R14,1))
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDI,R13,2))
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDI,R15,1))
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R14,1))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,2))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R15,1))
SUBITER(7,0,1,RAX,R8,1)
#endif
@@ -528,7 +528,7 @@ void bli_dgemm_opt_24x8(
LABEL(TAIL_LOOP)
PREFETCH(0, MEM(RDX))
PREFETCHW0(MEM(RDX))
ADD(RDX, R12)
SUBITER(0,1,0,RAX)