mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Merge remote-tracking branch 'origin/knl' into knl
# Conflicts: # kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c
This commit is contained in:
@@ -140,6 +140,8 @@
|
||||
// VUNPCKHPD(_0, _1, _2): forwards the `vunpckhpd` mnemonic to ASM() with the
// three macro arguments emitted in reverse order (_2, _1, _0).
#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0)
|
||||
// VSHUFF64X2(_0, _1, _2, _3): forwards the `vshuff64x2` mnemonic to ASM() with
// the four macro arguments emitted in reverse order (_3, _2, _1, _0).
#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0)
|
||||
// PREFETCH(LEVEL, ADDRESS): token-pastes LEVEL onto `prefetcht` to build the
// mnemonic (LEVEL 0 -> prefetcht0, LEVEL 1 -> prefetcht1) and passes it,
// followed by ADDRESS, to ASM().
#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS)
|
||||
// PREFETCHW0(ADDRESS): passes the `prefetchw` mnemonic followed by ADDRESS to
// ASM(). Unlike PREFETCH, the mnemonic is fixed; no level is pasted in.
#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS)
|
||||
// PREFETCHW1(ADDRESS): passes the `prefetchwt1` mnemonic followed by ADDRESS
// to ASM(). Unlike PREFETCH, the mnemonic is fixed; no level is pasted in.
#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS)
|
||||
// VGATHERPFDPS(LEVEL, ADDRESS): token-pastes LEVEL between `vgatherpf` and
// `dps` to build the mnemonic (LEVEL 1 -> vgatherpf1dps) and passes it,
// followed by ADDRESS, to ASM().
#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS)
|
||||
// VSCATTERPFDPS(LEVEL, ADDRESS): token-pastes LEVEL between `vscatterpf` and
// `dps` to build the mnemonic (LEVEL 1 -> vscatterpf1dps) and passes it,
// followed by ADDRESS, to ASM().
#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS)
|
||||
// VGATHERPFDPD(LEVEL, ADDRESS): token-pastes LEVEL between `vgatherpf` and
// `dpd` to build the mnemonic (LEVEL 1 -> vgatherpf1dpd) and passes it,
// followed by ADDRESS, to ASM().
#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS)
|
||||
|
||||
@@ -265,59 +265,59 @@ void bli_dgemm_opt_24x8(
|
||||
VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1))
|
||||
VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2))
|
||||
#else
|
||||
PREFETCH(1, MEM(RCX ))
|
||||
PREFETCHW1(MEM(RCX ))
|
||||
SUBITER( 0,1,0,RAX )
|
||||
PREFETCH(1, MEM(RCX,R12,1))
|
||||
PREFETCHW1(MEM(RCX,R12,1))
|
||||
SUBITER( 1,0,1,RAX )
|
||||
PREFETCH(1, MEM(RCX,R12,2))
|
||||
PREFETCHW1(MEM(RCX,R12,2))
|
||||
SUBITER( 2,1,0,RAX )
|
||||
PREFETCH(1, MEM(RCX,R13,1))
|
||||
PREFETCHW1(MEM(RCX,R13,1))
|
||||
SUBITER( 3,0,1,RAX )
|
||||
PREFETCH(1, MEM(RCX,R12,4))
|
||||
PREFETCHW1(MEM(RCX,R12,4))
|
||||
SUBITER( 4,1,0,RAX,R8, 1)
|
||||
PREFETCH(1, MEM(RCX,R14,1))
|
||||
PREFETCHW1(MEM(RCX,R14,1))
|
||||
SUBITER( 5,0,1,RAX,R8, 1)
|
||||
PREFETCH(1, MEM(RCX,R13,2))
|
||||
PREFETCHW1(MEM(RCX,R13,2))
|
||||
SUBITER( 6,1,0,RAX,R8, 1)
|
||||
PREFETCH(1, MEM(RCX,R15,1))
|
||||
PREFETCHW1(MEM(RCX,R15,1))
|
||||
SUBITER( 7,0,1,RAX,R8, 1)
|
||||
|
||||
LEA(RDX, MEM(RCX,R12,8))
|
||||
|
||||
PREFETCH(1, MEM(RDX ))
|
||||
PREFETCHW1(MEM(RDX ))
|
||||
SUBITER( 8,1,0,RAX,R8, 2)
|
||||
PREFETCH(1, MEM(RDX,R12,1))
|
||||
PREFETCHW1(MEM(RDX,R12,1))
|
||||
SUBITER( 9,0,1,RAX,R8, 2)
|
||||
PREFETCH(1, MEM(RDX,R12,2))
|
||||
PREFETCHW1(MEM(RDX,R12,2))
|
||||
SUBITER(10,1,0,RAX,R8, 2)
|
||||
PREFETCH(1, MEM(RDX,R13,1))
|
||||
PREFETCHW1(MEM(RDX,R13,1))
|
||||
SUBITER(11,0,1,RAX,R8, 2)
|
||||
PREFETCH(1, MEM(RDX,R12,4))
|
||||
PREFETCHW1(MEM(RDX,R12,4))
|
||||
SUBITER(12,1,0,RAX,R9, 1)
|
||||
PREFETCH(1, MEM(RDX,R14,1))
|
||||
PREFETCHW1(MEM(RDX,R14,1))
|
||||
SUBITER(13,0,1,RAX,R9, 1)
|
||||
PREFETCH(1, MEM(RDX,R13,2))
|
||||
PREFETCHW1(MEM(RDX,R13,2))
|
||||
SUBITER(14,1,0,RAX,R9, 1)
|
||||
PREFETCH(1, MEM(RDX,R15,1))
|
||||
PREFETCHW1(MEM(RDX,R15,1))
|
||||
SUBITER(15,0,1,RAX,R9, 1)
|
||||
|
||||
LEA(RDX, MEM(RDX,R12,8))
|
||||
|
||||
PREFETCH(1, MEM(RDI ))
|
||||
PREFETCHW1(MEM(RDI ))
|
||||
SUBITER(16,1,0,RAX,R8, 4)
|
||||
PREFETCH(1, MEM(RDI,R12,1))
|
||||
PREFETCHW1(MEM(RDI,R12,1))
|
||||
SUBITER(17,0,1,RAX,R8, 4)
|
||||
PREFETCH(1, MEM(RDI,R12,2))
|
||||
PREFETCHW1(MEM(RDI,R12,2))
|
||||
SUBITER(18,1,0,RAX,R8, 4)
|
||||
PREFETCH(1, MEM(RDI,R13,1))
|
||||
PREFETCHW1(MEM(RDI,R13,1))
|
||||
SUBITER(19,0,1,RAX,R8, 4)
|
||||
PREFETCH(1, MEM(RDI,R12,4))
|
||||
PREFETCHW1(MEM(RDI,R12,4))
|
||||
SUBITER(20,1,0,RAX,R10,1)
|
||||
PREFETCH(1, MEM(RDI,R14,1))
|
||||
PREFETCHW1(MEM(RDI,R14,1))
|
||||
SUBITER(21,0,1,RAX,R10,1)
|
||||
PREFETCH(1, MEM(RDI,R13,2))
|
||||
PREFETCHW1(MEM(RDI,R13,2))
|
||||
SUBITER(22,1,0,RAX,R10,1)
|
||||
PREFETCH(1, MEM(RDI,R15,1))
|
||||
PREFETCHW1(MEM(RDI,R15,1))
|
||||
SUBITER(23,0,1,RAX,R10,1)
|
||||
|
||||
ADD(RAX, IMM(24*24*8))
|
||||
@@ -464,58 +464,58 @@ void bli_dgemm_opt_24x8(
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RCX ))
|
||||
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RCX,R12,1))
|
||||
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RCX,R12,2))
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX ))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,1))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R12,2))
|
||||
SUBITER(0,1,0,RAX )
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RCX,R13,1))
|
||||
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RCX,R12,4))
|
||||
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RCX,R14,1))
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,1))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,4))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R14,1))
|
||||
SUBITER(1,0,1,RAX )
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RCX,R13,2))
|
||||
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RCX,R15,1))
|
||||
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDX ))
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,2))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R15,1))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX ))
|
||||
SUBITER(2,1,0,RAX )
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDX,R12,1))
|
||||
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDX,R12,2))
|
||||
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDX,R13,1))
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,1))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,2))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,1))
|
||||
SUBITER(3,0,1,RAX )
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDX,R12,4))
|
||||
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDX,R14,1))
|
||||
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDX,R13,2))
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,2))
|
||||
SUBITER(4,1,0,RAX,R8,1)
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDX,R15,1))
|
||||
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDI ))
|
||||
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDI,R12,1))
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R15,1))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI ))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,1))
|
||||
SUBITER(5,0,1,RAX,R8,1)
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDI,R12,2))
|
||||
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDI,R13,1))
|
||||
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDI,R12,4))
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R12,2))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,1))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,4))
|
||||
SUBITER(6,1,0,RAX,R8,1)
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCH(0, MEM(RDI,R14,1))
|
||||
#define PREFETCH_C_L1_2 PREFETCH(0, MEM(RDI,R13,2))
|
||||
#define PREFETCH_C_L1_3 PREFETCH(0, MEM(RDI,R15,1))
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R14,1))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,2))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R15,1))
|
||||
SUBITER(7,0,1,RAX,R8,1)
|
||||
#endif
|
||||
|
||||
@@ -528,7 +528,7 @@ void bli_dgemm_opt_24x8(
|
||||
|
||||
LABEL(TAIL_LOOP)
|
||||
|
||||
PREFETCH(0, MEM(RDX))
|
||||
PREFETCHW0(MEM(RDX))
|
||||
ADD(RDX, R12)
|
||||
|
||||
SUBITER(0,1,0,RAX)
|
||||
|
||||
Reference in New Issue
Block a user