mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
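Don't use %rbp in KNL packing kernels.
The pointer to the packed buffer (p) is now held in %r14 in the 8xk kernel and in %r15 in the 24xk and 30xk kernels, and "rbp" is removed from the inline-asm clobber lists.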
This commit is contained in:
1
kernels/x86_64/knl/1m/bli_avx512_macros.h
Symbolic link
1
kernels/x86_64/knl/1m/bli_avx512_macros.h
Symbolic link
@@ -0,0 +1 @@
|
||||
../3/bli_avx512_macros.h
|
||||
@@ -32,7 +32,7 @@
|
||||
|
||||
*/
|
||||
|
||||
#include "../3/bli_avx512_macros.h"
|
||||
#include "bli_avx512_macros.h"
|
||||
#include "blis.h"
|
||||
|
||||
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
|
||||
@@ -122,7 +122,7 @@ void bli_dpackm_8xk_opt
|
||||
MOV(RAX, VAR(a))
|
||||
MOV(RBX, VAR(inca))
|
||||
MOV(RCX, VAR(lda))
|
||||
MOV(RBP, VAR(p))
|
||||
MOV(R14, VAR(p))
|
||||
MOV(RDI, VAR(ldp))
|
||||
|
||||
TEST(RSI, RSI)
|
||||
@@ -154,10 +154,10 @@ void bli_dpackm_8xk_opt
|
||||
LABEL(PACK8_N_LOOP)
|
||||
|
||||
LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
|
||||
STORE8x8(RBP,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7)
|
||||
STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7)
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,8))
|
||||
LEA(RBP, MEM(RBP,RDI,8))
|
||||
LEA(R14, MEM(R14,RDI,8))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
@@ -169,10 +169,10 @@ void bli_dpackm_8xk_opt
|
||||
LABEL(PACK8_N_TAIL)
|
||||
|
||||
VMULPD(ZMM(0), ZMM(31), MEM(RAX))
|
||||
VMOVUPD(MEM(RBP), ZMM(0))
|
||||
VMOVUPD(MEM(R14), ZMM(0))
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,1))
|
||||
LEA(RBP, MEM(RBP,RDI,1))
|
||||
LEA(R14, MEM(R14,RDI,1))
|
||||
|
||||
SUB(RDX, IMM(1))
|
||||
|
||||
@@ -199,10 +199,10 @@ void bli_dpackm_8xk_opt
|
||||
LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
|
||||
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
|
||||
16,17,18,19,20,21,22,23)
|
||||
STORE8x8(RBP,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,8))
|
||||
LEA(RBP, MEM(RBP,RDI,8))
|
||||
LEA(R14, MEM(R14,RDI,8))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
@@ -222,25 +222,25 @@ void bli_dpackm_8xk_opt
|
||||
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9,10,11,12,13,14,15)
|
||||
|
||||
VMOVUPD(MEM(RBP ), ZMM( 8))
|
||||
VMOVUPD(MEM(R14 ), ZMM( 8))
|
||||
SUB(RDX, IMM(1))
|
||||
JZ(PACK8_DONE)
|
||||
VMOVUPD(MEM(RBP,RDI,1), ZMM( 9))
|
||||
VMOVUPD(MEM(R14,RDI,1), ZMM( 9))
|
||||
SUB(RDX, IMM(1))
|
||||
JZ(PACK8_DONE)
|
||||
VMOVUPD(MEM(RBP,RDI,2), ZMM(10))
|
||||
VMOVUPD(MEM(R14,RDI,2), ZMM(10))
|
||||
SUB(RDX, IMM(1))
|
||||
JZ(PACK8_DONE)
|
||||
VMOVUPD(MEM(RBP,R11,1), ZMM(11))
|
||||
VMOVUPD(MEM(R14,R11,1), ZMM(11))
|
||||
SUB(RDX, IMM(1))
|
||||
JZ(PACK8_DONE)
|
||||
VMOVUPD(MEM(RBP,RDI,4), ZMM(12))
|
||||
VMOVUPD(MEM(R14,RDI,4), ZMM(12))
|
||||
SUB(RDX, IMM(1))
|
||||
JZ(PACK8_DONE)
|
||||
VMOVUPD(MEM(RBP,R12,1), ZMM(13))
|
||||
VMOVUPD(MEM(R14,R12,1), ZMM(13))
|
||||
SUB(RDX, IMM(1))
|
||||
JZ(PACK8_DONE)
|
||||
VMOVUPD(MEM(RBP,R11,2), ZMM(14))
|
||||
VMOVUPD(MEM(R14,R11,2), ZMM(14))
|
||||
|
||||
JMP(PACK8_DONE)
|
||||
|
||||
@@ -255,10 +255,10 @@ void bli_dpackm_8xk_opt
|
||||
KXNORW(K(1), K(0), K(0))
|
||||
VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8))
|
||||
VMULPD(ZMM(3), ZMM(3), ZMM(31))
|
||||
VMOVUPD(MEM(RBP), ZMM(3))
|
||||
VMOVUPD(MEM(R14), ZMM(3))
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,1))
|
||||
LEA(RBP, MEM(RBP,RDI,1))
|
||||
LEA(R14, MEM(R14,RDI,1))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
@@ -283,8 +283,8 @@ void bli_dpackm_8xk_opt
|
||||
"zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
|
||||
"zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
|
||||
"zmm30", "zmm31",
|
||||
"rax", "rbx", "rcx", "rdx", "rbp", "rdi", "rsi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "memory"
|
||||
"rax", "rbx", "rcx", "rdx", "rdi", "rsi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -308,12 +308,12 @@ void bli_dpackm_24xk_opt
|
||||
MOV(RAX, VAR(a))
|
||||
MOV(RBX, VAR(inca))
|
||||
MOV(RCX, VAR(lda))
|
||||
MOV(RBP, VAR(p))
|
||||
MOV(R15, VAR(p))
|
||||
MOV(RDI, VAR(ldp))
|
||||
|
||||
LEA(RBX, MEM(,RBX,8)) //inca in bytes
|
||||
LEA(RCX, MEM(,RCX,8)) //lda in bytes
|
||||
LEA(RDI, MEM(,RDI,8)) //ldp in bytes
|
||||
LEA(RCX, MEM(,RCX,8)) //lda in bytes
|
||||
LEA(RDI, MEM(,RDI,8)) //ldp in bytes
|
||||
LEA(R11, MEM(RDI,RDI,2)) //ldp*3
|
||||
LEA(R12, MEM(RDI,RDI,4)) //ldp*5
|
||||
LEA(R13, MEM(R11,RDI,4)) //ldp*7
|
||||
@@ -328,8 +328,6 @@ void bli_dpackm_24xk_opt
|
||||
|
||||
LABEL(PACK24_N)
|
||||
|
||||
MOV(RDX, RSI)
|
||||
AND(RDX, IMM(7))
|
||||
SAR(RSI, IMM(3))
|
||||
JZ(PACK24_N_TAIL)
|
||||
|
||||
@@ -342,35 +340,39 @@ void bli_dpackm_24xk_opt
|
||||
LOADMUL8x8(RAX, 0,RCX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
LOADMUL8x8(RAX, 64,RCX,R8,R9,R10, 8, 9,10,11,12,13,14,15)
|
||||
LOADMUL8x8(RAX,128,RCX,R8,R9,R10,16,17,18,19,20,21,22,23)
|
||||
STORE8x8(RBP, 0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
STORE8x8(RBP, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15)
|
||||
STORE8x8(RBP,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
STORE8x8(R15, 0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
STORE8x8(R15, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15)
|
||||
STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,8))
|
||||
LEA(RBP, MEM(RBP,RDI,8))
|
||||
LEA(R15, MEM(R15,RDI,8))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
JNZ(PACK24_N_LOOP)
|
||||
|
||||
TEST(RDX, RDX)
|
||||
LABEL(PACK24_N_TAIL)
|
||||
|
||||
MOV(RSI, VAR(n))
|
||||
AND(RSI, IMM(7))
|
||||
TEST(RSI, RSI)
|
||||
JZ(PACK24_DONE)
|
||||
|
||||
LABEL(PACK24_N_TAIL)
|
||||
LABEL(PACK24_N_TAIL_LOOP)
|
||||
|
||||
VMULPD(ZMM(0), ZMM(31), MEM(RAX, 0))
|
||||
VMULPD(ZMM(1), ZMM(31), MEM(RAX, 64))
|
||||
VMULPD(ZMM(2), ZMM(31), MEM(RAX,128))
|
||||
VMOVUPD(MEM(RBP, 0), ZMM(0))
|
||||
VMOVUPD(MEM(RBP, 64), ZMM(1))
|
||||
VMOVUPD(MEM(RBP,128), ZMM(2))
|
||||
VMOVUPD(MEM(R15, 0), ZMM(0))
|
||||
VMOVUPD(MEM(R15, 64), ZMM(1))
|
||||
VMOVUPD(MEM(R15,128), ZMM(2))
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,1))
|
||||
LEA(RBP, MEM(RBP,RDI,1))
|
||||
LEA(R15, MEM(R15,RDI,1))
|
||||
|
||||
SUB(RDX, IMM(1))
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
JNZ(PACK24_N_TAIL)
|
||||
JNZ(PACK24_N_TAIL_LOOP)
|
||||
|
||||
JMP(PACK24_DONE)
|
||||
|
||||
@@ -386,8 +388,6 @@ void bli_dpackm_24xk_opt
|
||||
LEA(R14, MEM(RAX,RBX,8))
|
||||
LEA(RCX, MEM(R14,RBX,8))
|
||||
|
||||
MOV(RDX, RSI)
|
||||
AND(RDX, IMM(7))
|
||||
SAR(RSI, IMM(3))
|
||||
JZ(PACK24_T_TAIL)
|
||||
|
||||
@@ -397,33 +397,35 @@ void bli_dpackm_24xk_opt
|
||||
LOADMUL8x8(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15)
|
||||
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
|
||||
16,17,18,19,20,21,22,23)
|
||||
STORE8x8(RBP, 0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
STORE8x8(R15, 0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
LOADMUL8x8(RCX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
TRANSPOSE8x8( 8, 9,10,11,12,13,14,15,
|
||||
16,17,18,19,20,21,22,23)
|
||||
STORE8x8(RBP, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
STORE8x8(R15, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
|
||||
16,17,18,19,20,21,22,23)
|
||||
STORE8x8(RBP,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
|
||||
LEA(RAX, MEM(RAX,64))
|
||||
LEA(R14, MEM(R14,64))
|
||||
LEA(RCX, MEM(RCX,64))
|
||||
LEA(RBP, MEM(RBP,RDI,8))
|
||||
LEA(R15, MEM(R15,RDI,8))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
JNZ(PACK24_T_LOOP)
|
||||
|
||||
TEST(RDX, RDX)
|
||||
JZ(PACK24_DONE)
|
||||
|
||||
LABEL(PACK24_T_TAIL)
|
||||
|
||||
MOV(RSI, IMM(1))
|
||||
SHLX(RSI, RSI, RDX)
|
||||
SUB(RSI, IMM(1))
|
||||
KMOV(K(1), ESI) //mask for n%8 elements
|
||||
MOV(RSI, VAR(n))
|
||||
AND(RSI, IMM(7))
|
||||
TEST(RSI, RSI)
|
||||
JZ(PACK24_DONE)
|
||||
|
||||
MOV(R13, IMM(1))
|
||||
SHLX(R13, R13, RSI)
|
||||
SUB(R13, IMM(1))
|
||||
KMOV(K(1), R13D) //mask for n%8 elements
|
||||
|
||||
LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1)
|
||||
LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1)
|
||||
@@ -435,39 +437,39 @@ void bli_dpackm_24xk_opt
|
||||
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9,10,11,12,13,14,15)
|
||||
|
||||
VMOVUPD(MEM(RBP, 0), ZMM( 8))
|
||||
VMOVUPD(MEM(RBP, 64), ZMM(16))
|
||||
VMOVUPD(MEM(RBP, 128), ZMM(24))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15, 0), ZMM( 8))
|
||||
VMOVUPD(MEM(R15, 64), ZMM(16))
|
||||
VMOVUPD(MEM(R15, 128), ZMM(24))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK24_DONE)
|
||||
VMOVUPD(MEM(RBP,RDI,1, 0), ZMM( 9))
|
||||
VMOVUPD(MEM(RBP,RDI,1, 64), ZMM(17))
|
||||
VMOVUPD(MEM(RBP,RDI,1,128), ZMM(25))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,RDI,1, 0), ZMM( 9))
|
||||
VMOVUPD(MEM(R15,RDI,1, 64), ZMM(17))
|
||||
VMOVUPD(MEM(R15,RDI,1,128), ZMM(25))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK24_DONE)
|
||||
VMOVUPD(MEM(RBP,RDI,2, 0), ZMM(10))
|
||||
VMOVUPD(MEM(RBP,RDI,2, 64), ZMM(18))
|
||||
VMOVUPD(MEM(RBP,RDI,2,128), ZMM(26))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,RDI,2, 0), ZMM(10))
|
||||
VMOVUPD(MEM(R15,RDI,2, 64), ZMM(18))
|
||||
VMOVUPD(MEM(R15,RDI,2,128), ZMM(26))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK24_DONE)
|
||||
VMOVUPD(MEM(RBP,R11,1, 0), ZMM(11))
|
||||
VMOVUPD(MEM(RBP,R11,1, 64), ZMM(19))
|
||||
VMOVUPD(MEM(RBP,R11,1,128), ZMM(27))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,R11,1, 0), ZMM(11))
|
||||
VMOVUPD(MEM(R15,R11,1, 64), ZMM(19))
|
||||
VMOVUPD(MEM(R15,R11,1,128), ZMM(27))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK24_DONE)
|
||||
VMOVUPD(MEM(RBP,RDI,4, 0), ZMM(12))
|
||||
VMOVUPD(MEM(RBP,RDI,4, 64), ZMM(20))
|
||||
VMOVUPD(MEM(RBP,RDI,4,128), ZMM(28))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,RDI,4, 0), ZMM(12))
|
||||
VMOVUPD(MEM(R15,RDI,4, 64), ZMM(20))
|
||||
VMOVUPD(MEM(R15,RDI,4,128), ZMM(28))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK24_DONE)
|
||||
VMOVUPD(MEM(RBP,R12,1, 0), ZMM(13))
|
||||
VMOVUPD(MEM(RBP,R12,1, 64), ZMM(21))
|
||||
VMOVUPD(MEM(RBP,R12,1,128), ZMM(29))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,R12,1, 0), ZMM(13))
|
||||
VMOVUPD(MEM(R15,R12,1, 64), ZMM(21))
|
||||
VMOVUPD(MEM(R15,R12,1,128), ZMM(29))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK24_DONE)
|
||||
VMOVUPD(MEM(RBP,R11,2, 0), ZMM(14))
|
||||
VMOVUPD(MEM(RBP,R11,2, 64), ZMM(22))
|
||||
VMOVUPD(MEM(RBP,R11,2,128), ZMM(30))
|
||||
VMOVUPD(MEM(R15,R11,2, 0), ZMM(14))
|
||||
VMOVUPD(MEM(R15,R11,2, 64), ZMM(22))
|
||||
VMOVUPD(MEM(R15,R11,2,128), ZMM(30))
|
||||
|
||||
JMP(PACK24_DONE)
|
||||
|
||||
@@ -490,12 +492,12 @@ void bli_dpackm_24xk_opt
|
||||
VMULPD(ZMM(3), ZMM(3), ZMM(31))
|
||||
VMULPD(ZMM(4), ZMM(4), ZMM(31))
|
||||
VMULPD(ZMM(5), ZMM(5), ZMM(31))
|
||||
VMOVUPD(MEM(RBP, 0), ZMM(3))
|
||||
VMOVUPD(MEM(RBP, 64), ZMM(4))
|
||||
VMOVUPD(MEM(RBP,128), ZMM(5))
|
||||
VMOVUPD(MEM(R15, 0), ZMM(3))
|
||||
VMOVUPD(MEM(R15, 64), ZMM(4))
|
||||
VMOVUPD(MEM(R15,128), ZMM(5))
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,1))
|
||||
LEA(RBP, MEM(RBP,RDI,1))
|
||||
LEA(R15, MEM(R15,RDI,1))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
@@ -520,7 +522,7 @@ void bli_dpackm_24xk_opt
|
||||
"zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
|
||||
"zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
|
||||
"zmm30", "zmm31",
|
||||
"rax", "rbx", "rcx", "rdx", "rbp", "rdi", "rsi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
|
||||
"rax", "rbx", "rcx", "rdi", "rsi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@
|
||||
|
||||
*/
|
||||
|
||||
#include "../3/bli_avx512_macros.h"
|
||||
#include "bli_avx512_macros.h"
|
||||
#include "blis.h"
|
||||
|
||||
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
|
||||
@@ -150,7 +150,7 @@ void bli_dpackm_30xk_opt
|
||||
MOV(RAX, VAR(a))
|
||||
MOV(RBX, VAR(inca))
|
||||
MOV(RCX, VAR(lda))
|
||||
MOV(RBP, VAR(p))
|
||||
MOV(R15, VAR(p))
|
||||
MOV(RDI, VAR(ldp))
|
||||
|
||||
LEA(RBX, MEM(,RBX,8)) //inca in bytes
|
||||
@@ -184,14 +184,14 @@ void bli_dpackm_30xk_opt
|
||||
LOADMUL8x8(RAX, 0,RCX,R8, R9, R10, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
LOADMUL8x8(RAX, 64,RCX,R8, R9, R10, 8, 9,10,11,12,13,14,15)
|
||||
LOADMUL8x8(RAX,128,RCX,R8, R9, R10,16,17,18,19,20,21,22,23)
|
||||
STORE8x8 (RBP, 0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
STORE8x8 (RBP, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15)
|
||||
STORE8x8 (RBP,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
STORE8x8 (R15, 0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
STORE8x8 (R15, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15)
|
||||
STORE8x8 (R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
LOADMUL8x6(RAX,192,RCX,R8, R9, R10, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
STORE8x8 (RBP,192,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
STORE8x8 (R15,192,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,8))
|
||||
LEA(RBP, MEM(RBP,RDI,8))
|
||||
LEA(R15, MEM(R15,RDI,8))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
@@ -209,13 +209,13 @@ void bli_dpackm_30xk_opt
|
||||
VMULPD(ZMM(1), ZMM(31), MEM(RAX, 64))
|
||||
VMULPD(ZMM(2), ZMM(31), MEM(RAX,128))
|
||||
VMULPD(ZMM(3) MASK_KZ(7), ZMM(31), MEM(RAX,192))
|
||||
VMOVUPD(MEM(RBP, 0), ZMM(0))
|
||||
VMOVUPD(MEM(RBP, 64), ZMM(1))
|
||||
VMOVUPD(MEM(RBP,128), ZMM(2))
|
||||
VMOVUPD(MEM(RBP,192), ZMM(3))
|
||||
VMOVUPD(MEM(R15, 0), ZMM(0))
|
||||
VMOVUPD(MEM(R15, 64), ZMM(1))
|
||||
VMOVUPD(MEM(R15,128), ZMM(2))
|
||||
VMOVUPD(MEM(R15,192), ZMM(3))
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,1))
|
||||
LEA(RBP, MEM(RBP,RDI,1))
|
||||
LEA(R15, MEM(R15,RDI,1))
|
||||
|
||||
SUB(RDX, IMM(1))
|
||||
|
||||
@@ -235,8 +235,6 @@ void bli_dpackm_30xk_opt
|
||||
LEA(R14, MEM(RAX,RBX,8))
|
||||
LEA(RCX, MEM(R14,RBX,8))
|
||||
|
||||
MOV(RDX, RSI)
|
||||
AND(RDX, IMM(7))
|
||||
SAR(RSI, IMM(3))
|
||||
JZ(PACK30_T_TAIL)
|
||||
|
||||
@@ -246,39 +244,40 @@ void bli_dpackm_30xk_opt
|
||||
LOADMUL8x8(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15)
|
||||
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
|
||||
16,17,18,19,20,21,22,23)
|
||||
STORE8x8(RBP, 0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
STORE8x8(R15, 0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
LOADMUL8x8(RCX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
TRANSPOSE8x8( 8, 9,10,11,12,13,14,15,
|
||||
16,17,18,19,20,21,22,23)
|
||||
STORE8x8(RBP, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
STORE8x8(R15, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
LEA(RCX, MEM(RCX,RBX,8))
|
||||
LOADMUL6x8(RCX,0,RBX,R8,R9, 8, 9,10,11,12,13)
|
||||
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
|
||||
16,17,18,19,20,21,22,23)
|
||||
STORE8x8(RBP,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
|
||||
TRANSPOSE8x8( 8, 9,10,11,12,13,14,15,
|
||||
0, 1, 2, 3, 4, 5, 6, 7)
|
||||
STORE8x8(RBP,192,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
STORE8x8(R15,192,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7)
|
||||
|
||||
LEA(RAX, MEM(RAX,64))
|
||||
LEA(R14, MEM(R14,64))
|
||||
LEA(RCX, MEM(R14,RBX,8))
|
||||
LEA(RBP, MEM(RBP,RDI,8))
|
||||
LEA(R15, MEM(R15,RDI,8))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
JNZ(PACK30_T_LOOP)
|
||||
|
||||
TEST(RDX, RDX)
|
||||
JZ(PACK30_DONE)
|
||||
|
||||
LABEL(PACK30_T_TAIL)
|
||||
|
||||
MOV(RSI, IMM(1))
|
||||
SHLX(RSI, RSI, RDX)
|
||||
SUB(RSI, IMM(1))
|
||||
KMOV(K(1), ESI) //mask for n%8 elements
|
||||
MOV(RSI, RDX)
|
||||
MOV(RSI, VAR(n))
|
||||
AND(RSI, IMM(7))
|
||||
TEST(RSI, RSI)
|
||||
JZ(PACK30_DONE)
|
||||
|
||||
MOV(R13, IMM(1))
|
||||
SHLX(R13, R13, RSI)
|
||||
SUB(R13, IMM(1))
|
||||
KMOV(K(1), R13D) //mask for n%8 elements
|
||||
|
||||
LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1)
|
||||
LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1)
|
||||
@@ -290,43 +289,44 @@ void bli_dpackm_30xk_opt
|
||||
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9,10,11,12,13,14,15)
|
||||
|
||||
VMOVUPD(MEM(RBP, 0), ZMM( 8))
|
||||
VMOVUPD(MEM(RBP, 64), ZMM(16))
|
||||
VMOVUPD(MEM(RBP, 128), ZMM(24))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15, 0), ZMM( 8))
|
||||
VMOVUPD(MEM(R15, 64), ZMM(16))
|
||||
VMOVUPD(MEM(R15, 128), ZMM(24))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK30_T_ALMOST_DONE)
|
||||
VMOVUPD(MEM(RBP,RDI,1, 0), ZMM( 9))
|
||||
VMOVUPD(MEM(RBP,RDI,1, 64), ZMM(17))
|
||||
VMOVUPD(MEM(RBP,RDI,1,128), ZMM(25))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,RDI,1, 0), ZMM( 9))
|
||||
VMOVUPD(MEM(R15,RDI,1, 64), ZMM(17))
|
||||
VMOVUPD(MEM(R15,RDI,1,128), ZMM(25))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK30_T_ALMOST_DONE)
|
||||
VMOVUPD(MEM(RBP,RDI,2, 0), ZMM(10))
|
||||
VMOVUPD(MEM(RBP,RDI,2, 64), ZMM(18))
|
||||
VMOVUPD(MEM(RBP,RDI,2,128), ZMM(26))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,RDI,2, 0), ZMM(10))
|
||||
VMOVUPD(MEM(R15,RDI,2, 64), ZMM(18))
|
||||
VMOVUPD(MEM(R15,RDI,2,128), ZMM(26))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK30_T_ALMOST_DONE)
|
||||
VMOVUPD(MEM(RBP,R11,1, 0), ZMM(11))
|
||||
VMOVUPD(MEM(RBP,R11,1, 64), ZMM(19))
|
||||
VMOVUPD(MEM(RBP,R11,1,128), ZMM(27))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,R11,1, 0), ZMM(11))
|
||||
VMOVUPD(MEM(R15,R11,1, 64), ZMM(19))
|
||||
VMOVUPD(MEM(R15,R11,1,128), ZMM(27))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK30_T_ALMOST_DONE)
|
||||
VMOVUPD(MEM(RBP,RDI,4, 0), ZMM(12))
|
||||
VMOVUPD(MEM(RBP,RDI,4, 64), ZMM(20))
|
||||
VMOVUPD(MEM(RBP,RDI,4,128), ZMM(28))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,RDI,4, 0), ZMM(12))
|
||||
VMOVUPD(MEM(R15,RDI,4, 64), ZMM(20))
|
||||
VMOVUPD(MEM(R15,RDI,4,128), ZMM(28))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK30_T_ALMOST_DONE)
|
||||
VMOVUPD(MEM(RBP,R12,1, 0), ZMM(13))
|
||||
VMOVUPD(MEM(RBP,R12,1, 64), ZMM(21))
|
||||
VMOVUPD(MEM(RBP,R12,1,128), ZMM(29))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,R12,1, 0), ZMM(13))
|
||||
VMOVUPD(MEM(R15,R12,1, 64), ZMM(21))
|
||||
VMOVUPD(MEM(R15,R12,1,128), ZMM(29))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK30_T_ALMOST_DONE)
|
||||
VMOVUPD(MEM(RBP,R11,2, 0), ZMM(14))
|
||||
VMOVUPD(MEM(RBP,R11,2, 64), ZMM(22))
|
||||
VMOVUPD(MEM(RBP,R11,2,128), ZMM(30))
|
||||
VMOVUPD(MEM(R15,R11,2, 0), ZMM(14))
|
||||
VMOVUPD(MEM(R15,R11,2, 64), ZMM(22))
|
||||
VMOVUPD(MEM(R15,R11,2,128), ZMM(30))
|
||||
|
||||
LABEL(PACK30_T_ALMOST_DONE)
|
||||
|
||||
MOV(RDX, RSI)
|
||||
MOV(RSI, VAR(n))
|
||||
AND(RSI, IMM(7))
|
||||
VBROADCASTSD(ZMM(31), VAR(kappa))
|
||||
|
||||
LEA(RAX, MEM(RCX,RBX,8))
|
||||
@@ -334,25 +334,25 @@ void bli_dpackm_30xk_opt
|
||||
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9,10,11,12,13,14,15)
|
||||
|
||||
VMOVUPD(MEM(RBP, 192), ZMM( 8))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15, 192), ZMM( 8))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK30_DONE)
|
||||
VMOVUPD(MEM(RBP,RDI,1,192), ZMM( 9))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,RDI,1,192), ZMM( 9))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK30_DONE)
|
||||
VMOVUPD(MEM(RBP,RDI,2,192), ZMM(10))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,RDI,2,192), ZMM(10))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK30_DONE)
|
||||
VMOVUPD(MEM(RBP,R11,1,192), ZMM(11))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,R11,1,192), ZMM(11))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK30_DONE)
|
||||
VMOVUPD(MEM(RBP,RDI,4,192), ZMM(12))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,RDI,4,192), ZMM(12))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK30_DONE)
|
||||
VMOVUPD(MEM(RBP,R12,1,192), ZMM(13))
|
||||
SUB(RDX, IMM(1))
|
||||
VMOVUPD(MEM(R15,R12,1,192), ZMM(13))
|
||||
SUB(RSI, IMM(1))
|
||||
JZ(PACK30_DONE)
|
||||
VMOVUPD(MEM(RBP,R11,2,192), ZMM(14))
|
||||
VMOVUPD(MEM(R15,R11,2,192), ZMM(14))
|
||||
|
||||
JMP(PACK30_DONE)
|
||||
|
||||
@@ -379,13 +379,13 @@ void bli_dpackm_30xk_opt
|
||||
VMULPD(ZMM(5), ZMM(5), ZMM(31))
|
||||
VMULPD(ZMM(6), ZMM(6), ZMM(31))
|
||||
VMULPD(ZMM(7), ZMM(7), ZMM(31))
|
||||
VMOVUPD(MEM(RBP, 0), ZMM(4))
|
||||
VMOVUPD(MEM(RBP, 64), ZMM(5))
|
||||
VMOVUPD(MEM(RBP,128), ZMM(6))
|
||||
VMOVUPD(MEM(RBP,192), ZMM(7))
|
||||
VMOVUPD(MEM(R15, 0), ZMM(4))
|
||||
VMOVUPD(MEM(R15, 64), ZMM(5))
|
||||
VMOVUPD(MEM(R15,128), ZMM(6))
|
||||
VMOVUPD(MEM(R15,192), ZMM(7))
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,1))
|
||||
LEA(RBP, MEM(RBP,RDI,1))
|
||||
LEA(R15, MEM(R15,RDI,1))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
@@ -410,7 +410,7 @@ void bli_dpackm_30xk_opt
|
||||
"zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
|
||||
"zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
|
||||
"zmm30", "zmm31",
|
||||
"rax", "rbx", "rcx", "rdx", "rbp", "rdi", "rsi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
|
||||
"rax", "rbx", "rcx", "rdi", "rsi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user