From cd5b6681838899283cd94e5427dfda206e7fbabe Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 25 Oct 2016 13:49:27 -0500 Subject: [PATCH] Don't use %rbp in KNL packing kernels. --- kernels/x86_64/knl/1m/bli_avx512_macros.h | 1 + kernels/x86_64/knl/1m/bli_packm_opt_24x8.c | 166 +++++++++++---------- kernels/x86_64/knl/1m/bli_packm_opt_30x8.c | 150 +++++++++---------- 3 files changed, 160 insertions(+), 157 deletions(-) create mode 120000 kernels/x86_64/knl/1m/bli_avx512_macros.h diff --git a/kernels/x86_64/knl/1m/bli_avx512_macros.h b/kernels/x86_64/knl/1m/bli_avx512_macros.h new file mode 120000 index 000000000..3b1d1779e --- /dev/null +++ b/kernels/x86_64/knl/1m/bli_avx512_macros.h @@ -0,0 +1 @@ +../3/bli_avx512_macros.h \ No newline at end of file diff --git a/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c b/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c index e78b2a721..dba0e88b9 100644 --- a/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c +++ b/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c @@ -32,7 +32,7 @@ */ -#include "../3/bli_avx512_macros.h" +#include "bli_avx512_macros.h" #include "blis.h" #define LOADMUL8x8(a,o,s1,s3,s5,s7, \ @@ -122,7 +122,7 @@ void bli_dpackm_8xk_opt MOV(RAX, VAR(a)) MOV(RBX, VAR(inca)) MOV(RCX, VAR(lda)) - MOV(RBP, VAR(p)) + MOV(R14, VAR(p)) MOV(RDI, VAR(ldp)) TEST(RSI, RSI) @@ -154,10 +154,10 @@ void bli_dpackm_8xk_opt LABEL(PACK8_N_LOOP) LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) - STORE8x8(RBP,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7) + STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7) LEA(RAX, MEM(RAX,RCX,8)) - LEA(RBP, MEM(RBP,RDI,8)) + LEA(R14, MEM(R14,RDI,8)) SUB(RSI, IMM(1)) @@ -169,10 +169,10 @@ void bli_dpackm_8xk_opt LABEL(PACK8_N_TAIL) VMULPD(ZMM(0), ZMM(31), MEM(RAX)) - VMOVUPD(MEM(RBP), ZMM(0)) + VMOVUPD(MEM(R14), ZMM(0)) LEA(RAX, MEM(RAX,RCX,1)) - LEA(RBP, MEM(RBP,RDI,1)) + LEA(R14, MEM(R14,RDI,1)) SUB(RDX, IMM(1)) @@ -199,10 +199,10 @@ void bli_dpackm_8xk_opt LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 16,17,18,19,20,21,22,23) - STORE8x8(RBP,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LEA(RAX, MEM(RAX,RCX,8)) - LEA(RBP, MEM(RBP,RDI,8)) + LEA(R14, MEM(R14,RDI,8)) SUB(RSI, IMM(1)) @@ -222,25 +222,25 @@ void bli_dpackm_8xk_opt TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15) - VMOVUPD(MEM(RBP ), ZMM( 8)) + VMOVUPD(MEM(R14 ), ZMM( 8)) SUB(RDX, IMM(1)) JZ(PACK8_DONE) - VMOVUPD(MEM(RBP,RDI,1), ZMM( 9)) + VMOVUPD(MEM(R14,RDI,1), ZMM( 9)) SUB(RDX, IMM(1)) JZ(PACK8_DONE) - VMOVUPD(MEM(RBP,RDI,2), ZMM(10)) + VMOVUPD(MEM(R14,RDI,2), ZMM(10)) SUB(RDX, IMM(1)) JZ(PACK8_DONE) - VMOVUPD(MEM(RBP,R11,1), ZMM(11)) + VMOVUPD(MEM(R14,R11,1), ZMM(11)) SUB(RDX, IMM(1)) JZ(PACK8_DONE) - VMOVUPD(MEM(RBP,RDI,4), ZMM(12)) + VMOVUPD(MEM(R14,RDI,4), ZMM(12)) SUB(RDX, IMM(1)) JZ(PACK8_DONE) - VMOVUPD(MEM(RBP,R12,1), ZMM(13)) + VMOVUPD(MEM(R14,R12,1), ZMM(13)) SUB(RDX, IMM(1)) JZ(PACK8_DONE) - VMOVUPD(MEM(RBP,R11,2), ZMM(14)) + VMOVUPD(MEM(R14,R11,2), ZMM(14)) JMP(PACK8_DONE) @@ -255,10 +255,10 @@ void bli_dpackm_8xk_opt KXNORW(K(1), K(0), K(0)) VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8)) VMULPD(ZMM(3), ZMM(3), ZMM(31)) - VMOVUPD(MEM(RBP), ZMM(3)) + VMOVUPD(MEM(R14), ZMM(3)) LEA(RAX, MEM(RAX,RCX,1)) - LEA(RBP, MEM(RBP,RDI,1)) + LEA(R14, MEM(R14,RDI,1)) SUB(RSI, IMM(1)) @@ -283,8 +283,8 @@ void bli_dpackm_8xk_opt "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "rax", "rbx", "rcx", "rdx", "rbp", "rdi", "rsi", - "r8", "r9", "r10", "r11", "r12", "r13", "memory" + "rax", "rbx", "rcx", "rdx", "rdi", "rsi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" ); } @@ -308,12 +308,12 @@ void bli_dpackm_24xk_opt MOV(RAX, VAR(a)) MOV(RBX, VAR(inca)) MOV(RCX, VAR(lda)) - MOV(RBP, VAR(p)) + MOV(R15, VAR(p)) MOV(RDI, VAR(ldp)) LEA(RBX, MEM(,RBX,8)) //inca in bytes - LEA(RCX, MEM(,RCX,8)) //lda in bytes - LEA(RDI, MEM(,RDI,8)) //ldp in bytes + LEA(RCX, MEM(,RCX,8)) //lda in bytes + LEA(RDI, MEM(,RDI,8)) //ldp in bytes LEA(R11, MEM(RDI,RDI,2)) //ldp*3 LEA(R12, MEM(RDI,RDI,4)) //ldp*5 LEA(R13, MEM(R11,RDI,4)) //ldp*7 @@ -328,8 +328,6 @@ void bli_dpackm_24xk_opt LABEL(PACK24_N) - MOV(RDX, RSI) - AND(RDX, IMM(7)) SAR(RSI, IMM(3)) JZ(PACK24_N_TAIL) @@ -342,35 +340,39 @@ void bli_dpackm_24xk_opt LOADMUL8x8(RAX, 0,RCX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7) LOADMUL8x8(RAX, 64,RCX,R8,R9,R10, 8, 9,10,11,12,13,14,15) LOADMUL8x8(RAX,128,RCX,R8,R9,R10,16,17,18,19,20,21,22,23) - STORE8x8(RBP, 0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) - STORE8x8(RBP, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15) - STORE8x8(RBP,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + STORE8x8(R15, 0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) + STORE8x8(R15, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15) + STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LEA(RAX, MEM(RAX,RCX,8)) - LEA(RBP, MEM(RBP,RDI,8)) + LEA(R15, MEM(R15,RDI,8)) SUB(RSI, IMM(1)) JNZ(PACK24_N_LOOP) - TEST(RDX, RDX) + LABEL(PACK24_N_TAIL) + + MOV(RSI, VAR(n)) + AND(RSI, IMM(7)) + TEST(RSI, RSI) JZ(PACK24_DONE) - LABEL(PACK24_N_TAIL) + LABEL(PACK24_N_TAIL_LOOP) VMULPD(ZMM(0), ZMM(31), MEM(RAX, 0)) VMULPD(ZMM(1), ZMM(31), MEM(RAX, 64)) VMULPD(ZMM(2), ZMM(31), MEM(RAX,128)) - VMOVUPD(MEM(RBP, 0), ZMM(0)) - VMOVUPD(MEM(RBP, 64), ZMM(1)) - VMOVUPD(MEM(RBP,128), ZMM(2)) + VMOVUPD(MEM(R15, 0), ZMM(0)) + VMOVUPD(MEM(R15, 64), ZMM(1)) + VMOVUPD(MEM(R15,128), ZMM(2)) LEA(RAX, MEM(RAX,RCX,1)) - LEA(RBP, MEM(RBP,RDI,1)) + LEA(R15, MEM(R15,RDI,1)) - SUB(RDX, IMM(1)) + SUB(RSI, IMM(1)) - JNZ(PACK24_N_TAIL) + JNZ(PACK24_N_TAIL_LOOP) JMP(PACK24_DONE) @@ -386,8 +388,6 @@ void bli_dpackm_24xk_opt LEA(R14, MEM(RAX,RBX,8)) LEA(RCX, MEM(R14,RBX,8)) - MOV(RDX, RSI) - AND(RDX, IMM(7)) SAR(RSI, IMM(3)) JZ(PACK24_T_TAIL) @@ -397,33 +397,35 @@ void bli_dpackm_24xk_opt LOADMUL8x8(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 16,17,18,19,20,21,22,23) - STORE8x8(RBP, 0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + STORE8x8(R15, 0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LOADMUL8x8(RCX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7) TRANSPOSE8x8( 8, 9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23) - STORE8x8(RBP, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + STORE8x8(R15, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 16,17,18,19,20,21,22,23) - STORE8x8(RBP,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LEA(RAX, MEM(RAX,64)) LEA(R14, MEM(R14,64)) LEA(RCX, MEM(RCX,64)) - LEA(RBP, MEM(RBP,RDI,8)) + LEA(R15, MEM(R15,RDI,8)) SUB(RSI, IMM(1)) JNZ(PACK24_T_LOOP) - TEST(RDX, RDX) - JZ(PACK24_DONE) - LABEL(PACK24_T_TAIL) - MOV(RSI, IMM(1)) - SHLX(RSI, RSI, RDX) - SUB(RSI, IMM(1)) - KMOV(K(1), ESI) //mask for n%8 elements + MOV(RSI, VAR(n)) + AND(RSI, IMM(7)) + TEST(RSI, RSI) + JZ(PACK24_DONE) + + MOV(R13, IMM(1)) + SHLX(R13, R13, RSI) + SUB(R13, IMM(1)) + KMOV(K(1), R13D) //mask for n%8 elements LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1) LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1) @@ -435,39 +437,39 @@ void bli_dpackm_24xk_opt TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15) - VMOVUPD(MEM(RBP, 0), ZMM( 8)) - VMOVUPD(MEM(RBP, 64), ZMM(16)) - VMOVUPD(MEM(RBP, 128), ZMM(24)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15, 0), ZMM( 8)) + VMOVUPD(MEM(R15, 64), ZMM(16)) + VMOVUPD(MEM(R15, 128), ZMM(24)) + SUB(RSI, IMM(1)) JZ(PACK24_DONE) - VMOVUPD(MEM(RBP,RDI,1, 0), ZMM( 9)) - VMOVUPD(MEM(RBP,RDI,1, 64), ZMM(17)) - VMOVUPD(MEM(RBP,RDI,1,128), ZMM(25)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,RDI,1, 0), ZMM( 9)) + VMOVUPD(MEM(R15,RDI,1, 64), ZMM(17)) + VMOVUPD(MEM(R15,RDI,1,128), ZMM(25)) + SUB(RSI, IMM(1)) JZ(PACK24_DONE) - VMOVUPD(MEM(RBP,RDI,2, 0), ZMM(10)) - VMOVUPD(MEM(RBP,RDI,2, 64), ZMM(18)) - VMOVUPD(MEM(RBP,RDI,2,128), ZMM(26)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,RDI,2, 0), ZMM(10)) + VMOVUPD(MEM(R15,RDI,2, 64), ZMM(18)) + VMOVUPD(MEM(R15,RDI,2,128), ZMM(26)) + SUB(RSI, IMM(1)) JZ(PACK24_DONE) - VMOVUPD(MEM(RBP,R11,1, 0), ZMM(11)) - VMOVUPD(MEM(RBP,R11,1, 64), ZMM(19)) - VMOVUPD(MEM(RBP,R11,1,128), ZMM(27)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,R11,1, 0), ZMM(11)) + VMOVUPD(MEM(R15,R11,1, 64), ZMM(19)) + VMOVUPD(MEM(R15,R11,1,128), ZMM(27)) + SUB(RSI, IMM(1)) JZ(PACK24_DONE) - VMOVUPD(MEM(RBP,RDI,4, 0), ZMM(12)) - VMOVUPD(MEM(RBP,RDI,4, 64), ZMM(20)) - VMOVUPD(MEM(RBP,RDI,4,128), ZMM(28)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,RDI,4, 0), ZMM(12)) + VMOVUPD(MEM(R15,RDI,4, 64), ZMM(20)) + VMOVUPD(MEM(R15,RDI,4,128), ZMM(28)) + SUB(RSI, IMM(1)) JZ(PACK24_DONE) - VMOVUPD(MEM(RBP,R12,1, 0), ZMM(13)) - VMOVUPD(MEM(RBP,R12,1, 64), ZMM(21)) - VMOVUPD(MEM(RBP,R12,1,128), ZMM(29)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,R12,1, 0), ZMM(13)) + VMOVUPD(MEM(R15,R12,1, 64), ZMM(21)) + VMOVUPD(MEM(R15,R12,1,128), ZMM(29)) + SUB(RSI, IMM(1)) JZ(PACK24_DONE) - VMOVUPD(MEM(RBP,R11,2, 0), ZMM(14)) - VMOVUPD(MEM(RBP,R11,2, 64), ZMM(22)) - VMOVUPD(MEM(RBP,R11,2,128), ZMM(30)) + VMOVUPD(MEM(R15,R11,2, 0), ZMM(14)) + VMOVUPD(MEM(R15,R11,2, 64), ZMM(22)) + VMOVUPD(MEM(R15,R11,2,128), ZMM(30)) JMP(PACK24_DONE) @@ -490,12 +492,12 @@ void bli_dpackm_24xk_opt VMULPD(ZMM(3), ZMM(3), ZMM(31)) VMULPD(ZMM(4), ZMM(4), ZMM(31)) VMULPD(ZMM(5), ZMM(5), ZMM(31)) - VMOVUPD(MEM(RBP, 0), ZMM(3)) - VMOVUPD(MEM(RBP, 64), ZMM(4)) - VMOVUPD(MEM(RBP,128), ZMM(5)) + VMOVUPD(MEM(R15, 0), ZMM(3)) + VMOVUPD(MEM(R15, 64), ZMM(4)) + VMOVUPD(MEM(R15,128), ZMM(5)) LEA(RAX, MEM(RAX,RCX,1)) - LEA(RBP, MEM(RBP,RDI,1)) + LEA(R15, MEM(R15,RDI,1)) SUB(RSI, IMM(1)) @@ -520,7 +522,7 @@ void bli_dpackm_24xk_opt "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "rax", "rbx", "rcx", "rdx", "rbp", "rdi", "rsi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" + "rax", "rbx", "rcx", "rdi", "rsi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory" ); } diff --git a/kernels/x86_64/knl/1m/bli_packm_opt_30x8.c b/kernels/x86_64/knl/1m/bli_packm_opt_30x8.c index 138ead5d6..eeab3c71d 100644 --- a/kernels/x86_64/knl/1m/bli_packm_opt_30x8.c +++ b/kernels/x86_64/knl/1m/bli_packm_opt_30x8.c @@ -32,7 +32,7 @@ */ -#include "../3/bli_avx512_macros.h" +#include "bli_avx512_macros.h" #include "blis.h" #define LOADMUL8x8(a,o,s1,s3,s5,s7, \ @@ -150,7 +150,7 @@ void bli_dpackm_30xk_opt MOV(RAX, VAR(a)) MOV(RBX, VAR(inca)) MOV(RCX, VAR(lda)) - MOV(RBP, VAR(p)) + MOV(R15, VAR(p)) MOV(RDI, VAR(ldp)) LEA(RBX, MEM(,RBX,8)) //inca in bytes @@ -184,14 +184,14 @@ void bli_dpackm_30xk_opt LOADMUL8x8(RAX, 0,RCX,R8, R9, R10, 0, 1, 2, 3, 4, 5, 6, 7) LOADMUL8x8(RAX, 64,RCX,R8, R9, R10, 8, 9,10,11,12,13,14,15) LOADMUL8x8(RAX,128,RCX,R8, R9, R10,16,17,18,19,20,21,22,23) - STORE8x8 (RBP, 0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) - STORE8x8 (RBP, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15) - STORE8x8 (RBP,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + STORE8x8 (R15, 0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) + STORE8x8 (R15, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15) + STORE8x8 (R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LOADMUL8x6(RAX,192,RCX,R8, R9, R10, 0, 1, 2, 3, 4, 5, 6, 7) - STORE8x8 (RBP,192,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) + STORE8x8 (R15,192,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) LEA(RAX, MEM(RAX,RCX,8)) - LEA(RBP, MEM(RBP,RDI,8)) + LEA(R15, MEM(R15,RDI,8)) SUB(RSI, IMM(1)) @@ -209,13 +209,13 @@ void bli_dpackm_30xk_opt VMULPD(ZMM(1), ZMM(31), MEM(RAX, 64)) VMULPD(ZMM(2), ZMM(31), MEM(RAX,128)) VMULPD(ZMM(3) MASK_KZ(7), ZMM(31), MEM(RAX,192)) - VMOVUPD(MEM(RBP, 0), ZMM(0)) - VMOVUPD(MEM(RBP, 64), ZMM(1)) - VMOVUPD(MEM(RBP,128), ZMM(2)) - VMOVUPD(MEM(RBP,192), ZMM(3)) + VMOVUPD(MEM(R15, 0), ZMM(0)) + VMOVUPD(MEM(R15, 64), ZMM(1)) + VMOVUPD(MEM(R15,128), ZMM(2)) + VMOVUPD(MEM(R15,192), ZMM(3)) LEA(RAX, MEM(RAX,RCX,1)) - LEA(RBP, MEM(RBP,RDI,1)) + LEA(R15, MEM(R15,RDI,1)) SUB(RDX, IMM(1)) @@ -235,8 +235,6 @@ void bli_dpackm_30xk_opt LEA(R14, MEM(RAX,RBX,8)) LEA(RCX, MEM(R14,RBX,8)) - MOV(RDX, RSI) - AND(RDX, IMM(7)) SAR(RSI, IMM(3)) JZ(PACK30_T_TAIL) @@ -246,39 +244,40 @@ void bli_dpackm_30xk_opt LOADMUL8x8(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 16,17,18,19,20,21,22,23) - STORE8x8(RBP, 0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + STORE8x8(R15, 0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LOADMUL8x8(RCX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7) TRANSPOSE8x8( 8, 9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23) - STORE8x8(RBP, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + STORE8x8(R15, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LEA(RCX, MEM(RCX,RBX,8)) LOADMUL6x8(RCX,0,RBX,R8,R9, 8, 9,10,11,12,13) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 16,17,18,19,20,21,22,23) - STORE8x8(RBP,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) TRANSPOSE8x8( 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7) - STORE8x8(RBP,192,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) + STORE8x8(R15,192,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) LEA(RAX, MEM(RAX,64)) LEA(R14, MEM(R14,64)) LEA(RCX, MEM(R14,RBX,8)) - LEA(RBP, MEM(RBP,RDI,8)) + LEA(R15, MEM(R15,RDI,8)) SUB(RSI, IMM(1)) JNZ(PACK30_T_LOOP) - TEST(RDX, RDX) - JZ(PACK30_DONE) - LABEL(PACK30_T_TAIL) - MOV(RSI, IMM(1)) - SHLX(RSI, RSI, RDX) - SUB(RSI, IMM(1)) - KMOV(K(1), ESI) //mask for n%8 elements - MOV(RSI, RDX) + MOV(RSI, VAR(n)) + AND(RSI, IMM(7)) + TEST(RSI, RSI) + JZ(PACK30_DONE) + + MOV(R13, IMM(1)) + SHLX(R13, R13, RSI) + SUB(R13, IMM(1)) + KMOV(K(1), R13D) //mask for n%8 elements LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1) LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1) @@ -290,43 +289,44 @@ void bli_dpackm_30xk_opt TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15) - VMOVUPD(MEM(RBP, 0), ZMM( 8)) - VMOVUPD(MEM(RBP, 64), ZMM(16)) - VMOVUPD(MEM(RBP, 128), ZMM(24)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15, 0), ZMM( 8)) + VMOVUPD(MEM(R15, 64), ZMM(16)) + VMOVUPD(MEM(R15, 128), ZMM(24)) + SUB(RSI, IMM(1)) JZ(PACK30_T_ALMOST_DONE) - VMOVUPD(MEM(RBP,RDI,1, 0), ZMM( 9)) - VMOVUPD(MEM(RBP,RDI,1, 64), ZMM(17)) - VMOVUPD(MEM(RBP,RDI,1,128), ZMM(25)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,RDI,1, 0), ZMM( 9)) + VMOVUPD(MEM(R15,RDI,1, 64), ZMM(17)) + VMOVUPD(MEM(R15,RDI,1,128), ZMM(25)) + SUB(RSI, IMM(1)) JZ(PACK30_T_ALMOST_DONE) - VMOVUPD(MEM(RBP,RDI,2, 0), ZMM(10)) - VMOVUPD(MEM(RBP,RDI,2, 64), ZMM(18)) - VMOVUPD(MEM(RBP,RDI,2,128), ZMM(26)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,RDI,2, 0), ZMM(10)) + VMOVUPD(MEM(R15,RDI,2, 64), ZMM(18)) + VMOVUPD(MEM(R15,RDI,2,128), ZMM(26)) + SUB(RSI, IMM(1)) JZ(PACK30_T_ALMOST_DONE) - VMOVUPD(MEM(RBP,R11,1, 0), ZMM(11)) - VMOVUPD(MEM(RBP,R11,1, 64), ZMM(19)) - VMOVUPD(MEM(RBP,R11,1,128), ZMM(27)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,R11,1, 0), ZMM(11)) + VMOVUPD(MEM(R15,R11,1, 64), ZMM(19)) + VMOVUPD(MEM(R15,R11,1,128), ZMM(27)) + SUB(RSI, IMM(1)) JZ(PACK30_T_ALMOST_DONE) - VMOVUPD(MEM(RBP,RDI,4, 0), ZMM(12)) - VMOVUPD(MEM(RBP,RDI,4, 64), ZMM(20)) - VMOVUPD(MEM(RBP,RDI,4,128), ZMM(28)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,RDI,4, 0), ZMM(12)) + VMOVUPD(MEM(R15,RDI,4, 64), ZMM(20)) + VMOVUPD(MEM(R15,RDI,4,128), ZMM(28)) + SUB(RSI, IMM(1)) JZ(PACK30_T_ALMOST_DONE) - VMOVUPD(MEM(RBP,R12,1, 0), ZMM(13)) - VMOVUPD(MEM(RBP,R12,1, 64), ZMM(21)) - VMOVUPD(MEM(RBP,R12,1,128), ZMM(29)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,R12,1, 0), ZMM(13)) + VMOVUPD(MEM(R15,R12,1, 64), ZMM(21)) + VMOVUPD(MEM(R15,R12,1,128), ZMM(29)) + SUB(RSI, IMM(1)) JZ(PACK30_T_ALMOST_DONE) - VMOVUPD(MEM(RBP,R11,2, 0), ZMM(14)) - VMOVUPD(MEM(RBP,R11,2, 64), ZMM(22)) - VMOVUPD(MEM(RBP,R11,2,128), ZMM(30)) + VMOVUPD(MEM(R15,R11,2, 0), ZMM(14)) + VMOVUPD(MEM(R15,R11,2, 64), ZMM(22)) + VMOVUPD(MEM(R15,R11,2,128), ZMM(30)) LABEL(PACK30_T_ALMOST_DONE) - MOV(RDX, RSI) + MOV(RSI, VAR(n)) + AND(RSI, IMM(7)) VBROADCASTSD(ZMM(31), VAR(kappa)) LEA(RAX, MEM(RCX,RBX,8)) @@ -334,25 +334,25 @@ void bli_dpackm_30xk_opt TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15) - VMOVUPD(MEM(RBP, 192), ZMM( 8)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15, 192), ZMM( 8)) + SUB(RSI, IMM(1)) JZ(PACK30_DONE) - VMOVUPD(MEM(RBP,RDI,1,192), ZMM( 9)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,RDI,1,192), ZMM( 9)) + SUB(RSI, IMM(1)) JZ(PACK30_DONE) - VMOVUPD(MEM(RBP,RDI,2,192), ZMM(10)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,RDI,2,192), ZMM(10)) + SUB(RSI, IMM(1)) JZ(PACK30_DONE) - VMOVUPD(MEM(RBP,R11,1,192), ZMM(11)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,R11,1,192), ZMM(11)) + SUB(RSI, IMM(1)) JZ(PACK30_DONE) - VMOVUPD(MEM(RBP,RDI,4,192), ZMM(12)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,RDI,4,192), ZMM(12)) + SUB(RSI, IMM(1)) JZ(PACK30_DONE) - VMOVUPD(MEM(RBP,R12,1,192), ZMM(13)) - SUB(RDX, IMM(1)) + VMOVUPD(MEM(R15,R12,1,192), ZMM(13)) + SUB(RSI, IMM(1)) JZ(PACK30_DONE) - VMOVUPD(MEM(RBP,R11,2,192), ZMM(14)) + VMOVUPD(MEM(R15,R11,2,192), ZMM(14)) JMP(PACK30_DONE) @@ -379,13 +379,13 @@ void bli_dpackm_30xk_opt VMULPD(ZMM(5), ZMM(5), ZMM(31)) VMULPD(ZMM(6), ZMM(6), ZMM(31)) VMULPD(ZMM(7), ZMM(7), ZMM(31)) - VMOVUPD(MEM(RBP, 0), ZMM(4)) - VMOVUPD(MEM(RBP, 64), ZMM(5)) - VMOVUPD(MEM(RBP,128), ZMM(6)) - VMOVUPD(MEM(RBP,192), ZMM(7)) + VMOVUPD(MEM(R15, 0), ZMM(4)) + VMOVUPD(MEM(R15, 64), ZMM(5)) + VMOVUPD(MEM(R15,128), ZMM(6)) + VMOVUPD(MEM(R15,192), ZMM(7)) LEA(RAX, MEM(RAX,RCX,1)) - LEA(RBP, MEM(RBP,RDI,1)) + LEA(R15, MEM(R15,RDI,1)) SUB(RSI, IMM(1)) @@ -410,7 +410,7 @@ void bli_dpackm_30xk_opt "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", - "rax", "rbx", "rcx", "rdx", "rbp", "rdi", "rsi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" + "rax", "rbx", "rcx", "rdi", "rsi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory" ); }