diff --git a/config/knl/bli_kernel.h b/config/knl/bli_kernel.h
index cd4dae887..6f368ec2d 100644
--- a/config/knl/bli_kernel.h
+++ b/config/knl/bli_kernel.h
@@ -104,6 +104,9 @@
 #define BLIS_PACKDIM_MR_D 24
 #define BLIS_PACKDIM_NR_D 8
 
+#define BLIS_DPACKM_8XK_KERNEL  bli_dpackm_8xk_opt
+#define BLIS_DPACKM_24XK_KERNEL bli_dpackm_24xk_opt
+
 #endif
 
 /*
diff --git a/frame/include/bli_kernel_prototypes.h b/frame/include/bli_kernel_prototypes.h
index e693825ff..d05505305 100644
--- a/frame/include/bli_kernel_prototypes.h
+++ b/frame/include/bli_kernel_prototypes.h
@@ -117,6 +117,11 @@
 #define bli_cpackm_16xk_ker_name BLIS_CPACKM_16XK_KERNEL
 #define bli_zpackm_16xk_ker_name BLIS_ZPACKM_16XK_KERNEL
 
+#define bli_spackm_24xk_ker_name BLIS_SPACKM_24XK_KERNEL
+#define bli_dpackm_24xk_ker_name BLIS_DPACKM_24XK_KERNEL
+#define bli_cpackm_24xk_ker_name BLIS_CPACKM_24XK_KERNEL
+#define bli_zpackm_24xk_ker_name BLIS_ZPACKM_24XK_KERNEL
+
 #define bli_spackm_30xk_ker_name BLIS_SPACKM_30XK_KERNEL
 #define bli_dpackm_30xk_ker_name BLIS_DPACKM_30XK_KERNEL
 #define bli_cpackm_30xk_ker_name BLIS_CPACKM_30XK_KERNEL
diff --git a/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c b/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c
new file mode 100644
index 000000000..e78b2a721
--- /dev/null
+++ b/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c
@@ -0,0 +1,526 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "../3/bli_avx512_macros.h"
+#include "blis.h"
+
+#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
+                   z0,z1,z2,z3,z4,z5,z6,z7) \
+\
+    VMULPD(ZMM(z0), ZMM(31), MEM(a,     o)) \
+    VMULPD(ZMM(z1), ZMM(31), MEM(a,s1,1,o)) \
+    VMULPD(ZMM(z2), ZMM(31), MEM(a,s1,2,o)) \
+    VMULPD(ZMM(z3), ZMM(31), MEM(a,s3,1,o)) \
+    VMULPD(ZMM(z4), ZMM(31), MEM(a,s1,4,o)) \
+    VMULPD(ZMM(z5), ZMM(31), MEM(a,s5,1,o)) \
+    VMULPD(ZMM(z6), ZMM(31), MEM(a,s3,2,o)) \
+    VMULPD(ZMM(z7), ZMM(31), MEM(a,s7,1,o))
+
+#define LOADMUL8x8_MASK(a,o,s1,s3,s5,s7, \
+                        z0,z1,z2,z3,z4,z5,z6,z7,k) \
+\
+    VMULPD(ZMM(z0) MASK_KZ(k), ZMM(31), MEM(a,     o)) \
+    VMULPD(ZMM(z1) MASK_KZ(k), ZMM(31), MEM(a,s1,1,o)) \
+    VMULPD(ZMM(z2) MASK_KZ(k), ZMM(31), MEM(a,s1,2,o)) \
+    VMULPD(ZMM(z3) MASK_KZ(k), ZMM(31), MEM(a,s3,1,o)) \
+    VMULPD(ZMM(z4) MASK_KZ(k), ZMM(31), MEM(a,s1,4,o)) \
+    VMULPD(ZMM(z5) MASK_KZ(k), ZMM(31), MEM(a,s5,1,o)) \
+    VMULPD(ZMM(z6) MASK_KZ(k), ZMM(31), MEM(a,s3,2,o)) \
+    VMULPD(ZMM(z7) MASK_KZ(k), ZMM(31), MEM(a,s7,1,o))
+
+#define STORE8x8(a,o,s1,s3,s5,s7, \
+                 z0,z1,z2,z3,z4,z5,z6,z7) \
+\
+    VMOVUPD(MEM(a,     o), ZMM(z0)) \
+    VMOVUPD(MEM(a,s1,1,o), ZMM(z1)) \
+    VMOVUPD(MEM(a,s1,2,o), ZMM(z2)) \
+    VMOVUPD(MEM(a,s3,1,o), ZMM(z3)) \
+    VMOVUPD(MEM(a,s1,4,o), ZMM(z4)) \
+    VMOVUPD(MEM(a,s5,1,o), ZMM(z5)) \
+    VMOVUPD(MEM(a,s3,2,o), ZMM(z6)) \
+    VMOVUPD(MEM(a,s7,1,o), ZMM(z7))
+
+#define TRANSPOSE8x8(a0,a1,a2,a3,a4,a5,a6,a7, \
+                     b0,b1,b2,b3,b4,b5,b6,b7) \
+\
+    VUNPCKLPD(ZMM(b0), ZMM(a0), ZMM(a1)) \
+    VUNPCKHPD(ZMM(b1), ZMM(a0), ZMM(a1)) \
+    VUNPCKLPD(ZMM(b2), ZMM(a2), ZMM(a3)) \
+    VUNPCKHPD(ZMM(b3), ZMM(a2), ZMM(a3)) \
+    VUNPCKLPD(ZMM(b4), ZMM(a4), ZMM(a5)) \
+    VUNPCKHPD(ZMM(b5), ZMM(a4), ZMM(a5)) \
+    VUNPCKLPD(ZMM(b6), ZMM(a6), ZMM(a7)) \
+    VUNPCKHPD(ZMM(b7), ZMM(a6), ZMM(a7)) \
+    VSHUFF64X2(ZMM(a0), ZMM(b0), ZMM(b2), IMM(0x44)) \
+    VSHUFF64X2(ZMM(a1), ZMM(b1), ZMM(b3), IMM(0x44)) \
+    VSHUFF64X2(ZMM(a2), ZMM(b0), ZMM(b2), IMM(0xEE)) \
+    VSHUFF64X2(ZMM(a3), ZMM(b1), ZMM(b3), IMM(0xEE)) \
+    VSHUFF64X2(ZMM(a4), ZMM(b4), ZMM(b6), IMM(0x44)) \
+    VSHUFF64X2(ZMM(a5), ZMM(b5), ZMM(b7), IMM(0x44)) \
+    VSHUFF64X2(ZMM(a6), ZMM(b4), ZMM(b6), IMM(0xEE)) \
+    VSHUFF64X2(ZMM(a7), ZMM(b5), ZMM(b7), IMM(0xEE)) \
+    VSHUFF64X2(ZMM(b0), ZMM(a0), ZMM(a4), IMM(0x88)) \
+    VSHUFF64X2(ZMM(b1), ZMM(a1), ZMM(a5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(b2), ZMM(a0), ZMM(a4), IMM(0xDD)) \
+    VSHUFF64X2(ZMM(b3), ZMM(a1), ZMM(a5), IMM(0xDD)) \
+    VSHUFF64X2(ZMM(b4), ZMM(a2), ZMM(a6), IMM(0x88)) \
+    VSHUFF64X2(ZMM(b5), ZMM(a3), ZMM(a7), IMM(0x88)) \
+    VSHUFF64X2(ZMM(b6), ZMM(a2), ZMM(a6), IMM(0xDD)) \
+    VSHUFF64X2(ZMM(b7), ZMM(a3), ZMM(a7), IMM(0xDD))
+
+// This array is used to build the index vectors for the scatter/gather
+// instructions.
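+// In the PACK*_G (general-stride) paths below, these offsets are multiplied
+// by inca with VPMULLD to form the dword index vectors consumed by VGATHERDPD.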
+extern int32_t offsets[24];
+
+void bli_dpackm_8xk_opt
+     (
+       conj_t         conja,
+       dim_t          n,
+       void* restrict kappa_,
+       void* restrict a_, inc_t inca, inc_t lda,
+       void* restrict p_, inc_t ldp
+     )
+{
+    const int32_t * offsetPtr = &offsets[0];
+    double* a = (double*)a_;
+    double* p = (double*)p_;
+    double* kappa = (double*)kappa_;
+
+    __asm__ volatile
+    (
+        MOV(RSI, VAR(n))
+        MOV(RAX, VAR(a))
+        MOV(RBX, VAR(inca))
+        MOV(RCX, VAR(lda))
+        MOV(RBP, VAR(p))
+        MOV(RDI, VAR(ldp))
+
+        TEST(RSI, RSI)
+        JZ(PACK8_DONE)
+
+        LEA(RBX, MEM(,RBX,8))    //inca in bytes
+        LEA(RCX, MEM(,RCX,8))    //lda in bytes
+        LEA(RDI, MEM(,RDI,8))    //ldp in bytes
+        LEA(R11, MEM(RDI,RDI,2)) //ldp*3
+        LEA(R12, MEM(RDI,RDI,4)) //ldp*5
+        LEA(R13, MEM(R11,RDI,4)) //ldp*7
+
+        VBROADCASTSD(ZMM(31), VAR(kappa))
+
+        CMP(RBX, IMM(8))
+        JNE(PACK8_T)
+
+        LABEL(PACK8_N)
+
+        MOV(RDX, RSI)
+        AND(RDX, IMM(7))
+        SAR(RSI, IMM(3))
+        JZ(PACK8_N_TAIL)
+
+        LEA(R8,  MEM(RCX,RCX,2)) //lda*3
+        LEA(R9,  MEM(RCX,RCX,4)) //lda*5
+        LEA(R10, MEM(R8 ,RCX,4)) //lda*7
+
+        LABEL(PACK8_N_LOOP)
+
+        LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
+        STORE8x8(RBP,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7)
+
+        LEA(RAX, MEM(RAX,RCX,8))
+        LEA(RBP, MEM(RBP,RDI,8))
+
+        SUB(RSI, IMM(1))
+
+        JNZ(PACK8_N_LOOP)
+
+        TEST(RDX, RDX)
+        JZ(PACK8_DONE)
+
+        LABEL(PACK8_N_TAIL)
+
+        VMULPD(ZMM(0), ZMM(31), MEM(RAX))
+        VMOVUPD(MEM(RBP), ZMM(0))
+
+        LEA(RAX, MEM(RAX,RCX,1))
+        LEA(RBP, MEM(RBP,RDI,1))
+
+        SUB(RDX, IMM(1))
+
+        JNZ(PACK8_N_TAIL)
+
+        JMP(PACK8_DONE)
+
+        LABEL(PACK8_T)
+
+        CMP(RCX, IMM(8))
+        JNE(PACK8_G)
+
+        LEA(R8,  MEM(RBX,RBX,2)) //inca*3
+        LEA(R9,  MEM(RBX,RBX,4)) //inca*5
+        LEA(R10, MEM(R8 ,RBX,4)) //inca*7
+
+        MOV(RDX, RSI)
+        AND(RDX, IMM(7))
+        SAR(RSI, IMM(3))
+        JZ(PACK8_T_TAIL)
+
+        LABEL(PACK8_T_LOOP)
+
+        LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
+        TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
+                     16,17,18,19,20,21,22,23)
+        STORE8x8(RBP,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
+
+        LEA(RAX, MEM(RAX,RCX,8))
+        LEA(RBP, MEM(RBP,RDI,8))
+
+        SUB(RSI, IMM(1))
+
+        JNZ(PACK8_T_LOOP)
+
+        TEST(RDX, RDX)
+        JZ(PACK8_DONE)
+
+        LABEL(PACK8_T_TAIL)
+
+        MOV(RSI, IMM(1))
+        SHLX(RSI, RSI, RDX)
+        SUB(RSI, IMM(1))
+        KMOV(K(1), ESI) //mask for n%8 elements
+
+        LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7,1)
+        TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
+                      8, 9,10,11,12,13,14,15)
+
+        VMOVUPD(MEM(RBP      ), ZMM( 8))
+        SUB(RDX, IMM(1))
+        JZ(PACK8_DONE)
+        VMOVUPD(MEM(RBP,RDI,1), ZMM( 9))
+        SUB(RDX, IMM(1))
+        JZ(PACK8_DONE)
+        VMOVUPD(MEM(RBP,RDI,2), ZMM(10))
+        SUB(RDX, IMM(1))
+        JZ(PACK8_DONE)
+        VMOVUPD(MEM(RBP,R11,1), ZMM(11))
+        SUB(RDX, IMM(1))
+        JZ(PACK8_DONE)
+        VMOVUPD(MEM(RBP,RDI,4), ZMM(12))
+        SUB(RDX, IMM(1))
+        JZ(PACK8_DONE)
+        VMOVUPD(MEM(RBP,R12,1), ZMM(13))
+        SUB(RDX, IMM(1))
+        JZ(PACK8_DONE)
+        VMOVUPD(MEM(RBP,R11,2), ZMM(14))
+
+        JMP(PACK8_DONE)
+
+        LABEL(PACK8_G)
+
+        VPBROADCASTD(ZMM(3), VAR(inca))
+        MOV(RBX, VAR(offsetPtr))
+        VPMULLD(YMM(0), YMM(3), MEM(RBX))
+
+        LABEL(PACK8_G_LOOP)
+
+        KXNORW(K(1), K(0), K(0))
+        VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8))
+        VMULPD(ZMM(3), ZMM(3), ZMM(31))
+        VMOVUPD(MEM(RBP), ZMM(3))
+
+        LEA(RAX, MEM(RAX,RCX,1))
+        LEA(RBP, MEM(RBP,RDI,1))
+
+        SUB(RSI, IMM(1))
+
+        JNZ(PACK8_G_LOOP)
+
+        LABEL(PACK8_DONE)
+
+        : //output operands
+        : //input operands
+          [n]         "m" (n),
+          [kappa]     "m" (*kappa),
+          [a]         "m" (a),
+          [inca]      "m" (inca),
+          [lda]       "m" (lda),
+          [p]         "m" (p),
+          [ldp]       "m" (ldp),
+          [offsetPtr] "m" (offsetPtr)
+        : //clobbers
+          "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
+          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
+          "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
+          "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
+          "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
+          "zmm30", "zmm31",
+          "rax", "rbx", "rcx", "rdx", "rbp", "rdi", "rsi",
+          "r8", "r9", "r10", "r11", "r12", "r13", "memory"
+    );
+}
+
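+// bli_dpackm_24xk_opt handles the same three cases as bli_dpackm_8xk_opt
+// above (unit inca, unit lda, and general stride), but packs 24 rows per
+// column of p, processed as three 8x8 blocks at byte offsets 0, 64, and 128.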
"zmm17", + "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", + "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", + "zmm30", "zmm31", + "rax", "rbx", "rcx", "rdx", "rbp", "rdi", "rsi", + "r8", "r9", "r10", "r11", "r12", "r13", "memory" + ); +} + +void bli_dpackm_24xk_opt + ( + conj_t conja, + dim_t n, + void* restrict kappa_, + void* restrict a_, inc_t inca, inc_t lda, + void* restrict p_, inc_t ldp + ) +{ + const int32_t * offsetPtr = &offsets[0]; + double* a = (double*)a_; + double* p = (double*)p_; + double* kappa = (double*)kappa_; + + __asm__ volatile + ( + MOV(RSI, VAR(n)) + MOV(RAX, VAR(a)) + MOV(RBX, VAR(inca)) + MOV(RCX, VAR(lda)) + MOV(RBP, VAR(p)) + MOV(RDI, VAR(ldp)) + + LEA(RBX, MEM(,RBX,8)) //inca in bytes + LEA(RCX, MEM(,RCX,8)) //lda in bytes + LEA(RDI, MEM(,RDI,8)) //ldp in bytes + LEA(R11, MEM(RDI,RDI,2)) //ldp*3 + LEA(R12, MEM(RDI,RDI,4)) //ldp*5 + LEA(R13, MEM(R11,RDI,4)) //ldp*7 + + VBROADCASTSD(ZMM(31), VAR(kappa)) + + TEST(RSI, RSI) + JZ(PACK24_DONE) + + CMP(RBX, IMM(8)) + JNE(PACK24_T) + + LABEL(PACK24_N) + + MOV(RDX, RSI) + AND(RDX, IMM(7)) + SAR(RSI, IMM(3)) + JZ(PACK24_N_TAIL) + + LEA(R8, MEM(RCX,RCX,2)) //lda*3 + LEA(R9, MEM(RCX,RCX,4)) //lda*5 + LEA(R10, MEM(R8 ,RCX,4)) //lda*7 + + LABEL(PACK24_N_LOOP) + + LOADMUL8x8(RAX, 0,RCX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7) + LOADMUL8x8(RAX, 64,RCX,R8,R9,R10, 8, 9,10,11,12,13,14,15) + LOADMUL8x8(RAX,128,RCX,R8,R9,R10,16,17,18,19,20,21,22,23) + STORE8x8(RBP, 0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) + STORE8x8(RBP, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15) + STORE8x8(RBP,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + + LEA(RAX, MEM(RAX,RCX,8)) + LEA(RBP, MEM(RBP,RDI,8)) + + SUB(RSI, IMM(1)) + + JNZ(PACK24_N_LOOP) + + TEST(RDX, RDX) + JZ(PACK24_DONE) + + LABEL(PACK24_N_TAIL) + + VMULPD(ZMM(0), ZMM(31), MEM(RAX, 0)) + VMULPD(ZMM(1), ZMM(31), MEM(RAX, 64)) + VMULPD(ZMM(2), ZMM(31), MEM(RAX,128)) + VMOVUPD(MEM(RBP, 0), ZMM(0)) + VMOVUPD(MEM(RBP, 64), ZMM(1)) + VMOVUPD(MEM(RBP,128), ZMM(2)) + + LEA(RAX, MEM(RAX,RCX,1)) + LEA(RBP, MEM(RBP,RDI,1)) + + SUB(RDX, IMM(1)) + + JNZ(PACK24_N_TAIL) + + JMP(PACK24_DONE) + + LABEL(PACK24_T) + + CMP(RCX, IMM(8)) + JNE(PACK24_G) + + LEA(R8, MEM(RBX,RBX,2)) //inca*3 + LEA(R9, MEM(RBX,RBX,4)) //inca*5 + LEA(R10, MEM(R8 ,RBX,4)) //inca*7 + + LEA(R14, MEM(RAX,RBX,8)) + LEA(RCX, MEM(R14,RBX,8)) + + MOV(RDX, RSI) + AND(RDX, IMM(7)) + SAR(RSI, IMM(3)) + JZ(PACK24_T_TAIL) + + LABEL(PACK24_T_LOOP) + + LOADMUL8x8(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7) + LOADMUL8x8(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15) + TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, + 16,17,18,19,20,21,22,23) + STORE8x8(RBP, 0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + LOADMUL8x8(RCX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7) + TRANSPOSE8x8( 8, 9,10,11,12,13,14,15, + 16,17,18,19,20,21,22,23) + STORE8x8(RBP, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, + 16,17,18,19,20,21,22,23) + STORE8x8(RBP,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) + + LEA(RAX, MEM(RAX,64)) + LEA(R14, MEM(R14,64)) + LEA(RCX, MEM(RCX,64)) + LEA(RBP, MEM(RBP,RDI,8)) + + SUB(RSI, IMM(1)) + + JNZ(PACK24_T_LOOP) + + TEST(RDX, RDX) + JZ(PACK24_DONE) + + LABEL(PACK24_T_TAIL) + + MOV(RSI, IMM(1)) + SHLX(RSI, RSI, RDX) + SUB(RSI, IMM(1)) + KMOV(K(1), ESI) //mask for n%8 elements + + LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1) + LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1) + LOADMUL8x8_MASK(RCX,0,RBX,R8,R9,R10,16,17,18,19,20,21,22,23,1) + TRANSPOSE8x8(16,17,18,19,20,21,22,23, + 
+        TRANSPOSE8x8(16,17,18,19,20,21,22,23,
+                     24,25,26,27,28,29,30,31)
+        TRANSPOSE8x8( 8, 9,10,11,12,13,14,15,
+                     16,17,18,19,20,21,22,23)
+        TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
+                      8, 9,10,11,12,13,14,15)
+
+        VMOVUPD(MEM(RBP,        0), ZMM( 8))
+        VMOVUPD(MEM(RBP,       64), ZMM(16))
+        VMOVUPD(MEM(RBP,      128), ZMM(24))
+        SUB(RDX, IMM(1))
+        JZ(PACK24_DONE)
+        VMOVUPD(MEM(RBP,RDI,1,  0), ZMM( 9))
+        VMOVUPD(MEM(RBP,RDI,1, 64), ZMM(17))
+        VMOVUPD(MEM(RBP,RDI,1,128), ZMM(25))
+        SUB(RDX, IMM(1))
+        JZ(PACK24_DONE)
+        VMOVUPD(MEM(RBP,RDI,2,  0), ZMM(10))
+        VMOVUPD(MEM(RBP,RDI,2, 64), ZMM(18))
+        VMOVUPD(MEM(RBP,RDI,2,128), ZMM(26))
+        SUB(RDX, IMM(1))
+        JZ(PACK24_DONE)
+        VMOVUPD(MEM(RBP,R11,1,  0), ZMM(11))
+        VMOVUPD(MEM(RBP,R11,1, 64), ZMM(19))
+        VMOVUPD(MEM(RBP,R11,1,128), ZMM(27))
+        SUB(RDX, IMM(1))
+        JZ(PACK24_DONE)
+        VMOVUPD(MEM(RBP,RDI,4,  0), ZMM(12))
+        VMOVUPD(MEM(RBP,RDI,4, 64), ZMM(20))
+        VMOVUPD(MEM(RBP,RDI,4,128), ZMM(28))
+        SUB(RDX, IMM(1))
+        JZ(PACK24_DONE)
+        VMOVUPD(MEM(RBP,R12,1,  0), ZMM(13))
+        VMOVUPD(MEM(RBP,R12,1, 64), ZMM(21))
+        VMOVUPD(MEM(RBP,R12,1,128), ZMM(29))
+        SUB(RDX, IMM(1))
+        JZ(PACK24_DONE)
+        VMOVUPD(MEM(RBP,R11,2,  0), ZMM(14))
+        VMOVUPD(MEM(RBP,R11,2, 64), ZMM(22))
+        VMOVUPD(MEM(RBP,R11,2,128), ZMM(30))
+
+        JMP(PACK24_DONE)
+
+        LABEL(PACK24_G)
+
+        VPBROADCASTD(ZMM(3), VAR(inca))
+        MOV(RBX, VAR(offsetPtr))
+        VPMULLD(YMM(0), YMM(3), MEM(RBX, 0))
+        VPMULLD(YMM(1), YMM(3), MEM(RBX,32))
+        VPMULLD(YMM(2), YMM(3), MEM(RBX,64))
+
+        LABEL(PACK24_G_LOOP)
+
+        KXNORW(K(1), K(0), K(0))
+        KXNORW(K(2), K(0), K(0))
+        KXNORW(K(3), K(0), K(0))
+        VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8))
+        VGATHERDPD(ZMM(4) MASK_K(2), MEM(RAX,YMM(1),8))
+        VGATHERDPD(ZMM(5) MASK_K(3), MEM(RAX,YMM(2),8))
+        VMULPD(ZMM(3), ZMM(3), ZMM(31))
+        VMULPD(ZMM(4), ZMM(4), ZMM(31))
+        VMULPD(ZMM(5), ZMM(5), ZMM(31))
+        VMOVUPD(MEM(RBP,  0), ZMM(3))
+        VMOVUPD(MEM(RBP, 64), ZMM(4))
+        VMOVUPD(MEM(RBP,128), ZMM(5))
+
+        LEA(RAX, MEM(RAX,RCX,1))
+        LEA(RBP, MEM(RBP,RDI,1))
+
+        SUB(RSI, IMM(1))
+
+        JNZ(PACK24_G_LOOP)
+
+        LABEL(PACK24_DONE)
+
+        : //output operands
+        : //input operands
+          [n]         "m" (n),
+          [kappa]     "m" (*kappa),
+          [a]         "m" (a),
+          [inca]      "m" (inca),
+          [lda]       "m" (lda),
+          [p]         "m" (p),
+          [ldp]       "m" (ldp),
+          [offsetPtr] "m" (offsetPtr)
+        : //clobbers
+          "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
+          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
+          "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
+          "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
+          "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
+          "zmm30", "zmm31",
+          "rax", "rbx", "rcx", "rdx", "rbp", "rdi", "rsi",
+          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
+    );
+}
diff --git a/kernels/x86_64/knl/3/bli_avx512_macros.h b/kernels/x86_64/knl/3/bli_avx512_macros.h
index 6471c6cc6..7578b2502 100644
--- a/kernels/x86_64/knl/3/bli_avx512_macros.h
+++ b/kernels/x86_64/knl/3/bli_avx512_macros.h
@@ -19,12 +19,14 @@
 #define EBX %%ebx
 #define ECX %%ecx
 #define EDX %%edx
+#define EBP %%ebp
 #define EDI %%edi
 #define ESI %%esi
 #define RAX %%rax
 #define RBX %%rbx
 #define RCX %%rcx
 #define RDX %%rdx
+#define RBP %%rbp
 #define RDI %%rdi
 #define RSI %%rsi
 #define K(x) %%k##x
@@ -91,6 +93,7 @@
 #define ADD(_0, _1) ASM(add _1, _0)
 #define SUB(_0, _1) ASM(sub _1, _0)
 #define SAL(_0, _1) ASM(sal _1, _0)
+#define SHLX(_0, _1, _2) ASM(shlx _2, _1, _0)
 #define SAR(_0, _1) ASM(sar _1, _0)
 #define SAL1(_0) ASM(sal _0)
 #define SAR1(_0) ASM(sar _0)
@@ -132,6 +135,9 @@
 #define VPBROADCASTQ(_0, _1) ASM(vpbroadcastq _1, _0)
 #define VBROADCASTF64X4(_0, _1) ASM(vbroadcastf64x4 _1, _0)
 #define VINSERTF64X4(_0, _1, _2, _3) ASM(vinsertf64x4 _3, _2, _1, _0)
+#define VUNPCKLPD(_0, _1, _2) ASM(vunpcklpd _2, _1, _0)
+#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0)
+#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0)
 #define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS)
 #define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS)
 #define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS)
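
For reference, the in-register 8x8 transpose implemented by the TRANSPOSE8x8
macro corresponds to the AVX-512 intrinsics sketch below. This is an
illustrative equivalent only, not part of the patch, and the helper name
transpose8x8_pd is invented for the example. It mirrors the macro exactly:
one round of 64-bit unpacks, then two rounds of 128-bit-lane shuffles using
the same immediates (0x44/0xEE, then 0x88/0xDD).

    #include <immintrin.h>

    // On entry r[i] holds row i of an 8x8 block of doubles;
    // on exit r[i] holds column i. Compile with -mavx512f.
    static inline void transpose8x8_pd( __m512d r[8] )
    {
        __m512d t[8], u[8];

        // Interleave even/odd elements of adjacent row pairs (VUNPCK*PD).
        t[0] = _mm512_unpacklo_pd( r[0], r[1] );
        t[1] = _mm512_unpackhi_pd( r[0], r[1] );
        t[2] = _mm512_unpacklo_pd( r[2], r[3] );
        t[3] = _mm512_unpackhi_pd( r[2], r[3] );
        t[4] = _mm512_unpacklo_pd( r[4], r[5] );
        t[5] = _mm512_unpackhi_pd( r[4], r[5] );
        t[6] = _mm512_unpacklo_pd( r[6], r[7] );
        t[7] = _mm512_unpackhi_pd( r[6], r[7] );

        // Combine 128-bit lanes within each group of four rows (VSHUFF64X2).
        u[0] = _mm512_shuffle_f64x2( t[0], t[2], 0x44 );
        u[1] = _mm512_shuffle_f64x2( t[1], t[3], 0x44 );
        u[2] = _mm512_shuffle_f64x2( t[0], t[2], 0xEE );
        u[3] = _mm512_shuffle_f64x2( t[1], t[3], 0xEE );
        u[4] = _mm512_shuffle_f64x2( t[4], t[6], 0x44 );
        u[5] = _mm512_shuffle_f64x2( t[5], t[7], 0x44 );
        u[6] = _mm512_shuffle_f64x2( t[4], t[6], 0xEE );
        u[7] = _mm512_shuffle_f64x2( t[5], t[7], 0xEE );

        // Combine 256-bit halves across the two groups (VSHUFF64X2 again).
        r[0] = _mm512_shuffle_f64x2( u[0], u[4], 0x88 );
        r[1] = _mm512_shuffle_f64x2( u[1], u[5], 0x88 );
        r[2] = _mm512_shuffle_f64x2( u[0], u[4], 0xDD );
        r[3] = _mm512_shuffle_f64x2( u[1], u[5], 0xDD );
        r[4] = _mm512_shuffle_f64x2( u[2], u[6], 0x88 );
        r[5] = _mm512_shuffle_f64x2( u[3], u[7], 0x88 );
        r[6] = _mm512_shuffle_f64x2( u[2], u[6], 0xDD );
        r[7] = _mm512_shuffle_f64x2( u[3], u[7], 0xDD );
    }

In the assembly kernels the final round writes the b registers (e.g. ZMM16-23
in PACK8_T_LOOP) so that the transposed columns can be stored directly with
STORE8x8.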