Convert x86 microkernels to assembly macros.

This commit is contained in:
Devin Matthews
2018-06-20 14:07:24 -05:00
parent 42ea02a34e
commit b4d94e54d4
20 changed files with 18702 additions and 17651 deletions

View File

@@ -1,173 +0,0 @@
#ifndef BLIS_AVX512_MACROS_H
#define BLIS_AVX512_MACROS_H
//
// Assembly macros to make AVX-512 with AT&T syntax somewhat less painful
//
// Each instruction macro takes its operands in Intel (destination-first)
// order and emits an AT&T-syntax string fragment suitable for use inside
// a GCC extended-asm statement. ASM(...) stringifies its arguments and
// appends "\n\t" so consecutive macros concatenate into one asm template.
//
#define COMMENT_BEGIN "#"
#define COMMENT_END
#define STRINGIFY(...) #__VA_ARGS__
#define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t"
#define LABEL(label) STRINGIFY(label) ":\n\t"
//
// Register name helpers. The doubled %% is required because these expand
// inside an extended-asm template, where a single % introduces an operand.
//
#define XMM(x) %%xmm##x
#define YMM(x) %%ymm##x
#define ZMM(x) %%zmm##x
#define EAX %%eax
#define EBX %%ebx
#define ECX %%ecx
#define EDX %%edx
#define EBP %%ebp
#define EDI %%edi
#define ESI %%esi
#define RAX %%rax
#define RBX %%rbx
#define RCX %%rcx
#define RDX %%rdx
#define RBP %%rbp
#define RDI %%rdi
#define RSI %%rsi
#define K(x) %%k##x
#define R(x) %%r##x
#define R8 %%r8
#define R9 %%r9
#define R10 %%r10
#define R11 %%r11
#define R12 %%r12
#define R13 %%r13
#define R14 %%r14
#define R15 %%r15
#define RD(x) %%r##x##d
#define R8D %%r8d
#define R9D %%r9d
#define R10D %%r10d
#define R11D %%r11d
#define R12D %%r12d
#define R13D %%r13d
#define R14D %%r14d
#define R15D %%r15d
#define IMM(x) $##x
#define VAR(x) %[x]
//
// Memory operand builders: MEM(...) dispatches on argument count to the
// AT&T forms disp(base), (base), disp(base,index,scale), (base,index,scale).
// The _1TO8/_1TO16 variants append an EVEX broadcast suffix ({1to8}/{1to16}).
//
#define MEM_4(reg,off,scale,disp) disp(reg,off,scale)
#define MEM_3(reg,off,scale) (reg,off,scale)
#define MEM_2(reg,disp) disp(reg)
#define MEM_1(reg) (reg)
#define MEM_1TO8_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to8%}
#define MEM_1TO8_3(reg,off,scale) MEM(reg,off,scale) %{1to8%}
#define MEM_1TO8_2(reg,disp) MEM(reg,disp) %{1to8%}
#define MEM_1TO8_1(reg) MEM(reg) %{1to8%}
#define MEM_1TO16_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to16%}
#define MEM_1TO16_3(reg,off,scale) MEM(reg,off,scale) %{1to16%}
#define MEM_1TO16_2(reg,disp) MEM(reg,disp) %{1to16%}
#define MEM_1TO16_1(reg) MEM(reg) %{1to16%}
#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME
#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1)(__VA_ARGS__)
#define MEM_1TO8(...) GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1)(__VA_ARGS__)
#define MEM_1TO16(...) GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1)(__VA_ARGS__)
//
// Opmask suffixes: MASK_K(n) -> {%kn} (merge-masking),
// MASK_KZ(n) -> {%kn}{z} (zero-masking).
//
#define MASK_K(n) %{%%k##n%}
#define MASK_KZ(n) %{%%k##n%}%{z%}
#define KMOV(to,from) ASM(kmovw from, to)
// Jump to label if any bit of kreg is set (kortestw sets ZF when all-zero).
#define JKNZD(kreg,label) \
    ASM(kortestw kreg, kreg) \
    ASM(jnz label)
#define KXNORW(_0, _1, _2) ASM(kxnorw _2, _1, _0)
#define KSHIFTRW(_0, _1, _2) ASM(kshiftrw _2, _1, _0)
#define ALIGN16 ASM(.p2align 4)
#define ALIGN32 ASM(.p2align 5)
// BUGFIX: mnemonic was misspelled "rdstc", which would not assemble.
#define RDTSC ASM(rdtsc)
//
// Scalar/integer instructions. Operands are destination-first; the macro
// reverses them into AT&T source-first order.
//
#define MOV(_0, _1) ASM(mov _1, _0)
#define MOVD(_0, _1) ASM(movd _1, _0)
#define MOVL(_0, _1) ASM(movl _1, _0)
#define MOVQ(_0, _1) ASM(movq _1, _0)
#define VMOVD(_0, _1) ASM(vmovd _1, _0)
#define VMOVQ(_0, _1) ASM(vmovq _1, _0)
#define CMP(_0, _1) ASM(cmp _1, _0)
#define AND(_0, _1) ASM(and _1, _0)
#define ADD(_0, _1) ASM(add _1, _0)
#define SUB(_0, _1) ASM(sub _1, _0)
#define SAL(_0, _1) ASM(sal _1, _0)
#define SHLX(_0, _1, _2) ASM(shlx _2, _1, _0)
#define SAR(_0, _1) ASM(sar _1, _0)
// One-operand shift forms (shift by 1).
#define SAL1(_0) ASM(sal _0)
#define SAR1(_0) ASM(sar _0)
#define LEA(_0, _1) ASM(lea _1, _0)
#define TEST(_0, _1) ASM(test _1, _0)
#define DEC(_0) ASM(dec _0)
#define JLE(_0) ASM(jle _0)
#define JL(_0) ASM(jl _0)
#define JNZ(_0) ASM(jnz _0)
#define JZ(_0) ASM(jz _0)
#define JNE(_0) ASM(jne _0)
#define JE(_0) ASM(je _0)
#define JNC(_0) ASM(jnc _0)
#define JC(_0) ASM(jc _0)
#define JMP(_0) ASM(jmp _0)
//
// Vector instructions (AVX/AVX-512), destination-first operand order.
//
#define VCOMISS(_0, _1) ASM(vcomiss _1, _0)
#define VCOMISD(_0, _1) ASM(vcomisd _1, _0)
#define VGATHERDPS(_0, _1) ASM(vgatherdps _1, _0)
#define VSCATTERDPS(_0, _1) ASM(vscatterdps _1, _0)
#define VGATHERDPD(_0, _1) ASM(vgatherdpd _1, _0)
#define VSCATTERDPD(_0, _1) ASM(vscatterdpd _1, _0)
#define VGATHERQPS(_0, _1) ASM(vgatherqps _1, _0)
#define VSCATTERQPS(_0, _1) ASM(vscatterqps _1, _0)
#define VGATHERQPD(_0, _1) ASM(vgatherqpd _1, _0)
#define VSCATTERQPD(_0, _1) ASM(vscatterqpd _1, _0)
#define VMULSS(_0, _1, _2) ASM(vmulss _2, _1, _0)
#define VMULSD(_0, _1, _2) ASM(vmulsd _2, _1, _0)
#define VMULPS(_0, _1, _2) ASM(vmulps _2, _1, _0)
#define VMULPD(_0, _1, _2) ASM(vmulpd _2, _1, _0)
#define VPMULLD(_0, _1, _2) ASM(vpmulld _2, _1, _0)
#define VPMULLQ(_0, _1, _2) ASM(vpmullq _2, _1, _0)
#define VPADDD(_0, _1, _2) ASM(vpaddd _2, _1, _0)
#define VPSLLD(_0, _1, _2) ASM(vpslld _2, _1, _0)
#define VPXORD(_0, _1, _2) ASM(vpxord _2, _1, _0)
#define VXORPD(_0, _1, _2) ASM(vxorpd _2, _1, _0)
#define VFMADD132PS(_0, _1, _2) ASM(vfmadd132ps _2, _1, _0)
#define VFMADD213PS(_0, _1, _2) ASM(vfmadd213ps _2, _1, _0)
#define VFMADD231PS(_0, _1, _2) ASM(vfmadd231ps _2, _1, _0)
#define VFMADD132PD(_0, _1, _2) ASM(vfmadd132pd _2, _1, _0)
#define VFMADD213PD(_0, _1, _2) ASM(vfmadd213pd _2, _1, _0)
#define VFMADD231PD(_0, _1, _2) ASM(vfmadd231pd _2, _1, _0)
#define VMOVDQA(_0, _1) ASM(vmovdqa _1, _0)
#define VMOVDQA32(_0, _1) ASM(vmovdqa32 _1, _0)
#define VMOVDQA64(_0, _1) ASM(vmovdqa64 _1, _0)
#define VMOVSS(_0, _1) ASM(vmovss _1, _0)
#define VMOVSD(_0, _1) ASM(vmovsd _1, _0)
#define VMOVAPS(_0, _1) ASM(vmovaps _1, _0)
#define VMOVUPS(_0, _1) ASM(vmovups _1, _0)
#define VMOVAPD(_0, _1) ASM(vmovapd _1, _0)
#define VMOVUPD(_0, _1) ASM(vmovupd _1, _0)
#define VBROADCASTSS(_0, _1) ASM(vbroadcastss _1, _0)
#define VBROADCASTSD(_0, _1) ASM(vbroadcastsd _1, _0)
#define VPBROADCASTD(_0, _1) ASM(vpbroadcastd _1, _0)
#define VPBROADCASTQ(_0, _1) ASM(vpbroadcastq _1, _0)
#define VBROADCASTF64X4(_0, _1) ASM(vbroadcastf64x4 _1, _0)
#define VINSERTF64X4(_0, _1, _2, _3) ASM(vinsertf64x4 _3, _2, _1, _0)
#define VEXTRACTF64X4(_0, _1, _2) ASM(vextractf64x4 _2, _1, _0)
#define VINSERTF128(_0, _1, _2) ASM(vinsertf128 _2, _1, _0)
#define VEXTRACTF128(_0, _1, _2) ASM(vextractf128 _2, _1, _0)
#define VUNPCKLPD(_0, _1, _2) ASM(vunpcklpd _2, _1, _0)
#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0)
#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0)
#define VUNPCKLPS(_0, _1, _2) ASM(vunpcklps _2, _1, _0)
#define VUNPCKHPS(_0, _1, _2) ASM(vunpckhps _2, _1, _0)
#define VSHUFPS(_0, _1, _2, _3) ASM(vshufps _3, _2, _1, _0)
#define VPERM2F128(_0, _1, _2, _3) ASM(vperm2f128 _3, _2, _1, _0)
//
// Prefetch: LEVEL is the cache-level suffix pasted into the mnemonic
// (e.g. PREFETCH(0, addr) -> prefetcht0).
//
#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS)
#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS)
#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS)
#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS)
#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS)
#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS)
#define VSCATTERPFDPD(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dpd ADDRESS)
#define VZEROUPPER() ASM(vzeroupper)
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -32,9 +32,11 @@
*/ */
#include "bli_avx512_macros.h"
#include "blis.h" #include "blis.h"
#define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \ #define LOADMUL8x8(a,o,s1,s3,s5,s7, \
z0,z1,z2,z3,z4,z5,z6,z7) \ z0,z1,z2,z3,z4,z5,z6,z7) \
\ \
@@ -125,156 +127,157 @@ void bli_dpackm_knl_asm_8xk
const int64_t lda = lda_; const int64_t lda = lda_;
const int64_t ldp = ldp_; const int64_t ldp = ldp_;
__asm__ volatile BEGIN_ASM
(
MOV(RSI, VAR(n))
MOV(RAX, VAR(a))
MOV(RBX, VAR(inca))
MOV(RCX, VAR(lda))
MOV(R14, VAR(p))
MOV(RDI, VAR(ldp))
TEST(RSI, RSI) MOV(RSI, VAR(n))
MOV(RAX, VAR(a))
MOV(RBX, VAR(inca))
MOV(RCX, VAR(lda))
MOV(R14, VAR(p))
MOV(RDI, VAR(ldp))
TEST(RSI, RSI)
JZ(PACK8_DONE)
LEA(RBX, MEM(,RBX,8)) //inca in bytes
LEA(RCX, MEM(,RCX,8)) //lda in bytes
LEA(RDI, MEM(,RDI,8)) //ldp in bytes
LEA(R11, MEM(RDI,RDI,2)) //ldp*3
LEA(R12, MEM(RDI,RDI,4)) //ldp*5
LEA(R13, MEM(R11,RDI,4)) //ldp*7
VBROADCASTSD(ZMM(31), VAR(kappa))
CMP(RBX, IMM(8))
JNE(PACK8_T)
LABEL(PACK8_N)
MOV(RDX, RSI)
AND(RDX, IMM(7))
SAR(RSI, IMM(3))
JZ(PACK8_N_TAIL)
LEA(R8, MEM(RCX,RCX,2)) //lda*3
LEA(R9, MEM(RCX,RCX,4)) //lda*5
LEA(R10, MEM(R8 ,RCX,4)) //lda*7
LABEL(PACK8_N_LOOP)
LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7)
LEA(RAX, MEM(RAX,RCX,8))
LEA(R14, MEM(R14,RDI,8))
SUB(RSI, IMM(1))
JNZ(PACK8_N_LOOP)
TEST(RDX, RDX)
JZ(PACK8_DONE) JZ(PACK8_DONE)
LEA(RBX, MEM(,RBX,8)) //inca in bytes LABEL(PACK8_N_TAIL)
LEA(RCX, MEM(,RCX,8)) //lda in bytes
LEA(RDI, MEM(,RDI,8)) //ldp in bytes
LEA(R11, MEM(RDI,RDI,2)) //ldp*3
LEA(R12, MEM(RDI,RDI,4)) //ldp*5
LEA(R13, MEM(R11,RDI,4)) //ldp*7
VBROADCASTSD(ZMM(31), VAR(kappa)) VMULPD(ZMM(0), ZMM(31), MEM(RAX))
VMOVUPD(MEM(R14), ZMM(0))
CMP(RBX, IMM(8)) LEA(RAX, MEM(RAX,RCX,1))
JNE(PACK8_T) LEA(R14, MEM(R14,RDI,1))
LABEL(PACK8_N) SUB(RDX, IMM(1))
MOV(RDX, RSI) JNZ(PACK8_N_TAIL)
AND(RDX, IMM(7))
SAR(RSI, IMM(3))
JZ(PACK8_N_TAIL)
LEA(R8, MEM(RCX,RCX,2)) //lda*3 JMP(PACK8_DONE)
LEA(R9, MEM(RCX,RCX,4)) //lda*5
LEA(R10, MEM(R8 ,RCX,4)) //lda*7
LABEL(PACK8_N_LOOP) LABEL(PACK8_T)
LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) CMP(RCX, IMM(8))
STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7) JNE(PACK8_G)
LEA(RAX, MEM(RAX,RCX,8)) LEA(R8, MEM(RBX,RBX,2)) //inca*3
LEA(R14, MEM(R14,RDI,8)) LEA(R9, MEM(RBX,RBX,4)) //inca*5
LEA(R10, MEM(R8 ,RBX,4)) //inca*7
SUB(RSI, IMM(1)) MOV(RDX, RSI)
AND(RDX, IMM(7))
SAR(RSI, IMM(3))
JZ(PACK8_T_TAIL)
JNZ(PACK8_N_LOOP) LABEL(PACK8_T_LOOP)
TEST(RDX, RDX) LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
JZ(PACK8_DONE)
LABEL(PACK8_N_TAIL)
VMULPD(ZMM(0), ZMM(31), MEM(RAX))
VMOVUPD(MEM(R14), ZMM(0))
LEA(RAX, MEM(RAX,RCX,1))
LEA(R14, MEM(R14,RDI,1))
SUB(RDX, IMM(1))
JNZ(PACK8_N_TAIL)
JMP(PACK8_DONE)
LABEL(PACK8_T)
CMP(RCX, IMM(8))
JNE(PACK8_G)
LEA(R8, MEM(RBX,RBX,2)) //inca*3
LEA(R9, MEM(RBX,RBX,4)) //inca*5
LEA(R10, MEM(R8 ,RBX,4)) //inca*7
MOV(RDX, RSI)
AND(RDX, IMM(7))
SAR(RSI, IMM(3))
JZ(PACK8_T_TAIL)
LABEL(PACK8_T_LOOP)
LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
16,17,18,19,20,21,22,23)
STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
LEA(RAX, MEM(RAX,RCX,8))
LEA(R14, MEM(R14,RDI,8))
SUB(RSI, IMM(1))
JNZ(PACK8_T_LOOP)
TEST(RDX, RDX)
JZ(PACK8_DONE)
LABEL(PACK8_T_TAIL)
MOV(RSI, IMM(1))
SHLX(RSI, RSI, RDX)
SUB(RSI, IMM(1))
KMOV(K(1), ESI) //mask for n%8 elements
LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7,1)
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
8, 9,10,11,12,13,14,15) 16,17,18,19,20,21,22,23)
STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
VMOVUPD(MEM(R14 ), ZMM( 8)) LEA(RAX, MEM(RAX,RCX,8))
SUB(RDX, IMM(1)) LEA(R14, MEM(R14,RDI,8))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,RDI,1), ZMM( 9))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,RDI,2), ZMM(10))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,R11,1), ZMM(11))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,RDI,4), ZMM(12))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,R12,1), ZMM(13))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,R11,2), ZMM(14))
JMP(PACK8_DONE) SUB(RSI, IMM(1))
LABEL(PACK8_G) JNZ(PACK8_T_LOOP)
VPBROADCASTD(ZMM(3), VAR(inca)) TEST(RDX, RDX)
MOV(RBX, VAR(offsetPtr)) JZ(PACK8_DONE)
VPMULLD(YMM(0), YMM(3), MEM(RBX))
LABEL(PACK8_G_LOOP) LABEL(PACK8_T_TAIL)
KXNORW(K(1), K(0), K(0)) MOV(RSI, IMM(1))
VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8)) SHLX(RSI, RSI, RDX)
VMULPD(ZMM(3), ZMM(3), ZMM(31)) SUB(RSI, IMM(1))
VMOVUPD(MEM(R14), ZMM(3)) KMOVW(K(1), ESI) //mask for n%8 elements
LEA(RAX, MEM(RAX,RCX,1)) LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7,1)
LEA(R14, MEM(R14,RDI,1)) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
8, 9,10,11,12,13,14,15)
SUB(RSI, IMM(1)) VMOVUPD(MEM(R14 ), ZMM( 8))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,RDI,1), ZMM( 9))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,RDI,2), ZMM(10))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,R11,1), ZMM(11))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,RDI,4), ZMM(12))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,R12,1), ZMM(13))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,R11,2), ZMM(14))
JNZ(PACK8_G_LOOP) JMP(PACK8_DONE)
LABEL(PACK8_DONE) LABEL(PACK8_G)
VPBROADCASTD(ZMM(3), VAR(inca))
MOV(RBX, VAR(offsetPtr))
VPMULLD(YMM(0), YMM(3), MEM(RBX))
LABEL(PACK8_G_LOOP)
KXNORW(K(1), K(0), K(0))
VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8))
VMULPD(ZMM(3), ZMM(3), ZMM(31))
VMOVUPD(MEM(R14), ZMM(3))
LEA(RAX, MEM(RAX,RCX,1))
LEA(R14, MEM(R14,RDI,1))
SUB(RSI, IMM(1))
JNZ(PACK8_G_LOOP)
LABEL(PACK8_DONE)
END_ASM(
: //output operands : //output operands
: //input operands : //input operands
[n] "m" (n), [n] "m" (n),
@@ -294,7 +297,7 @@ void bli_dpackm_knl_asm_8xk
"zmm30", "zmm31", "zmm30", "zmm31",
"rax", "rbx", "rcx", "rdx", "rdi", "rsi", "rax", "rbx", "rcx", "rdx", "rdi", "rsi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
); )
} }
void bli_dpackm_knl_asm_24xk void bli_dpackm_knl_asm_24xk
@@ -441,7 +444,7 @@ void bli_dpackm_knl_asm_24xk
MOV(R13, IMM(1)) MOV(R13, IMM(1))
SHLX(R13, R13, RSI) SHLX(R13, R13, RSI)
SUB(R13, IMM(1)) SUB(R13, IMM(1))
KMOV(K(1), R13D) //mask for n%8 elements KMOVW(K(1), R13D) //mask for n%8 elements
LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1) LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1)
LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1) LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1)

View File

@@ -32,10 +32,10 @@
*/ */
#include "bli_avx512_macros.h"
#include "blis.h" #include "blis.h"
#include <stdio.h> #define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \ #define LOADMUL8x8(a,o,s1,s3,s5,s7, \
z0,z1,z2,z3,z4,z5,z6,z7) \ z0,z1,z2,z3,z4,z5,z6,z7) \

View File

@@ -35,7 +35,8 @@
#include "blis.h" #include "blis.h"
#include <assert.h> #include <assert.h>
#include "bli_avx512_macros.h" #define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"
#define UNROLL_K 32 #define UNROLL_K 32
@@ -212,8 +213,8 @@ void bli_dgemm_knl_asm_24x8
int tlooph, tloopl, blooph, bloopl; int tlooph, tloopl, blooph, bloopl;
#endif #endif
__asm__ volatile BEGIN_ASM
(
#ifdef MONITORS #ifdef MONITORS
RDTSC RDTSC
MOV(VAR(topl), EAX) MOV(VAR(topl), EAX)
@@ -380,7 +381,7 @@ void bli_dgemm_knl_asm_24x8
JNZ(MAIN_LOOP) JNZ(MAIN_LOOP)
LABEL(REM_1) LABEL(REM_1)
SAR1(RDI) SAR(RDI)
JNC(REM_2) JNC(REM_2)
SUBITER(0,1,0,RAX) SUBITER(0,1,0,RAX)
@@ -389,7 +390,7 @@ void bli_dgemm_knl_asm_24x8
ADD(RBX, IMM( 8*8)) ADD(RBX, IMM( 8*8))
LABEL(REM_2) LABEL(REM_2)
SAR1(RDI) SAR(RDI)
JNC(REM_4) JNC(REM_4)
SUBITER(0,1,0,RAX) SUBITER(0,1,0,RAX)
@@ -398,7 +399,7 @@ void bli_dgemm_knl_asm_24x8
ADD(RBX, IMM(2* 8*8)) ADD(RBX, IMM(2* 8*8))
LABEL(REM_4) LABEL(REM_4)
SAR1(RDI) SAR(RDI)
JNC(REM_8) JNC(REM_8)
SUBITER(0,1,0,RAX) SUBITER(0,1,0,RAX)
@@ -409,7 +410,7 @@ void bli_dgemm_knl_asm_24x8
ADD(RBX, IMM(4* 8*8)) ADD(RBX, IMM(4* 8*8))
LABEL(REM_8) LABEL(REM_8)
SAR1(RDI) SAR(RDI)
JNC(REM_16) JNC(REM_16)
SUBITER(0,1,0,RAX ) SUBITER(0,1,0,RAX )
@@ -424,7 +425,7 @@ void bli_dgemm_knl_asm_24x8
ADD(RBX, IMM(8* 8*8)) ADD(RBX, IMM(8* 8*8))
LABEL(REM_16) LABEL(REM_16)
SAR1(RDI) SAR(RDI)
JNC(AFTER_LOOP) JNC(AFTER_LOOP)
SUBITER( 0,1,0,RAX ) SUBITER( 0,1,0,RAX )
@@ -570,7 +571,7 @@ void bli_dgemm_knl_asm_24x8
JNE(SCATTEREDUPDATE) JNE(SCATTEREDUPDATE)
VMOVQ(RDX, XMM(1)) VMOVQ(RDX, XMM(1))
SAL1(RDX) //shift out sign bit SAL(RDX) //shift out sign bit
JZ(COLSTORBZ) JZ(COLSTORBZ)
UPDATE_C_FOUR_ROWS( 8, 9,10,11) UPDATE_C_FOUR_ROWS( 8, 9,10,11)
@@ -602,7 +603,7 @@ void bli_dgemm_knl_asm_24x8
VPMULLD(ZMM(2), ZMM(3), ZMM(2)) VPMULLD(ZMM(2), ZMM(3), ZMM(2))
VMOVQ(RDX, XMM(1)) VMOVQ(RDX, XMM(1))
SAL1(RDX) //shift out sign bit SAL(RDX) //shift out sign bit
JZ(SCATTERBZ) JZ(SCATTERBZ)
UPDATE_C_ROW_SCATTERED( 8) UPDATE_C_ROW_SCATTERED( 8)
@@ -666,6 +667,8 @@ void bli_dgemm_knl_asm_24x8
MOV(VAR(botl), EAX) MOV(VAR(botl), EAX)
MOV(VAR(both), EDX) MOV(VAR(both), EDX)
#endif #endif
END_ASM(
: // output operands : // output operands
#ifdef MONITORS #ifdef MONITORS
[topl] "=m" (topl), [topl] "=m" (topl),
@@ -696,7 +699,7 @@ void bli_dgemm_knl_asm_24x8
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
"zmm30", "zmm31", "memory" "zmm30", "zmm31", "memory"
); )
#ifdef LOOPMON #ifdef LOOPMON
printf("looptime = \t%d\n", bloopl - tloopl); printf("looptime = \t%d\n", bloopl - tloopl);

View File

@@ -35,7 +35,8 @@
#include "blis.h" #include "blis.h"
#include <assert.h> #include <assert.h>
#include "bli_avx512_macros.h" #define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"
#define UNROLL_K 32 #define UNROLL_K 32
@@ -377,7 +378,7 @@ void bli_sgemm_knl_asm_24x16
JNZ(MAIN_LOOP) JNZ(MAIN_LOOP)
LABEL(REM_1) LABEL(REM_1)
SAR1(RDI) SAR(RDI)
JNC(REM_2) JNC(REM_2)
SUBITER(0,1,0,RAX) SUBITER(0,1,0,RAX)
@@ -386,7 +387,7 @@ void bli_sgemm_knl_asm_24x16
ADD(RBX, IMM(16*4)) ADD(RBX, IMM(16*4))
LABEL(REM_2) LABEL(REM_2)
SAR1(RDI) SAR(RDI)
JNC(REM_4) JNC(REM_4)
SUBITER(0,1,0,RAX) SUBITER(0,1,0,RAX)
@@ -395,7 +396,7 @@ void bli_sgemm_knl_asm_24x16
ADD(RBX, IMM(2*16*4)) ADD(RBX, IMM(2*16*4))
LABEL(REM_4) LABEL(REM_4)
SAR1(RDI) SAR(RDI)
JNC(REM_8) JNC(REM_8)
SUBITER(0,1,0,RAX) SUBITER(0,1,0,RAX)
@@ -406,7 +407,7 @@ void bli_sgemm_knl_asm_24x16
ADD(RBX, IMM(4*16*4)) ADD(RBX, IMM(4*16*4))
LABEL(REM_8) LABEL(REM_8)
SAR1(RDI) SAR(RDI)
JNC(REM_16) JNC(REM_16)
SUBITER(0,1,0,RAX ) SUBITER(0,1,0,RAX )
@@ -421,7 +422,7 @@ void bli_sgemm_knl_asm_24x16
ADD(RBX, IMM(8*16*4)) ADD(RBX, IMM(8*16*4))
LABEL(REM_16) LABEL(REM_16)
SAR1(RDI) SAR(RDI)
JNC(AFTER_LOOP) JNC(AFTER_LOOP)
SUBITER( 0,1,0,RAX ) SUBITER( 0,1,0,RAX )
@@ -567,7 +568,7 @@ void bli_sgemm_knl_asm_24x16
JNE(SCATTEREDUPDATE) JNE(SCATTEREDUPDATE)
VMOVD(EDX, XMM(1)) VMOVD(EDX, XMM(1))
SAL1(EDX) //shift out sign bit SAL(EDX) //shift out sign bit
JZ(COLSTORBZ) JZ(COLSTORBZ)
UPDATE_C_FOUR_ROWS( 8, 9,10,11) UPDATE_C_FOUR_ROWS( 8, 9,10,11)
@@ -599,7 +600,7 @@ void bli_sgemm_knl_asm_24x16
VPMULLD(ZMM(2), ZMM(3), ZMM(2)) VPMULLD(ZMM(2), ZMM(3), ZMM(2))
VMOVD(EDX, XMM(1)) VMOVD(EDX, XMM(1))
SAL1(EDX) //shift out sign bit SAL(EDX) //shift out sign bit
JZ(SCATTERBZ) JZ(SCATTERBZ)
UPDATE_C_ROW_SCATTERED( 8) UPDATE_C_ROW_SCATTERED( 8)

File diff suppressed because it is too large Load Diff

View File

@@ -34,6 +34,9 @@
#include "blis.h" #include "blis.h"
#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"
#if 0 #if 0
void bli_sgemmtrsm_l_penryn_asm_8x4 void bli_sgemmtrsm_l_penryn_asm_8x4
( (
@@ -75,446 +78,446 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
__asm__ volatile __asm__ volatile
( (
" \n\t"
"movq %2, %%rax \n\t" // load address of a10. mov(%2, rax) // load address of a10.
"movq %4, %%rbx \n\t" // load address of b01. mov(%4, rbx) // load address of b01.
//"movq %10, %%r9 \n\t" // load address of b_next. //mov(%10, r9) // load address of b_next.
" \n\t"
"subq $-8 * 16, %%rax \n\t" // increment pointers to allow byte sub(imm(0-8*16), rax) // increment pointers to allow byte
"subq $-8 * 16, %%rbx \n\t" // offsets in the unrolled iterations. sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.
" \n\t"
"movaps -8 * 16(%%rax), %%xmm0 \n\t" // initialize loop by pre-loading elements movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements
"movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b. movaps(mem(rax, -7*16), xmm1) // of a and b.
"movaps -8 * 16(%%rbx), %%xmm2 \n\t" movaps(mem(rbx, -8*16), xmm2)
" \n\t"
//"movq %6, %%rcx \n\t" // load address of c11 //mov(%6, rcx) // load address of c11
//"movq %9, %%rdi \n\t" // load cs_c //mov(%9, rdi) // load cs_c
//"leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double) //lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
//"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // load address of c + 2*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*cs_c;
" \n\t"
//"prefetcht2 0 * 8(%%r9) \n\t" // prefetch b_next //prefetch(2, mem(r9, 0*8)) // prefetch b_next
" \n\t"
"xorpd %%xmm3, %%xmm3 \n\t" xorpd(xmm3, xmm3)
"xorpd %%xmm4, %%xmm4 \n\t" xorpd(xmm4, xmm4)
"xorpd %%xmm5, %%xmm5 \n\t" xorpd(xmm5, xmm5)
"xorpd %%xmm6, %%xmm6 \n\t" xorpd(xmm6, xmm6)
" \n\t"
//"prefetcht2 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c //prefetch(2, mem(rcx, 3*8)) // prefetch c + 0*cs_c
"xorpd %%xmm8, %%xmm8 \n\t" xorpd(xmm8, xmm8)
"movaps %%xmm8, %%xmm9 \n\t" movaps(xmm8, xmm9)
//"prefetcht2 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c //prefetch(2, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c
"movaps %%xmm8, %%xmm10 \n\t" movaps(xmm8, xmm10)
"movaps %%xmm8, %%xmm11 \n\t" movaps(xmm8, xmm11)
//"prefetcht2 3 * 8(%%rdx) \n\t" // prefetch c + 2*cs_c //prefetch(2, mem(rdx, 3*8)) // prefetch c + 2*cs_c
"movaps %%xmm8, %%xmm12 \n\t" movaps(xmm8, xmm12)
"movaps %%xmm8, %%xmm13 \n\t" movaps(xmm8, xmm13)
//"prefetcht2 3 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 3*cs_c //prefetch(2, mem(rdx, rdi, 1, 3*8)) // prefetch c + 3*cs_c
"movaps %%xmm8, %%xmm14 \n\t" movaps(xmm8, xmm14)
"movaps %%xmm8, %%xmm15 \n\t" movaps(xmm8, xmm15)
" \n\t"
" \n\t"
" \n\t"
"movq %0, %%rsi \n\t" // i = k_iter; mov(%0, rsi) // i = k_iter;
"testq %%rsi, %%rsi \n\t" // check i via logical AND. test(rsi, rsi) // check i via logical AND.
"je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that je(.CONSIDERKLEFT) // if i == 0, jump to code that
" \n\t" // contains the k_left loop. // contains the k_left loop.
" \n\t"
" \n\t"
".LOOPKITER: \n\t" // MAIN LOOP label(.LOOPKITER) // MAIN LOOP
" \n\t"
//"prefetcht0 1264(%%rax) \n\t" //prefetch(0, mem(rax, 1264))
"prefetcht0 (4*35+1) * 8(%%rax) \n\t" prefetch(0, mem(4*35+1)*8(rax))
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 0 addpd(xmm3, xmm11) // iteration 0
"movaps -7 * 16(%%rbx), %%xmm3 \n\t" movaps(mem(rbx, -7*16), xmm3)
"addpd %%xmm4, %%xmm15 \n\t" addpd(xmm4, xmm15)
"movaps %%xmm2, %%xmm4 \n\t" movaps(xmm2, xmm4)
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t" pshufd(imm(0x4e), xmm2, xmm7)
"mulpd %%xmm0, %%xmm2 \n\t" mulpd(xmm0, xmm2)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t" addpd(xmm5, xmm10)
"addpd %%xmm6, %%xmm14 \n\t" addpd(xmm6, xmm14)
"movaps %%xmm7, %%xmm6 \n\t" movaps(xmm7, xmm6)
"mulpd %%xmm0, %%xmm7 \n\t" mulpd(xmm0, xmm7)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t" addpd(xmm2, xmm9)
"movaps -6 * 16(%%rbx), %%xmm2 \n\t" movaps(mem(rbx, -6*16), xmm2)
"addpd %%xmm4, %%xmm13 \n\t" addpd(xmm4, xmm13)
"movaps %%xmm3, %%xmm4 \n\t" movaps(xmm3, xmm4)
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t" pshufd(imm(0x4e), xmm3, xmm5)
"mulpd %%xmm0, %%xmm3 \n\t" mulpd(xmm0, xmm3)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t" addpd(xmm7, xmm8)
"addpd %%xmm6, %%xmm12 \n\t" addpd(xmm6, xmm12)
"movaps %%xmm5, %%xmm6 \n\t" movaps(xmm5, xmm6)
"mulpd %%xmm0, %%xmm5 \n\t" mulpd(xmm0, xmm5)
"movaps -6 * 16(%%rax), %%xmm0 \n\t" movaps(mem(rax, -6*16), xmm0)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
"movaps -5 * 16(%%rax), %%xmm1 \n\t" movaps(mem(rax, -5*16), xmm1)
" \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 1 addpd(xmm3, xmm11) // iteration 1
"movaps -5 * 16(%%rbx), %%xmm3 \n\t" movaps(mem(rbx, -5*16), xmm3)
"addpd %%xmm4, %%xmm15 \n\t" addpd(xmm4, xmm15)
"movaps %%xmm2, %%xmm4 \n\t" movaps(xmm2, xmm4)
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t" pshufd(imm(0x4e), xmm2, xmm7)
"mulpd %%xmm0, %%xmm2 \n\t" mulpd(xmm0, xmm2)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t" addpd(xmm5, xmm10)
"addpd %%xmm6, %%xmm14 \n\t" addpd(xmm6, xmm14)
"movaps %%xmm7, %%xmm6 \n\t" movaps(xmm7, xmm6)
"mulpd %%xmm0, %%xmm7 \n\t" mulpd(xmm0, xmm7)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t" addpd(xmm2, xmm9)
"movaps -4 * 16(%%rbx), %%xmm2 \n\t" movaps(mem(rbx, -4*16), xmm2)
"addpd %%xmm4, %%xmm13 \n\t" addpd(xmm4, xmm13)
"movaps %%xmm3, %%xmm4 \n\t" movaps(xmm3, xmm4)
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t" pshufd(imm(0x4e), xmm3, xmm5)
"mulpd %%xmm0, %%xmm3 \n\t" mulpd(xmm0, xmm3)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t" addpd(xmm7, xmm8)
"addpd %%xmm6, %%xmm12 \n\t" addpd(xmm6, xmm12)
"movaps %%xmm5, %%xmm6 \n\t" movaps(xmm5, xmm6)
"mulpd %%xmm0, %%xmm5 \n\t" mulpd(xmm0, xmm5)
"movaps -4 * 16(%%rax), %%xmm0 \n\t" movaps(mem(rax, -4*16), xmm0)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
"movaps -3 * 16(%%rax), %%xmm1 \n\t" movaps(mem(rax, -3*16), xmm1)
" \n\t"
//"prefetcht0 1328(%%rax) \n\t" //prefetch(0, mem(rax, 1328))
"prefetcht0 (4*37+1) * 8(%%rax) \n\t" prefetch(0, mem(4*37+1)*8(rax))
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 2 addpd(xmm3, xmm11) // iteration 2
"movaps -3 * 16(%%rbx), %%xmm3 \n\t" movaps(mem(rbx, -3*16), xmm3)
"addpd %%xmm4, %%xmm15 \n\t" addpd(xmm4, xmm15)
"movaps %%xmm2, %%xmm4 \n\t" movaps(xmm2, xmm4)
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t" pshufd(imm(0x4e), xmm2, xmm7)
"mulpd %%xmm0, %%xmm2 \n\t" mulpd(xmm0, xmm2)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t" addpd(xmm5, xmm10)
"addpd %%xmm6, %%xmm14 \n\t" addpd(xmm6, xmm14)
"movaps %%xmm7, %%xmm6 \n\t" movaps(xmm7, xmm6)
"mulpd %%xmm0, %%xmm7 \n\t" mulpd(xmm0, xmm7)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t" addpd(xmm2, xmm9)
"movaps -2 * 16(%%rbx), %%xmm2 \n\t" movaps(mem(rbx, -2*16), xmm2)
"addpd %%xmm4, %%xmm13 \n\t" addpd(xmm4, xmm13)
"movaps %%xmm3, %%xmm4 \n\t" movaps(xmm3, xmm4)
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t" pshufd(imm(0x4e), xmm3, xmm5)
"mulpd %%xmm0, %%xmm3 \n\t" mulpd(xmm0, xmm3)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t" addpd(xmm7, xmm8)
"addpd %%xmm6, %%xmm12 \n\t" addpd(xmm6, xmm12)
"movaps %%xmm5, %%xmm6 \n\t" movaps(xmm5, xmm6)
"mulpd %%xmm0, %%xmm5 \n\t" mulpd(xmm0, xmm5)
"movaps -2 * 16(%%rax), %%xmm0 \n\t" movaps(mem(rax, -2*16), xmm0)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
"movaps -1 * 16(%%rax), %%xmm1 \n\t" movaps(mem(rax, -1*16), xmm1)
" \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 3 addpd(xmm3, xmm11) // iteration 3
"movaps -1 * 16(%%rbx), %%xmm3 \n\t" movaps(mem(rbx, -1*16), xmm3)
"addpd %%xmm4, %%xmm15 \n\t" addpd(xmm4, xmm15)
"movaps %%xmm2, %%xmm4 \n\t" movaps(xmm2, xmm4)
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t" pshufd(imm(0x4e), xmm2, xmm7)
"mulpd %%xmm0, %%xmm2 \n\t" mulpd(xmm0, xmm2)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"subq $-4 * 4 * 8, %%rax \n\t" // a += 4*4 (unroll x mr) sub(imm(0-4*4*8), rax) // a += 4*4 (unroll x mr)
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t" addpd(xmm5, xmm10)
"addpd %%xmm6, %%xmm14 \n\t" addpd(xmm6, xmm14)
"movaps %%xmm7, %%xmm6 \n\t" movaps(xmm7, xmm6)
"mulpd %%xmm0, %%xmm7 \n\t" mulpd(xmm0, xmm7)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
" \n\t"
//"subq $-4 * 4 * 8, %%r9 \n\t" // b_next += 4*4 (unroll x nr) //sub(imm(-4*4*8), r9) // b_next += 4*4 (unroll x nr)
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t" addpd(xmm2, xmm9)
"movaps 0 * 16(%%rbx), %%xmm2 \n\t" movaps(mem(rbx, 0*16), xmm2)
"addpd %%xmm4, %%xmm13 \n\t" addpd(xmm4, xmm13)
"movaps %%xmm3, %%xmm4 \n\t" movaps(xmm3, xmm4)
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t" pshufd(imm(0x4e), xmm3, xmm5)
"mulpd %%xmm0, %%xmm3 \n\t" mulpd(xmm0, xmm3)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"subq $-4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) sub(imm(0-4*4*8), rbx) // b += 4*4 (unroll x nr)
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t" addpd(xmm7, xmm8)
"addpd %%xmm6, %%xmm12 \n\t" addpd(xmm6, xmm12)
"movaps %%xmm5, %%xmm6 \n\t" movaps(xmm5, xmm6)
"mulpd %%xmm0, %%xmm5 \n\t" mulpd(xmm0, xmm5)
"movaps -8 * 16(%%rax), %%xmm0 \n\t" movaps(mem(rax, -8*16), xmm0)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
"movaps -7 * 16(%%rax), %%xmm1 \n\t" movaps(mem(rax, -7*16), xmm1)
" \n\t"
//"prefetcht2 0 * 8(%%r9) \n\t" // prefetch b_next[0] //prefetch(2, mem(r9, 0*8)) // prefetch b_next[0]
//"prefetcht2 8 * 8(%%r9) \n\t" // prefetch b_next[8] //prefetch(2, mem(r9, 8*8)) // prefetch b_next[8]
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1; dec(rsi) // i -= 1;
"jne .LOOPKITER \n\t" // iterate again if i != 0. jne(.LOOPKITER) // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".CONSIDERKLEFT: \n\t" label(.CONSIDERKLEFT)
" \n\t"
"movq %1, %%rsi \n\t" // i = k_left; mov(%1, rsi) // i = k_left;
"testq %%rsi, %%rsi \n\t" // check i via logical AND. test(rsi, rsi) // check i via logical AND.
"je .POSTACCUM \n\t" // if i == 0, we're done; jump to end. je(.POSTACCUM) // if i == 0, we're done; jump to end.
" \n\t" // else, we prepare to enter k_left loop. // else, we prepare to enter k_left loop.
" \n\t"
" \n\t"
".LOOPKLEFT: \n\t" // EDGE LOOP label(.LOOPKLEFT) // EDGE LOOP
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 0 addpd(xmm3, xmm11) // iteration 0
"movaps -7 * 16(%%rbx), %%xmm3 \n\t" movaps(mem(rbx, -7*16), xmm3)
"addpd %%xmm4, %%xmm15 \n\t" addpd(xmm4, xmm15)
"movaps %%xmm2, %%xmm4 \n\t" movaps(xmm2, xmm4)
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t" pshufd(imm(0x4e), xmm2, xmm7)
"mulpd %%xmm0, %%xmm2 \n\t" mulpd(xmm0, xmm2)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t" addpd(xmm5, xmm10)
"addpd %%xmm6, %%xmm14 \n\t" addpd(xmm6, xmm14)
"movaps %%xmm7, %%xmm6 \n\t" movaps(xmm7, xmm6)
"mulpd %%xmm0, %%xmm7 \n\t" mulpd(xmm0, xmm7)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t" addpd(xmm2, xmm9)
"movaps -6 * 16(%%rbx), %%xmm2 \n\t" movaps(mem(rbx, -6*16), xmm2)
"addpd %%xmm4, %%xmm13 \n\t" addpd(xmm4, xmm13)
"movaps %%xmm3, %%xmm4 \n\t" movaps(xmm3, xmm4)
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t" pshufd(imm(0x4e), xmm3, xmm5)
"mulpd %%xmm0, %%xmm3 \n\t" mulpd(xmm0, xmm3)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t" addpd(xmm7, xmm8)
"addpd %%xmm6, %%xmm12 \n\t" addpd(xmm6, xmm12)
"movaps %%xmm5, %%xmm6 \n\t" movaps(xmm5, xmm6)
"mulpd %%xmm0, %%xmm5 \n\t" mulpd(xmm0, xmm5)
"movaps -6 * 16(%%rax), %%xmm0 \n\t" movaps(mem(rax, -6*16), xmm0)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
"movaps -5 * 16(%%rax), %%xmm1 \n\t" movaps(mem(rax, -5*16), xmm1)
" \n\t"
" \n\t"
"subq $-4 * 1 * 8, %%rax \n\t" // a += 4 (1 x mr) sub(imm(0-4*1*8), rax) // a += 4 (1 x mr)
"subq $-4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) sub(imm(0-4*1*8), rbx) // b += 4 (1 x nr)
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1; dec(rsi) // i -= 1;
"jne .LOOPKLEFT \n\t" // iterate again if i != 0. jne(.LOOPKLEFT) // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".POSTACCUM: \n\t" label(.POSTACCUM)
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" addpd(xmm3, xmm11)
"addpd %%xmm4, %%xmm15 \n\t" addpd(xmm4, xmm15)
"addpd %%xmm5, %%xmm10 \n\t" addpd(xmm5, xmm10)
"addpd %%xmm6, %%xmm14 \n\t" addpd(xmm6, xmm14)
" \n\t"
" \n\t"
" \n\t"
"movq %5, %%rbx \n\t" // load address of b11. mov(%5, rbx) // load address of b11.
" \n\t"
" \n\t" // xmm8: xmm9: xmm10: xmm11: // xmm8: xmm9: xmm10: xmm11:
" \n\t" // ( ab01 ( ab00 ( ab03 ( ab02 // ( ab01 ( ab00 ( ab03 ( ab02
" \n\t" // ab10 ) ab11 ) ab12 ) ab13 ) // ab10 ) ab11 ) ab12 ) ab13 )
" \n\t" // //
" \n\t" // xmm12: xmm13: xmm14: xmm15: // xmm12: xmm13: xmm14: xmm15:
" \n\t" // ( ab21 ( ab20 ( ab23 ( ab22 // ( ab21 ( ab20 ( ab23 ( ab22
" \n\t" // ab30 ) ab31 ) ab32 ) ab33 ) // ab30 ) ab31 ) ab32 ) ab33 )
"movaps %%xmm9, %%xmm0 \n\t" movaps(xmm9, xmm0)
"movaps %%xmm8, %%xmm1 \n\t" movaps(xmm8, xmm1)
"unpcklpd %%xmm8, %%xmm0 \n\t" unpcklpd(xmm8, xmm0)
"unpckhpd %%xmm9, %%xmm1 \n\t" unpckhpd(xmm9, xmm1)
" \n\t"
"movaps %%xmm11, %%xmm4 \n\t" movaps(xmm11, xmm4)
"movaps %%xmm10, %%xmm5 \n\t" movaps(xmm10, xmm5)
"unpcklpd %%xmm10, %%xmm4 \n\t" unpcklpd(xmm10, xmm4)
"unpckhpd %%xmm11, %%xmm5 \n\t" unpckhpd(xmm11, xmm5)
" \n\t"
"movaps %%xmm13, %%xmm2 \n\t" movaps(xmm13, xmm2)
"movaps %%xmm12, %%xmm3 \n\t" movaps(xmm12, xmm3)
"unpcklpd %%xmm12, %%xmm2 \n\t" unpcklpd(xmm12, xmm2)
"unpckhpd %%xmm13, %%xmm3 \n\t" unpckhpd(xmm13, xmm3)
" \n\t"
"movaps %%xmm15, %%xmm6 \n\t" movaps(xmm15, xmm6)
"movaps %%xmm14, %%xmm7 \n\t" movaps(xmm14, xmm7)
"unpcklpd %%xmm14, %%xmm6 \n\t" unpcklpd(xmm14, xmm6)
"unpckhpd %%xmm15, %%xmm7 \n\t" unpckhpd(xmm15, xmm7)
" \n\t"
" \n\t" // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 ) // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 )
" \n\t" // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 ) // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 )
" \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
" \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
" \n\t"
"movq %9, %%rax \n\t" // load address of alpha mov(%9, rax) // load address of alpha
"movddup (%%rax), %%xmm15 \n\t" // load alpha and duplicate movddup(mem(rax), xmm15) // load alpha and duplicate
" \n\t"
"movaps 0 * 16(%%rbx), %%xmm8 \n\t" movaps(mem(rbx, 0*16), xmm8)
"movaps 1 * 16(%%rbx), %%xmm12 \n\t" movaps(mem(rbx, 1*16), xmm12)
"mulpd %%xmm15, %%xmm8 \n\t" // xmm8 = alpha * ( beta00 beta01 ) mulpd(xmm15, xmm8) // xmm8 = alpha * ( beta00 beta01 )
"mulpd %%xmm15, %%xmm12 \n\t" // xmm12 = alpha * ( beta02 beta03 ) mulpd(xmm15, xmm12) // xmm12 = alpha * ( beta02 beta03 )
"movaps 2 * 16(%%rbx), %%xmm9 \n\t" movaps(mem(rbx, 2*16), xmm9)
"movaps 3 * 16(%%rbx), %%xmm13 \n\t" movaps(mem(rbx, 3*16), xmm13)
"mulpd %%xmm15, %%xmm9 \n\t" // xmm9 = alpha * ( beta10 beta11 ) mulpd(xmm15, xmm9) // xmm9 = alpha * ( beta10 beta11 )
"mulpd %%xmm15, %%xmm13 \n\t" // xmm13 = alpha * ( beta12 beta13 ) mulpd(xmm15, xmm13) // xmm13 = alpha * ( beta12 beta13 )
"movaps 4 * 16(%%rbx), %%xmm10 \n\t" movaps(mem(rbx, 4*16), xmm10)
"movaps 5 * 16(%%rbx), %%xmm14 \n\t" movaps(mem(rbx, 5*16), xmm14)
"mulpd %%xmm15, %%xmm10 \n\t" // xmm10 = alpha * ( beta20 beta21 ) mulpd(xmm15, xmm10) // xmm10 = alpha * ( beta20 beta21 )
"mulpd %%xmm15, %%xmm14 \n\t" // xmm14 = alpha * ( beta22 beta23 ) mulpd(xmm15, xmm14) // xmm14 = alpha * ( beta22 beta23 )
"movaps 6 * 16(%%rbx), %%xmm11 \n\t" movaps(mem(rbx, 6*16), xmm11)
"mulpd %%xmm15, %%xmm11 \n\t" // xmm11 = alpha * ( beta30 beta31 ) mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 )
"mulpd 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = alpha * ( beta32 beta33 ) mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 )
" \n\t"
" \n\t" // (Now scaled by alpha:) // (Now scaled by alpha:)
" \n\t" // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 )
" \n\t" // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 )
" \n\t" // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 )
" \n\t" // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 ) // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 )
" \n\t"
"subpd %%xmm0, %%xmm8 \n\t" // xmm8 -= xmm0 subpd(xmm0, xmm8) // xmm8 -= xmm0
"subpd %%xmm1, %%xmm9 \n\t" // xmm9 -= xmm1 subpd(xmm1, xmm9) // xmm9 -= xmm1
"subpd %%xmm2, %%xmm10 \n\t" // xmm10 -= xmm2 subpd(xmm2, xmm10) // xmm10 -= xmm2
"subpd %%xmm3, %%xmm11 \n\t" // xmm11 -= xmm3 subpd(xmm3, xmm11) // xmm11 -= xmm3
"subpd %%xmm4, %%xmm12 \n\t" // xmm12 -= xmm4 subpd(xmm4, xmm12) // xmm12 -= xmm4
"subpd %%xmm5, %%xmm13 \n\t" // xmm13 -= xmm5 subpd(xmm5, xmm13) // xmm13 -= xmm5
"subpd %%xmm6, %%xmm14 \n\t" // xmm14 -= xmm6 subpd(xmm6, xmm14) // xmm14 -= xmm6
"subpd %%xmm7, %%xmm15 \n\t" // xmm15 -= xmm7 subpd(xmm7, xmm15) // xmm15 -= xmm7
" \n\t"
" \n\t"
" \n\t"
".TRSM: \n\t" label(.TRSM)
" \n\t"
" \n\t"
"movq %3, %%rax \n\t" // load address of a11 mov(%3, rax) // load address of a11
"movq %6, %%rcx \n\t" // load address of c11 mov(%6, rcx) // load address of c11
" \n\t"
"movq %7, %%rsi \n\t" // load rs_c mov(%7, rsi) // load rs_c
"movq %8, %%rdi \n\t" // load cs_c mov(%8, rdi) // load cs_c
"salq $3, %%rsi \n\t" // rs_c *= sizeof( double ) sal(imm(3), rsi) // rs_c *= sizeof( double )
"salq $3, %%rdi \n\t" // cs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double )
" \n\t"
"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // c11_2 = c11 + 2*cs_c lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 0 // iteration 0
" \n\t"
"movddup (0+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = (1/alpha00) movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00)
" \n\t"
"mulpd %%xmm0, %%xmm8 \n\t" // xmm8 *= (1/alpha00); mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00);
"mulpd %%xmm0, %%xmm12 \n\t" // xmm12 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
" \n\t"
"movaps %%xmm8, 0 * 16(%%rbx) \n\t" // store ( beta00 beta01 ) = xmm8 movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
"movaps %%xmm12, 1 * 16(%%rbx) \n\t" // store ( beta02 beta03 ) = xmm12 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
"movlpd %%xmm8, (%%rcx) \n\t" // store ( gamma00 ) = xmm8[0] movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
"movhpd %%xmm8, (%%rcx,%%rdi) \n\t" // store ( gamma01 ) = xmm8[1] movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1]
"movlpd %%xmm12, (%%rdx) \n\t" // store ( gamma02 ) = xmm12[0] movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0]
"movhpd %%xmm12, (%%rdx,%%rdi) \n\t" // store ( gamma03 ) = xmm12[1] movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
"addq %%rsi, %%rcx \n\t" // c11 += rs_c add(rsi, rcx) // c11 += rs_c
"addq %%rsi, %%rdx \n\t" // c11_2 += rs_c add(rsi, rdx) // c11_2 += rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 1 // iteration 1
" \n\t"
"movddup (1+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha10 movddup(mem(1+0*4)*8(rax), xmm0) // load xmm0 = alpha10
"movddup (1+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = (1/alpha11) movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11)
" \n\t"
"movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0 movaps(xmm0, xmm4) // xmm4 = xmm0
"mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha10 * ( beta00 beta01 ) mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 )
"mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha10 * ( beta02 beta03 ) mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 )
"subpd %%xmm0, %%xmm9 \n\t" // xmm9 -= xmm0 subpd(xmm0, xmm9) // xmm9 -= xmm0
"subpd %%xmm4, %%xmm13 \n\t" // xmm13 -= xmm4 subpd(xmm4, xmm13) // xmm13 -= xmm4
"mulpd %%xmm1, %%xmm9 \n\t" // xmm9 *= (1/alpha11); mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11);
"mulpd %%xmm1, %%xmm13 \n\t" // xmm13 *= (1/alpha11); mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11);
" \n\t"
"movaps %%xmm9, 2 * 16(%%rbx) \n\t" // store ( beta10 beta11 ) = xmm9 movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
"movaps %%xmm13, 3 * 16(%%rbx) \n\t" // store ( beta12 beta13 ) = xmm13 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
"movlpd %%xmm9, (%%rcx) \n\t" // store ( gamma10 ) = xmm9[0] movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
"movhpd %%xmm9, (%%rcx,%%rdi) \n\t" // store ( gamma11 ) = xmm9[1] movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1]
"movlpd %%xmm13, (%%rdx) \n\t" // store ( gamma12 ) = xmm13[0] movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0]
"movhpd %%xmm13, (%%rdx,%%rdi) \n\t" // store ( gamma13 ) = xmm13[1] movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
"addq %%rsi, %%rcx \n\t" // c11 += rs_c add(rsi, rcx) // c11 += rs_c
"addq %%rsi, %%rdx \n\t" // c11_2 += rs_c add(rsi, rdx) // c11_2 += rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 2 // iteration 2
" \n\t"
"movddup (2+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha20 movddup(mem(2+0*4)*8(rax), xmm0) // load xmm0 = alpha20
"movddup (2+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha21 movddup(mem(2+1*4)*8(rax), xmm1) // load xmm1 = alpha21
"movddup (2+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = (1/alpha22) movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22)
" \n\t"
"movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0 movaps(xmm0, xmm4) // xmm4 = xmm0
"movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1 movaps(xmm1, xmm5) // xmm5 = xmm1
"mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha20 * ( beta00 beta01 ) mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 )
"mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha20 * ( beta02 beta03 ) mulpd(xmm12, xmm4) // xmm4 = alpha20 * ( beta02 beta03 )
"mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha21 * ( beta10 beta11 ) mulpd(xmm9, xmm1) // xmm1 = alpha21 * ( beta10 beta11 )
"mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha21 * ( beta12 beta13 ) mulpd(xmm13, xmm5) // xmm5 = alpha21 * ( beta12 beta13 )
"addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1; addpd(xmm1, xmm0) // xmm0 += xmm1;
"addpd %%xmm5, %%xmm4 \n\t" // xmm4 += xmm5; addpd(xmm5, xmm4) // xmm4 += xmm5;
"subpd %%xmm0, %%xmm10 \n\t" // xmm10 -= xmm0 subpd(xmm0, xmm10) // xmm10 -= xmm0
"subpd %%xmm4, %%xmm14 \n\t" // xmm14 -= xmm4 subpd(xmm4, xmm14) // xmm14 -= xmm4
"mulpd %%xmm2, %%xmm10 \n\t" // xmm10 *= (1/alpha22); mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22);
"mulpd %%xmm2, %%xmm14 \n\t" // xmm14 *= (1/alpha22); mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22);
" \n\t"
"movaps %%xmm10, 4 * 16(%%rbx) \n\t" // store ( beta20 beta21 ) = xmm10 movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
"movaps %%xmm14, 5 * 16(%%rbx) \n\t" // store ( beta22 beta23 ) = xmm14 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
"movlpd %%xmm10, (%%rcx) \n\t" // store ( gamma20 ) = xmm10[0] movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
"movhpd %%xmm10, (%%rcx,%%rdi) \n\t" // store ( gamma21 ) = xmm10[1] movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1]
"movlpd %%xmm14, (%%rdx) \n\t" // store ( gamma22 ) = xmm14[0] movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0]
"movhpd %%xmm14, (%%rdx,%%rdi) \n\t" // store ( gamma23 ) = xmm14[1] movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
"addq %%rsi, %%rcx \n\t" // c11 += rs_c add(rsi, rcx) // c11 += rs_c
"addq %%rsi, %%rdx \n\t" // c11_2 += rs_c add(rsi, rdx) // c11_2 += rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 3 // iteration 3
" \n\t"
"movddup (3+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha30 movddup(mem(3+0*4)*8(rax), xmm0) // load xmm0 = alpha30
"movddup (3+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha31 movddup(mem(3+1*4)*8(rax), xmm1) // load xmm1 = alpha31
"movddup (3+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha32 movddup(mem(3+2*4)*8(rax), xmm2) // load xmm2 = alpha32
"movddup (3+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = (1/alpha33) movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33)
" \n\t"
"movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0 movaps(xmm0, xmm4) // xmm4 = xmm0
"movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1 movaps(xmm1, xmm5) // xmm5 = xmm1
"movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2 movaps(xmm2, xmm6) // xmm6 = xmm2
"mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha30 * ( beta00 beta01 ) mulpd(xmm8, xmm0) // xmm0 = alpha30 * ( beta00 beta01 )
"mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha30 * ( beta02 beta03 ) mulpd(xmm12, xmm4) // xmm4 = alpha30 * ( beta02 beta03 )
"mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha31 * ( beta10 beta11 ) mulpd(xmm9, xmm1) // xmm1 = alpha31 * ( beta10 beta11 )
"mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha31 * ( beta12 beta13 ) mulpd(xmm13, xmm5) // xmm5 = alpha31 * ( beta12 beta13 )
"mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha32 * ( beta20 beta21 ) mulpd(xmm10, xmm2) // xmm2 = alpha32 * ( beta20 beta21 )
"mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha32 * ( beta22 beta23 ) mulpd(xmm14, xmm6) // xmm6 = alpha32 * ( beta22 beta23 )
"addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1; addpd(xmm1, xmm0) // xmm0 += xmm1;
"addpd %%xmm5, %%xmm4 \n\t" // xmm4 += xmm5; addpd(xmm5, xmm4) // xmm4 += xmm5;
"addpd %%xmm2, %%xmm0 \n\t" // xmm0 += xmm2; addpd(xmm2, xmm0) // xmm0 += xmm2;
"addpd %%xmm6, %%xmm4 \n\t" // xmm4 += xmm6; addpd(xmm6, xmm4) // xmm4 += xmm6;
"subpd %%xmm0, %%xmm11 \n\t" // xmm11 -= xmm0 subpd(xmm0, xmm11) // xmm11 -= xmm0
"subpd %%xmm4, %%xmm15 \n\t" // xmm15 -= xmm4 subpd(xmm4, xmm15) // xmm15 -= xmm4
"mulpd %%xmm3, %%xmm11 \n\t" // xmm11 *= (1/alpha33); mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
"mulpd %%xmm3, %%xmm15 \n\t" // xmm15 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
" \n\t"
"movaps %%xmm11, 6 * 16(%%rbx) \n\t" // store ( beta30 beta31 ) = xmm11 movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
"movaps %%xmm15, 7 * 16(%%rbx) \n\t" // store ( beta32 beta33 ) = xmm15 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
"movlpd %%xmm11, (%%rcx) \n\t" // store ( gamma30 ) = xmm11[0] movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
"movhpd %%xmm11, (%%rcx,%%rdi) \n\t" // store ( gamma31 ) = xmm11[1] movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1]
"movlpd %%xmm15, (%%rdx) \n\t" // store ( gamma32 ) = xmm15[0] movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0]
"movhpd %%xmm15, (%%rdx,%%rdi) \n\t" // store ( gamma33 ) = xmm15[1] movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
" \n\t"
" \n\t"
" \n\t"
: // output operands (none) : // output operands (none)
: // input operands : // input operands
@@ -540,3 +543,4 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
} }

View File

@@ -34,6 +34,9 @@
#include "blis.h" #include "blis.h"
#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"
#if 0 #if 0
void bli_sgemmtrsm_u_penryn_asm_8x4 void bli_sgemmtrsm_u_penryn_asm_8x4
( (
@@ -75,432 +78,432 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
__asm__ volatile __asm__ volatile
( (
" \n\t"
"movq %2, %%rax \n\t" // load address of a12. mov(%2, rax) // load address of a12.
"movq %4, %%rbx \n\t" // load address of b21. mov(%4, rbx) // load address of b21.
//"movq %10, %%r9 \n\t" // load address of b_next. //mov(%10, r9) // load address of b_next.
" \n\t"
"addq $8 * 16, %%rax \n\t" // increment pointers to allow byte add(imm(8*16), rax) // increment pointers to allow byte
"addq $8 * 16, %%rbx \n\t" // offsets in the unrolled iterations. add(imm(8*16), rbx) // offsets in the unrolled iterations.
" \n\t"
"movaps -8 * 16(%%rax), %%xmm0 \n\t" // initialize loop by pre-loading elements movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements
"movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b. movaps(mem(rax, -7*16), xmm1) // of a and b.
"movaps -8 * 16(%%rbx), %%xmm2 \n\t" movaps(mem(rbx, -8*16), xmm2)
" \n\t"
"xorpd %%xmm3, %%xmm3 \n\t" xorpd(xmm3, xmm3)
"xorpd %%xmm4, %%xmm4 \n\t" xorpd(xmm4, xmm4)
"xorpd %%xmm5, %%xmm5 \n\t" xorpd(xmm5, xmm5)
"xorpd %%xmm6, %%xmm6 \n\t" xorpd(xmm6, xmm6)
" \n\t"
"xorpd %%xmm8, %%xmm8 \n\t" xorpd(xmm8, xmm8)
"movaps %%xmm8, %%xmm9 \n\t" movaps(xmm8, xmm9)
"movaps %%xmm8, %%xmm10 \n\t" movaps(xmm8, xmm10)
"movaps %%xmm8, %%xmm11 \n\t" movaps(xmm8, xmm11)
"movaps %%xmm8, %%xmm12 \n\t" movaps(xmm8, xmm12)
"movaps %%xmm8, %%xmm13 \n\t" movaps(xmm8, xmm13)
"movaps %%xmm8, %%xmm14 \n\t" movaps(xmm8, xmm14)
"movaps %%xmm8, %%xmm15 \n\t" movaps(xmm8, xmm15)
" \n\t"
" \n\t"
" \n\t"
"movq %0, %%rsi \n\t" // i = k_iter; mov(%0, rsi) // i = k_iter;
"testq %%rsi, %%rsi \n\t" // check i via logical AND. test(rsi, rsi) // check i via logical AND.
"je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that je(.CONSIDERKLEFT) // if i == 0, jump to code that
" \n\t" // contains the k_left loop. // contains the k_left loop.
" \n\t"
" \n\t"
".LOOPKITER: \n\t" // MAIN LOOP label(.LOOPKITER) // MAIN LOOP
" \n\t"
"prefetcht0 1264(%%rax) \n\t" prefetch(0, mem(rax, 1264))
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 0 addpd(xmm3, xmm11) // iteration 0
"movaps -7 * 16(%%rbx), %%xmm3 \n\t" movaps(mem(rbx, -7*16), xmm3)
"addpd %%xmm4, %%xmm15 \n\t" addpd(xmm4, xmm15)
"movaps %%xmm2, %%xmm4 \n\t" movaps(xmm2, xmm4)
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t" pshufd(imm(0x4e), xmm2, xmm7)
"mulpd %%xmm0, %%xmm2 \n\t" mulpd(xmm0, xmm2)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t" addpd(xmm5, xmm10)
"addpd %%xmm6, %%xmm14 \n\t" addpd(xmm6, xmm14)
"movaps %%xmm7, %%xmm6 \n\t" movaps(xmm7, xmm6)
"mulpd %%xmm0, %%xmm7 \n\t" mulpd(xmm0, xmm7)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t" addpd(xmm2, xmm9)
"movaps -6 * 16(%%rbx), %%xmm2 \n\t" movaps(mem(rbx, -6*16), xmm2)
"addpd %%xmm4, %%xmm13 \n\t" addpd(xmm4, xmm13)
"movaps %%xmm3, %%xmm4 \n\t" movaps(xmm3, xmm4)
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t" pshufd(imm(0x4e), xmm3, xmm5)
"mulpd %%xmm0, %%xmm3 \n\t" mulpd(xmm0, xmm3)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t" addpd(xmm7, xmm8)
"addpd %%xmm6, %%xmm12 \n\t" addpd(xmm6, xmm12)
"movaps %%xmm5, %%xmm6 \n\t" movaps(xmm5, xmm6)
"mulpd %%xmm0, %%xmm5 \n\t" mulpd(xmm0, xmm5)
"movaps -6 * 16(%%rax), %%xmm0 \n\t" movaps(mem(rax, -6*16), xmm0)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
"movaps -5 * 16(%%rax), %%xmm1 \n\t" movaps(mem(rax, -5*16), xmm1)
" \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 1 addpd(xmm3, xmm11) // iteration 1
"movaps -5 * 16(%%rbx), %%xmm3 \n\t" movaps(mem(rbx, -5*16), xmm3)
"addpd %%xmm4, %%xmm15 \n\t" addpd(xmm4, xmm15)
"movaps %%xmm2, %%xmm4 \n\t" movaps(xmm2, xmm4)
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t" pshufd(imm(0x4e), xmm2, xmm7)
"mulpd %%xmm0, %%xmm2 \n\t" mulpd(xmm0, xmm2)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t" addpd(xmm5, xmm10)
"addpd %%xmm6, %%xmm14 \n\t" addpd(xmm6, xmm14)
"movaps %%xmm7, %%xmm6 \n\t" movaps(xmm7, xmm6)
"mulpd %%xmm0, %%xmm7 \n\t" mulpd(xmm0, xmm7)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t" addpd(xmm2, xmm9)
"movaps -4 * 16(%%rbx), %%xmm2 \n\t" movaps(mem(rbx, -4*16), xmm2)
"addpd %%xmm4, %%xmm13 \n\t" addpd(xmm4, xmm13)
"movaps %%xmm3, %%xmm4 \n\t" movaps(xmm3, xmm4)
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t" pshufd(imm(0x4e), xmm3, xmm5)
"mulpd %%xmm0, %%xmm3 \n\t" mulpd(xmm0, xmm3)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t" addpd(xmm7, xmm8)
"addpd %%xmm6, %%xmm12 \n\t" addpd(xmm6, xmm12)
"movaps %%xmm5, %%xmm6 \n\t" movaps(xmm5, xmm6)
"mulpd %%xmm0, %%xmm5 \n\t" mulpd(xmm0, xmm5)
"movaps -4 * 16(%%rax), %%xmm0 \n\t" movaps(mem(rax, -4*16), xmm0)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
"movaps -3 * 16(%%rax), %%xmm1 \n\t" movaps(mem(rax, -3*16), xmm1)
" \n\t"
"prefetcht0 1328(%%rax) \n\t" prefetch(0, mem(rax, 1328))
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 2 addpd(xmm3, xmm11) // iteration 2
"movaps -3 * 16(%%rbx), %%xmm3 \n\t" movaps(mem(rbx, -3*16), xmm3)
"addpd %%xmm4, %%xmm15 \n\t" addpd(xmm4, xmm15)
"movaps %%xmm2, %%xmm4 \n\t" movaps(xmm2, xmm4)
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t" pshufd(imm(0x4e), xmm2, xmm7)
"mulpd %%xmm0, %%xmm2 \n\t" mulpd(xmm0, xmm2)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t" addpd(xmm5, xmm10)
"addpd %%xmm6, %%xmm14 \n\t" addpd(xmm6, xmm14)
"movaps %%xmm7, %%xmm6 \n\t" movaps(xmm7, xmm6)
"mulpd %%xmm0, %%xmm7 \n\t" mulpd(xmm0, xmm7)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t" addpd(xmm2, xmm9)
"movaps -2 * 16(%%rbx), %%xmm2 \n\t" movaps(mem(rbx, -2*16), xmm2)
"addpd %%xmm4, %%xmm13 \n\t" addpd(xmm4, xmm13)
"movaps %%xmm3, %%xmm4 \n\t" movaps(xmm3, xmm4)
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t" pshufd(imm(0x4e), xmm3, xmm5)
"mulpd %%xmm0, %%xmm3 \n\t" mulpd(xmm0, xmm3)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t" addpd(xmm7, xmm8)
"addpd %%xmm6, %%xmm12 \n\t" addpd(xmm6, xmm12)
"movaps %%xmm5, %%xmm6 \n\t" movaps(xmm5, xmm6)
"mulpd %%xmm0, %%xmm5 \n\t" mulpd(xmm0, xmm5)
"movaps -2 * 16(%%rax), %%xmm0 \n\t" movaps(mem(rax, -2*16), xmm0)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
"movaps -1 * 16(%%rax), %%xmm1 \n\t" movaps(mem(rax, -1*16), xmm1)
" \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 3 addpd(xmm3, xmm11) // iteration 3
"movaps -1 * 16(%%rbx), %%xmm3 \n\t" movaps(mem(rbx, -1*16), xmm3)
"addpd %%xmm4, %%xmm15 \n\t" addpd(xmm4, xmm15)
"movaps %%xmm2, %%xmm4 \n\t" movaps(xmm2, xmm4)
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t" pshufd(imm(0x4e), xmm2, xmm7)
"mulpd %%xmm0, %%xmm2 \n\t" mulpd(xmm0, xmm2)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t" addpd(xmm5, xmm10)
"addpd %%xmm6, %%xmm14 \n\t" addpd(xmm6, xmm14)
"movaps %%xmm7, %%xmm6 \n\t" movaps(xmm7, xmm6)
"mulpd %%xmm0, %%xmm7 \n\t" mulpd(xmm0, xmm7)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
" \n\t"
"addq $4 * 4 * 8, %%rax \n\t" // a += 4*4 (unroll x mr) add(imm(4*4*8), rax) // a += 4*4 (unroll x mr)
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t" addpd(xmm2, xmm9)
"movaps 0 * 16(%%rbx), %%xmm2 \n\t" movaps(mem(rbx, 0*16), xmm2)
"addpd %%xmm4, %%xmm13 \n\t" addpd(xmm4, xmm13)
"movaps %%xmm3, %%xmm4 \n\t" movaps(xmm3, xmm4)
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t" pshufd(imm(0x4e), xmm3, xmm5)
"mulpd %%xmm0, %%xmm3 \n\t" mulpd(xmm0, xmm3)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr)
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t" addpd(xmm7, xmm8)
"addpd %%xmm6, %%xmm12 \n\t" addpd(xmm6, xmm12)
"movaps %%xmm5, %%xmm6 \n\t" movaps(xmm5, xmm6)
"mulpd %%xmm0, %%xmm5 \n\t" mulpd(xmm0, xmm5)
"movaps -8 * 16(%%rax), %%xmm0 \n\t" movaps(mem(rax, -8*16), xmm0)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
"movaps -7 * 16(%%rax), %%xmm1 \n\t" movaps(mem(rax, -7*16), xmm1)
" \n\t"
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1; dec(rsi) // i -= 1;
"jne .LOOPKITER \n\t" // iterate again if i != 0. jne(.LOOPKITER) // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".CONSIDERKLEFT: \n\t" label(.CONSIDERKLEFT)
" \n\t"
"movq %1, %%rsi \n\t" // i = k_left; mov(%1, rsi) // i = k_left;
"testq %%rsi, %%rsi \n\t" // check i via logical AND. test(rsi, rsi) // check i via logical AND.
"je .POSTACCUM \n\t" // if i == 0, we're done; jump to end. je(.POSTACCUM) // if i == 0, we're done; jump to end.
" \n\t" // else, we prepare to enter k_left loop. // else, we prepare to enter k_left loop.
" \n\t"
" \n\t"
".LOOPKLEFT: \n\t" // EDGE LOOP label(.LOOPKLEFT) // EDGE LOOP
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 0 addpd(xmm3, xmm11) // iteration 0
"movaps -7 * 16(%%rbx), %%xmm3 \n\t" movaps(mem(rbx, -7*16), xmm3)
"addpd %%xmm4, %%xmm15 \n\t" addpd(xmm4, xmm15)
"movaps %%xmm2, %%xmm4 \n\t" movaps(xmm2, xmm4)
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t" pshufd(imm(0x4e), xmm2, xmm7)
"mulpd %%xmm0, %%xmm2 \n\t" mulpd(xmm0, xmm2)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t" addpd(xmm5, xmm10)
"addpd %%xmm6, %%xmm14 \n\t" addpd(xmm6, xmm14)
"movaps %%xmm7, %%xmm6 \n\t" movaps(xmm7, xmm6)
"mulpd %%xmm0, %%xmm7 \n\t" mulpd(xmm0, xmm7)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t" addpd(xmm2, xmm9)
"movaps -6 * 16(%%rbx), %%xmm2 \n\t" movaps(mem(rbx, -6*16), xmm2)
"addpd %%xmm4, %%xmm13 \n\t" addpd(xmm4, xmm13)
"movaps %%xmm3, %%xmm4 \n\t" movaps(xmm3, xmm4)
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t" pshufd(imm(0x4e), xmm3, xmm5)
"mulpd %%xmm0, %%xmm3 \n\t" mulpd(xmm0, xmm3)
"mulpd %%xmm1, %%xmm4 \n\t" mulpd(xmm1, xmm4)
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t" addpd(xmm7, xmm8)
"addpd %%xmm6, %%xmm12 \n\t" addpd(xmm6, xmm12)
"movaps %%xmm5, %%xmm6 \n\t" movaps(xmm5, xmm6)
"mulpd %%xmm0, %%xmm5 \n\t" mulpd(xmm0, xmm5)
"movaps -6 * 16(%%rax), %%xmm0 \n\t" movaps(mem(rax, -6*16), xmm0)
"mulpd %%xmm1, %%xmm6 \n\t" mulpd(xmm1, xmm6)
"movaps -5 * 16(%%rax), %%xmm1 \n\t" movaps(mem(rax, -5*16), xmm1)
" \n\t"
" \n\t"
"addq $4 * 1 * 8, %%rax \n\t" // a += 4 (1 x mr) add(imm(4*1*8), rax) // a += 4 (1 x mr)
"addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) add(imm(4*1*8), rbx) // b += 4 (1 x nr)
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1; dec(rsi) // i -= 1;
"jne .LOOPKLEFT \n\t" // iterate again if i != 0. jne(.LOOPKLEFT) // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".POSTACCUM: \n\t" label(.POSTACCUM)
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" addpd(xmm3, xmm11)
"addpd %%xmm4, %%xmm15 \n\t" addpd(xmm4, xmm15)
"addpd %%xmm5, %%xmm10 \n\t" addpd(xmm5, xmm10)
"addpd %%xmm6, %%xmm14 \n\t" addpd(xmm6, xmm14)
" \n\t"
" \n\t"
" \n\t"
"movq %5, %%rbx \n\t" // load address of b11. mov(%5, rbx) // load address of b11.
" \n\t"
" \n\t" // xmm8: xmm9: xmm10: xmm11: // xmm8: xmm9: xmm10: xmm11:
" \n\t" // ( ab01 ( ab00 ( ab03 ( ab02 // ( ab01 ( ab00 ( ab03 ( ab02
" \n\t" // ab10 ) ab11 ) ab12 ) ab13 ) // ab10 ) ab11 ) ab12 ) ab13 )
" \n\t" // //
" \n\t" // xmm12: xmm13: xmm14: xmm15: // xmm12: xmm13: xmm14: xmm15:
" \n\t" // ( ab21 ( ab20 ( ab23 ( ab22 // ( ab21 ( ab20 ( ab23 ( ab22
" \n\t" // ab30 ) ab31 ) ab32 ) ab33 ) // ab30 ) ab31 ) ab32 ) ab33 )
"movaps %%xmm9, %%xmm0 \n\t" movaps(xmm9, xmm0)
"movaps %%xmm8, %%xmm1 \n\t" movaps(xmm8, xmm1)
"unpcklpd %%xmm8, %%xmm0 \n\t" unpcklpd(xmm8, xmm0)
"unpckhpd %%xmm9, %%xmm1 \n\t" unpckhpd(xmm9, xmm1)
" \n\t"
"movaps %%xmm11, %%xmm4 \n\t" movaps(xmm11, xmm4)
"movaps %%xmm10, %%xmm5 \n\t" movaps(xmm10, xmm5)
"unpcklpd %%xmm10, %%xmm4 \n\t" unpcklpd(xmm10, xmm4)
"unpckhpd %%xmm11, %%xmm5 \n\t" unpckhpd(xmm11, xmm5)
" \n\t"
"movaps %%xmm13, %%xmm2 \n\t" movaps(xmm13, xmm2)
"movaps %%xmm12, %%xmm3 \n\t" movaps(xmm12, xmm3)
"unpcklpd %%xmm12, %%xmm2 \n\t" unpcklpd(xmm12, xmm2)
"unpckhpd %%xmm13, %%xmm3 \n\t" unpckhpd(xmm13, xmm3)
" \n\t"
"movaps %%xmm15, %%xmm6 \n\t" movaps(xmm15, xmm6)
"movaps %%xmm14, %%xmm7 \n\t" movaps(xmm14, xmm7)
"unpcklpd %%xmm14, %%xmm6 \n\t" unpcklpd(xmm14, xmm6)
"unpckhpd %%xmm15, %%xmm7 \n\t" unpckhpd(xmm15, xmm7)
" \n\t"
" \n\t" // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 ) // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 )
" \n\t" // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 ) // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 )
" \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
" \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
" \n\t"
"movq %9, %%rax \n\t" // load address of alpha mov(%9, rax) // load address of alpha
"movddup (%%rax), %%xmm15 \n\t" // load alpha and duplicate movddup(mem(rax), xmm15) // load alpha and duplicate
" \n\t"
"movaps 0 * 16(%%rbx), %%xmm8 \n\t" movaps(mem(rbx, 0*16), xmm8)
"movaps 1 * 16(%%rbx), %%xmm12 \n\t" movaps(mem(rbx, 1*16), xmm12)
"mulpd %%xmm15, %%xmm8 \n\t" // xmm8 = alpha * ( beta00 beta01 ) mulpd(xmm15, xmm8) // xmm8 = alpha * ( beta00 beta01 )
"mulpd %%xmm15, %%xmm12 \n\t" // xmm12 = alpha * ( beta02 beta03 ) mulpd(xmm15, xmm12) // xmm12 = alpha * ( beta02 beta03 )
"movaps 2 * 16(%%rbx), %%xmm9 \n\t" movaps(mem(rbx, 2*16), xmm9)
"movaps 3 * 16(%%rbx), %%xmm13 \n\t" movaps(mem(rbx, 3*16), xmm13)
"mulpd %%xmm15, %%xmm9 \n\t" // xmm9 = alpha * ( beta10 beta11 ) mulpd(xmm15, xmm9) // xmm9 = alpha * ( beta10 beta11 )
"mulpd %%xmm15, %%xmm13 \n\t" // xmm13 = alpha * ( beta12 beta13 ) mulpd(xmm15, xmm13) // xmm13 = alpha * ( beta12 beta13 )
"movaps 4 * 16(%%rbx), %%xmm10 \n\t" movaps(mem(rbx, 4*16), xmm10)
"movaps 5 * 16(%%rbx), %%xmm14 \n\t" movaps(mem(rbx, 5*16), xmm14)
"mulpd %%xmm15, %%xmm10 \n\t" // xmm10 = alpha * ( beta20 beta21 ) mulpd(xmm15, xmm10) // xmm10 = alpha * ( beta20 beta21 )
"mulpd %%xmm15, %%xmm14 \n\t" // xmm14 = alpha * ( beta22 beta23 ) mulpd(xmm15, xmm14) // xmm14 = alpha * ( beta22 beta23 )
"movaps 6 * 16(%%rbx), %%xmm11 \n\t" movaps(mem(rbx, 6*16), xmm11)
"mulpd %%xmm15, %%xmm11 \n\t" // xmm11 = alpha * ( beta30 beta31 ) mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 )
"mulpd 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = alpha * ( beta32 beta33 ) mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 )
" \n\t"
" \n\t" // (Now scaled by alpha:) // (Now scaled by alpha:)
" \n\t" // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 )
" \n\t" // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 )
" \n\t" // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 )
" \n\t" // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 ) // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 )
" \n\t"
"subpd %%xmm0, %%xmm8 \n\t" // xmm8 -= xmm0 subpd(xmm0, xmm8) // xmm8 -= xmm0
"subpd %%xmm1, %%xmm9 \n\t" // xmm9 -= xmm1 subpd(xmm1, xmm9) // xmm9 -= xmm1
"subpd %%xmm2, %%xmm10 \n\t" // xmm10 -= xmm2 subpd(xmm2, xmm10) // xmm10 -= xmm2
"subpd %%xmm3, %%xmm11 \n\t" // xmm11 -= xmm3 subpd(xmm3, xmm11) // xmm11 -= xmm3
"subpd %%xmm4, %%xmm12 \n\t" // xmm12 -= xmm4 subpd(xmm4, xmm12) // xmm12 -= xmm4
"subpd %%xmm5, %%xmm13 \n\t" // xmm13 -= xmm5 subpd(xmm5, xmm13) // xmm13 -= xmm5
"subpd %%xmm6, %%xmm14 \n\t" // xmm14 -= xmm6 subpd(xmm6, xmm14) // xmm14 -= xmm6
"subpd %%xmm7, %%xmm15 \n\t" // xmm15 -= xmm7 subpd(xmm7, xmm15) // xmm15 -= xmm7
" \n\t"
" \n\t"
" \n\t"
".TRSM: \n\t" label(.TRSM)
" \n\t"
" \n\t"
"movq %3, %%rax \n\t" // load address of a11 mov(%3, rax) // load address of a11
"movq %6, %%rcx \n\t" // load address of c11 mov(%6, rcx) // load address of c11
" \n\t"
"movq %7, %%rsi \n\t" // load rs_c mov(%7, rsi) // load rs_c
"movq %8, %%rdi \n\t" // load cs_c mov(%8, rdi) // load cs_c
"salq $3, %%rsi \n\t" // rs_c *= sizeof( double ) sal(imm(3), rsi) // rs_c *= sizeof( double )
"salq $3, %%rdi \n\t" // cs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double )
" \n\t"
"addq %%rsi, %%rcx \n\t" // c11 += (4-1)*rs_c add(rsi, rcx) // c11 += (4-1)*rs_c
"addq %%rsi, %%rcx \n\t" add(rsi, rcx)
"addq %%rsi, %%rcx \n\t" add(rsi, rcx)
"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // c11_2 = c11 + 2*cs_c; lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c;
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 0 // iteration 0
" \n\t"
"movddup (3+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = (1/alpha33) movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33)
" \n\t"
"mulpd %%xmm3, %%xmm11 \n\t" // xmm11 *= (1/alpha33); mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
"mulpd %%xmm3, %%xmm15 \n\t" // xmm15 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
" \n\t"
"movaps %%xmm11, 6 * 16(%%rbx) \n\t" // store ( beta30 beta31 ) = xmm11 movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
"movaps %%xmm15, 7 * 16(%%rbx) \n\t" // store ( beta32 beta33 ) = xmm15 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
"movlpd %%xmm11, (%%rcx) \n\t" // store ( gamma30 ) = xmm11[0] movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
"movhpd %%xmm11, (%%rcx,%%rdi) \n\t" // store ( gamma31 ) = xmm11[1] movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1]
"movlpd %%xmm15, (%%rdx) \n\t" // store ( gamma32 ) = xmm15[0] movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0]
"movhpd %%xmm15, (%%rdx,%%rdi) \n\t" // store ( gamma33 ) = xmm15[1] movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
"subq %%rsi, %%rcx \n\t" // c11 -= rs_c sub(rsi, rcx) // c11 -= rs_c
"subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 1 // iteration 1
" \n\t"
"movddup (2+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = (1/alpha22) movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22)
"movddup (2+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha23 movddup(mem(2+3*4)*8(rax), xmm3) // load xmm3 = alpha23
" \n\t"
"movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3 movaps(xmm3, xmm7) // xmm7 = xmm3
"mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha23 * ( beta30 beta31 ) mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 )
"mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha23 * ( beta32 beta33 ) mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 )
"subpd %%xmm3, %%xmm10 \n\t" // xmm10 -= xmm3 subpd(xmm3, xmm10) // xmm10 -= xmm3
"subpd %%xmm7, %%xmm14 \n\t" // xmm14 -= xmm7 subpd(xmm7, xmm14) // xmm14 -= xmm7
"mulpd %%xmm2, %%xmm10 \n\t" // xmm10 *= (1/alpha22); mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22);
"mulpd %%xmm2, %%xmm14 \n\t" // xmm14 *= (1/alpha22); mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22);
" \n\t"
"movaps %%xmm10, 4 * 16(%%rbx) \n\t" // store ( beta20 beta21 ) = xmm10 movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
"movaps %%xmm14, 5 * 16(%%rbx) \n\t" // store ( beta22 beta23 ) = xmm14 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
"movlpd %%xmm10, (%%rcx) \n\t" // store ( gamma20 ) = xmm10[0] movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
"movhpd %%xmm10, (%%rcx,%%rdi) \n\t" // store ( gamma21 ) = xmm10[1] movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1]
"movlpd %%xmm14, (%%rdx) \n\t" // store ( gamma22 ) = xmm14[0] movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0]
"movhpd %%xmm14, (%%rdx,%%rdi) \n\t" // store ( gamma23 ) = xmm14[1] movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
"subq %%rsi, %%rcx \n\t" // c11 -= rs_c sub(rsi, rcx) // c11 -= rs_c
"subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 2 // iteration 2
" \n\t"
"movddup (1+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = (1/alpha11) movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11)
"movddup (1+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha12 movddup(mem(1+2*4)*8(rax), xmm2) // load xmm2 = alpha12
"movddup (1+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha13 movddup(mem(1+3*4)*8(rax), xmm3) // load xmm3 = alpha13
" \n\t"
"movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2 movaps(xmm2, xmm6) // xmm6 = xmm2
"movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3 movaps(xmm3, xmm7) // xmm7 = xmm3
"mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha12 * ( beta20 beta21 ) mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 )
"mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha12 * ( beta22 beta23 ) mulpd(xmm14, xmm6) // xmm6 = alpha12 * ( beta22 beta23 )
"mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha13 * ( beta30 beta31 ) mulpd(xmm11, xmm3) // xmm3 = alpha13 * ( beta30 beta31 )
"mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha13 * ( beta32 beta33 ) mulpd(xmm15, xmm7) // xmm7 = alpha13 * ( beta32 beta33 )
"addpd %%xmm3, %%xmm2 \n\t" // xmm2 += xmm3; addpd(xmm3, xmm2) // xmm2 += xmm3;
"addpd %%xmm7, %%xmm6 \n\t" // xmm6 += xmm7; addpd(xmm7, xmm6) // xmm6 += xmm7;
"subpd %%xmm2, %%xmm9 \n\t" // xmm9 -= xmm2 subpd(xmm2, xmm9) // xmm9 -= xmm2
"subpd %%xmm6, %%xmm13 \n\t" // xmm13 -= xmm6 subpd(xmm6, xmm13) // xmm13 -= xmm6
"mulpd %%xmm1, %%xmm9 \n\t" // xmm9 *= (1/alpha11); mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11);
"mulpd %%xmm1, %%xmm13 \n\t" // xmm13 *= (1/alpha11); mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11);
" \n\t"
"movaps %%xmm9, 2 * 16(%%rbx) \n\t" // store ( beta10 beta11 ) = xmm9 movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
"movaps %%xmm13, 3 * 16(%%rbx) \n\t" // store ( beta12 beta13 ) = xmm13 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
"movlpd %%xmm9, (%%rcx) \n\t" // store ( gamma10 ) = xmm9[0] movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
"movhpd %%xmm9, (%%rcx,%%rdi) \n\t" // store ( gamma11 ) = xmm9[1] movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1]
"movlpd %%xmm13, (%%rdx) \n\t" // store ( gamma12 ) = xmm13[0] movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0]
"movhpd %%xmm13, (%%rdx,%%rdi) \n\t" // store ( gamma13 ) = xmm13[1] movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
"subq %%rsi, %%rcx \n\t" // c11 -= rs_c sub(rsi, rcx) // c11 -= rs_c
"subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 3 // iteration 3
" \n\t"
"movddup (0+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = (1/alpha00) movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00)
"movddup (0+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha01 movddup(mem(0+1*4)*8(rax), xmm1) // load xmm1 = alpha01
"movddup (0+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha02 movddup(mem(0+2*4)*8(rax), xmm2) // load xmm2 = alpha02
"movddup (0+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha03 movddup(mem(0+3*4)*8(rax), xmm3) // load xmm3 = alpha03
" \n\t"
"movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1 movaps(xmm1, xmm5) // xmm5 = xmm1
"movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2 movaps(xmm2, xmm6) // xmm6 = xmm2
"movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3 movaps(xmm3, xmm7) // xmm7 = xmm3
"mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha01 * ( beta10 beta11 ) mulpd(xmm9, xmm1) // xmm1 = alpha01 * ( beta10 beta11 )
"mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha01 * ( beta12 beta13 ) mulpd(xmm13, xmm5) // xmm5 = alpha01 * ( beta12 beta13 )
"mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha02 * ( beta20 beta21 ) mulpd(xmm10, xmm2) // xmm2 = alpha02 * ( beta20 beta21 )
"mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha02 * ( beta22 beta23 ) mulpd(xmm14, xmm6) // xmm6 = alpha02 * ( beta22 beta23 )
"mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha03 * ( beta30 beta31 ) mulpd(xmm11, xmm3) // xmm3 = alpha03 * ( beta30 beta31 )
"mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha03 * ( beta32 beta33 ) mulpd(xmm15, xmm7) // xmm7 = alpha03 * ( beta32 beta33 )
"addpd %%xmm2, %%xmm1 \n\t" // xmm1 += xmm2; addpd(xmm2, xmm1) // xmm1 += xmm2;
"addpd %%xmm6, %%xmm5 \n\t" // xmm5 += xmm6; addpd(xmm6, xmm5) // xmm5 += xmm6;
"addpd %%xmm3, %%xmm1 \n\t" // xmm1 += xmm3; addpd(xmm3, xmm1) // xmm1 += xmm3;
"addpd %%xmm7, %%xmm5 \n\t" // xmm5 += xmm7; addpd(xmm7, xmm5) // xmm5 += xmm7;
"subpd %%xmm1, %%xmm8 \n\t" // xmm8 -= xmm1 subpd(xmm1, xmm8) // xmm8 -= xmm1
"subpd %%xmm5, %%xmm12 \n\t" // xmm12 -= xmm5 subpd(xmm5, xmm12) // xmm12 -= xmm5
"mulpd %%xmm0, %%xmm8 \n\t" // xmm8 *= (1/alpha00); mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00);
"mulpd %%xmm0, %%xmm12 \n\t" // xmm12 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
" \n\t"
"movaps %%xmm8, 0 * 16(%%rbx) \n\t" // store ( beta00 beta01 ) = xmm8 movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
"movaps %%xmm12, 1 * 16(%%rbx) \n\t" // store ( beta02 beta03 ) = xmm12 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
"movlpd %%xmm8, (%%rcx) \n\t" // store ( gamma00 ) = xmm8[0] movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
"movhpd %%xmm8, (%%rcx,%%rdi) \n\t" // store ( gamma01 ) = xmm8[1] movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1]
"movlpd %%xmm12, (%%rdx) \n\t" // store ( gamma02 ) = xmm12[0] movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0]
"movhpd %%xmm12, (%%rdx,%%rdi) \n\t" // store ( gamma03 ) = xmm12[1] movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
" \n\t"
" \n\t"
" \n\t"
: // output operands (none) : // output operands (none)
: // input operands : // input operands
@@ -526,3 +529,4 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
} }

View File

@@ -34,6 +34,9 @@
#include "blis.h" #include "blis.h"
#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"
#if 0 #if 0
void bli_strsm_l_penryn_asm_8x4 void bli_strsm_l_penryn_asm_8x4
( (
@@ -63,138 +66,138 @@ void bli_dtrsm_l_penryn_asm_4x4
__asm__ volatile __asm__ volatile
( (
" \n\t"
"movq %1, %%rbx \n\t" // load address of b11. mov(%1, rbx) // load address of b11.
" \n\t"
"movaps 0 * 16(%%rbx), %%xmm8 \n\t" // xmm8 = ( beta00 beta01 ) movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 )
"movaps 1 * 16(%%rbx), %%xmm12 \n\t" // xmm9 = ( beta02 beta03 ) movaps(mem(rbx, 1*16), xmm12) // xmm9 = ( beta02 beta03 )
"movaps 2 * 16(%%rbx), %%xmm9 \n\t" // xmm10 = ( beta10 beta11 ) movaps(mem(rbx, 2*16), xmm9) // xmm10 = ( beta10 beta11 )
"movaps 3 * 16(%%rbx), %%xmm13 \n\t" // xmm11 = ( beta12 beta13 ) movaps(mem(rbx, 3*16), xmm13) // xmm11 = ( beta12 beta13 )
"movaps 4 * 16(%%rbx), %%xmm10 \n\t" // xmm12 = ( beta20 beta21 ) movaps(mem(rbx, 4*16), xmm10) // xmm12 = ( beta20 beta21 )
"movaps 5 * 16(%%rbx), %%xmm14 \n\t" // xmm13 = ( beta22 beta23 ) movaps(mem(rbx, 5*16), xmm14) // xmm13 = ( beta22 beta23 )
"movaps 6 * 16(%%rbx), %%xmm11 \n\t" // xmm14 = ( beta30 beta31 ) movaps(mem(rbx, 6*16), xmm11) // xmm14 = ( beta30 beta31 )
"movaps 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = ( beta32 beta33 ) movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 )
" \n\t"
" \n\t"
" \n\t"
"movq %0, %%rax \n\t" // load address of a11 mov(%0, rax) // load address of a11
"movq %2, %%rcx \n\t" // load address of c11 mov(%2, rcx) // load address of c11
" \n\t"
"movq %3, %%rsi \n\t" // load rs_c mov(%3, rsi) // load rs_c
"movq %4, %%rdi \n\t" // load cs_c mov(%4, rdi) // load cs_c
"salq $3, %%rsi \n\t" // rs_c *= sizeof( double ) sal(imm(3), rsi) // rs_c *= sizeof( double )
"salq $3, %%rdi \n\t" // cs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double )
" \n\t"
"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // c11_2 = c11 + 2*cs_c lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 0 // iteration 0
" \n\t"
"movddup (0+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = (1/alpha00) movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00)
" \n\t"
"mulpd %%xmm0, %%xmm8 \n\t" // xmm8 *= (1/alpha00); mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00);
"mulpd %%xmm0, %%xmm12 \n\t" // xmm12 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
" \n\t"
"movaps %%xmm8, 0 * 16(%%rbx) \n\t" // store ( beta00 beta01 ) = xmm8 movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
"movaps %%xmm12, 1 * 16(%%rbx) \n\t" // store ( beta02 beta03 ) = xmm12 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
"movlpd %%xmm8, (%%rcx) \n\t" // store ( gamma00 ) = xmm8[0] movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
"movhpd %%xmm8, (%%rcx,%%rdi) \n\t" // store ( gamma01 ) = xmm8[1] movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1]
"movlpd %%xmm12, (%%rdx) \n\t" // store ( gamma02 ) = xmm12[0] movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0]
"movhpd %%xmm12, (%%rdx,%%rdi) \n\t" // store ( gamma03 ) = xmm12[1] movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
"addq %%rsi, %%rcx \n\t" // c11 += rs_c add(rsi, rcx) // c11 += rs_c
"addq %%rsi, %%rdx \n\t" // c11_2 += rs_c add(rsi, rdx) // c11_2 += rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 1 // iteration 1
" \n\t"
"movddup (1+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha10 movddup(mem(1+0*4)*8(rax), xmm0) // load xmm0 = alpha10
"movddup (1+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = (1/alpha11) movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11)
" \n\t"
"movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0 movaps(xmm0, xmm4) // xmm4 = xmm0
"mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha10 * ( beta00 beta01 ) mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 )
"mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha10 * ( beta02 beta03 ) mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 )
"subpd %%xmm0, %%xmm9 \n\t" // xmm9 -= xmm0 subpd(xmm0, xmm9) // xmm9 -= xmm0
"subpd %%xmm4, %%xmm13 \n\t" // xmm13 -= xmm4 subpd(xmm4, xmm13) // xmm13 -= xmm4
"mulpd %%xmm1, %%xmm9 \n\t" // xmm9 *= (1/alpha11); mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11);
"mulpd %%xmm1, %%xmm13 \n\t" // xmm13 *= (1/alpha11); mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11);
" \n\t"
"movaps %%xmm9, 2 * 16(%%rbx) \n\t" // store ( beta10 beta11 ) = xmm9 movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
"movaps %%xmm13, 3 * 16(%%rbx) \n\t" // store ( beta12 beta13 ) = xmm13 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
"movlpd %%xmm9, (%%rcx) \n\t" // store ( gamma10 ) = xmm9[0] movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
"movhpd %%xmm9, (%%rcx,%%rdi) \n\t" // store ( gamma11 ) = xmm9[1] movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1]
"movlpd %%xmm13, (%%rdx) \n\t" // store ( gamma12 ) = xmm13[0] movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0]
"movhpd %%xmm13, (%%rdx,%%rdi) \n\t" // store ( gamma13 ) = xmm13[1] movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
"addq %%rsi, %%rcx \n\t" // c11 += rs_c add(rsi, rcx) // c11 += rs_c
"addq %%rsi, %%rdx \n\t" // c11_2 += rs_c add(rsi, rdx) // c11_2 += rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 2 // iteration 2
" \n\t"
"movddup (2+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha20 movddup(mem(2+0*4)*8(rax), xmm0) // load xmm0 = alpha20
"movddup (2+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha21 movddup(mem(2+1*4)*8(rax), xmm1) // load xmm1 = alpha21
"movddup (2+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = (1/alpha22) movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22)
" \n\t"
"movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0 movaps(xmm0, xmm4) // xmm4 = xmm0
"movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1 movaps(xmm1, xmm5) // xmm5 = xmm1
"mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha20 * ( beta00 beta01 ) mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 )
"mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha20 * ( beta02 beta03 ) mulpd(xmm12, xmm4) // xmm4 = alpha20 * ( beta02 beta03 )
"mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha21 * ( beta10 beta11 ) mulpd(xmm9, xmm1) // xmm1 = alpha21 * ( beta10 beta11 )
"mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha21 * ( beta12 beta13 ) mulpd(xmm13, xmm5) // xmm5 = alpha21 * ( beta12 beta13 )
"addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1; addpd(xmm1, xmm0) // xmm0 += xmm1;
"addpd %%xmm5, %%xmm4 \n\t" // xmm4 += xmm5; addpd(xmm5, xmm4) // xmm4 += xmm5;
"subpd %%xmm0, %%xmm10 \n\t" // xmm10 -= xmm0 subpd(xmm0, xmm10) // xmm10 -= xmm0
"subpd %%xmm4, %%xmm14 \n\t" // xmm14 -= xmm4 subpd(xmm4, xmm14) // xmm14 -= xmm4
"mulpd %%xmm2, %%xmm10 \n\t" // xmm10 *= (1/alpha22); mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22);
"mulpd %%xmm2, %%xmm14 \n\t" // xmm14 *= (1/alpha22); mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22);
" \n\t"
"movaps %%xmm10, 4 * 16(%%rbx) \n\t" // store ( beta20 beta21 ) = xmm10 movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
"movaps %%xmm14, 5 * 16(%%rbx) \n\t" // store ( beta22 beta23 ) = xmm14 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
"movlpd %%xmm10, (%%rcx) \n\t" // store ( gamma20 ) = xmm10[0] movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
"movhpd %%xmm10, (%%rcx,%%rdi) \n\t" // store ( gamma21 ) = xmm10[1] movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1]
"movlpd %%xmm14, (%%rdx) \n\t" // store ( gamma22 ) = xmm14[0] movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0]
"movhpd %%xmm14, (%%rdx,%%rdi) \n\t" // store ( gamma23 ) = xmm14[1] movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
"addq %%rsi, %%rcx \n\t" // c11 += rs_c add(rsi, rcx) // c11 += rs_c
"addq %%rsi, %%rdx \n\t" // c11_2 += rs_c add(rsi, rdx) // c11_2 += rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 3 // iteration 3
" \n\t"
"movddup (3+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha30 movddup(mem(3+0*4)*8(rax), xmm0) // load xmm0 = alpha30
"movddup (3+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha31 movddup(mem(3+1*4)*8(rax), xmm1) // load xmm1 = alpha31
"movddup (3+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha32 movddup(mem(3+2*4)*8(rax), xmm2) // load xmm2 = alpha32
"movddup (3+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = (1/alpha33) movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33)
" \n\t"
"movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0 movaps(xmm0, xmm4) // xmm4 = xmm0
"movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1 movaps(xmm1, xmm5) // xmm5 = xmm1
"movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2 movaps(xmm2, xmm6) // xmm6 = xmm2
"mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha30 * ( beta00 beta01 ) mulpd(xmm8, xmm0) // xmm0 = alpha30 * ( beta00 beta01 )
"mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha30 * ( beta02 beta03 ) mulpd(xmm12, xmm4) // xmm4 = alpha30 * ( beta02 beta03 )
"mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha31 * ( beta10 beta11 ) mulpd(xmm9, xmm1) // xmm1 = alpha31 * ( beta10 beta11 )
"mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha31 * ( beta12 beta13 ) mulpd(xmm13, xmm5) // xmm5 = alpha31 * ( beta12 beta13 )
"mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha32 * ( beta20 beta21 ) mulpd(xmm10, xmm2) // xmm2 = alpha32 * ( beta20 beta21 )
"mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha32 * ( beta22 beta23 ) mulpd(xmm14, xmm6) // xmm6 = alpha32 * ( beta22 beta23 )
"addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1; addpd(xmm1, xmm0) // xmm0 += xmm1;
"addpd %%xmm5, %%xmm4 \n\t" // xmm4 += xmm5; addpd(xmm5, xmm4) // xmm4 += xmm5;
"addpd %%xmm2, %%xmm0 \n\t" // xmm0 += xmm2; addpd(xmm2, xmm0) // xmm0 += xmm2;
"addpd %%xmm6, %%xmm4 \n\t" // xmm4 += xmm6; addpd(xmm6, xmm4) // xmm4 += xmm6;
"subpd %%xmm0, %%xmm11 \n\t" // xmm11 -= xmm0 subpd(xmm0, xmm11) // xmm11 -= xmm0
"subpd %%xmm4, %%xmm15 \n\t" // xmm15 -= xmm4 subpd(xmm4, xmm15) // xmm15 -= xmm4
"mulpd %%xmm3, %%xmm11 \n\t" // xmm11 *= (1/alpha33); mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
"mulpd %%xmm3, %%xmm15 \n\t" // xmm15 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
" \n\t"
"movaps %%xmm11, 6 * 16(%%rbx) \n\t" // store ( beta30 beta31 ) = xmm11 movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
"movaps %%xmm15, 7 * 16(%%rbx) \n\t" // store ( beta32 beta33 ) = xmm15 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
"movlpd %%xmm11, (%%rcx) \n\t" // store ( gamma30 ) = xmm11[0] movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
"movhpd %%xmm11, (%%rcx,%%rdi) \n\t" // store ( gamma31 ) = xmm11[1] movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1]
"movlpd %%xmm15, (%%rdx) \n\t" // store ( gamma32 ) = xmm15[0] movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0]
"movhpd %%xmm15, (%%rdx,%%rdi) \n\t" // store ( gamma33 ) = xmm15[1] movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
" \n\t"
" \n\t"
" \n\t"
: // output operands (none) : // output operands (none)
: // input operands : // input operands
@@ -214,3 +217,4 @@ void bli_dtrsm_l_penryn_asm_4x4
} }

View File

@@ -34,6 +34,9 @@
#include "blis.h" #include "blis.h"
#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"
#if 0 #if 0
void bli_strsm_u_penryn_asm_8x4 void bli_strsm_u_penryn_asm_8x4
( (
@@ -63,141 +66,141 @@ void bli_dtrsm_u_penryn_asm_4x4
__asm__ volatile __asm__ volatile
( (
" \n\t"
"movq %1, %%rbx \n\t" // load address of b11. mov(%1, rbx) // load address of b11.
" \n\t"
"movaps 0 * 16(%%rbx), %%xmm8 \n\t" // xmm8 = ( beta00 beta01 ) movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 )
"movaps 1 * 16(%%rbx), %%xmm12 \n\t" // xmm9 = ( beta02 beta03 ) movaps(mem(rbx, 1*16), xmm12) // xmm9 = ( beta02 beta03 )
"movaps 2 * 16(%%rbx), %%xmm9 \n\t" // xmm10 = ( beta10 beta11 ) movaps(mem(rbx, 2*16), xmm9) // xmm10 = ( beta10 beta11 )
"movaps 3 * 16(%%rbx), %%xmm13 \n\t" // xmm11 = ( beta12 beta13 ) movaps(mem(rbx, 3*16), xmm13) // xmm11 = ( beta12 beta13 )
"movaps 4 * 16(%%rbx), %%xmm10 \n\t" // xmm12 = ( beta20 beta21 ) movaps(mem(rbx, 4*16), xmm10) // xmm12 = ( beta20 beta21 )
"movaps 5 * 16(%%rbx), %%xmm14 \n\t" // xmm13 = ( beta22 beta23 ) movaps(mem(rbx, 5*16), xmm14) // xmm13 = ( beta22 beta23 )
"movaps 6 * 16(%%rbx), %%xmm11 \n\t" // xmm14 = ( beta30 beta31 ) movaps(mem(rbx, 6*16), xmm11) // xmm14 = ( beta30 beta31 )
"movaps 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = ( beta32 beta33 ) movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 )
" \n\t"
" \n\t"
" \n\t"
"movq %0, %%rax \n\t" // load address of a11 mov(%0, rax) // load address of a11
"movq %2, %%rcx \n\t" // load address of c11 mov(%2, rcx) // load address of c11
" \n\t"
"movq %3, %%rsi \n\t" // load rs_c mov(%3, rsi) // load rs_c
"movq %4, %%rdi \n\t" // load cs_c mov(%4, rdi) // load cs_c
"salq $3, %%rsi \n\t" // rs_c *= sizeof( double ) sal(imm(3), rsi) // rs_c *= sizeof( double )
"salq $3, %%rdi \n\t" // cs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double )
" \n\t"
"addq %%rsi, %%rcx \n\t" // c11 += (4-1)*rs_c add(rsi, rcx) // c11 += (4-1)*rs_c
"addq %%rsi, %%rcx \n\t" add(rsi, rcx)
"addq %%rsi, %%rcx \n\t" add(rsi, rcx)
"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // c11_2 = c11 + 2*cs_c; lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c;
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 0 // iteration 0
" \n\t"
"movddup (3+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = (1/alpha33) movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33)
" \n\t"
"mulpd %%xmm3, %%xmm11 \n\t" // xmm11 *= (1/alpha33); mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
"mulpd %%xmm3, %%xmm15 \n\t" // xmm15 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
" \n\t"
"movaps %%xmm11, 6 * 16(%%rbx) \n\t" // store ( beta30 beta31 ) = xmm11 movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
"movaps %%xmm15, 7 * 16(%%rbx) \n\t" // store ( beta32 beta33 ) = xmm15 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
"movlpd %%xmm11, (%%rcx) \n\t" // store ( gamma30 ) = xmm11[0] movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
"movhpd %%xmm11, (%%rcx,%%rdi) \n\t" // store ( gamma31 ) = xmm11[1] movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1]
"movlpd %%xmm15, (%%rdx) \n\t" // store ( gamma32 ) = xmm15[0] movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0]
"movhpd %%xmm15, (%%rdx,%%rdi) \n\t" // store ( gamma33 ) = xmm15[1] movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
"subq %%rsi, %%rcx \n\t" // c11 -= rs_c sub(rsi, rcx) // c11 -= rs_c
"subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 1 // iteration 1
" \n\t"
"movddup (2+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = (1/alpha22) movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22)
"movddup (2+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha23 movddup(mem(2+3*4)*8(rax), xmm3) // load xmm3 = alpha23
" \n\t"
"movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3 movaps(xmm3, xmm7) // xmm7 = xmm3
"mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha23 * ( beta30 beta31 ) mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 )
"mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha23 * ( beta32 beta33 ) mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 )
"subpd %%xmm3, %%xmm10 \n\t" // xmm10 -= xmm3 subpd(xmm3, xmm10) // xmm10 -= xmm3
"subpd %%xmm7, %%xmm14 \n\t" // xmm14 -= xmm7 subpd(xmm7, xmm14) // xmm14 -= xmm7
"mulpd %%xmm2, %%xmm10 \n\t" // xmm10 *= (1/alpha22); mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22);
"mulpd %%xmm2, %%xmm14 \n\t" // xmm14 *= (1/alpha22); mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22);
" \n\t"
"movaps %%xmm10, 4 * 16(%%rbx) \n\t" // store ( beta20 beta21 ) = xmm10 movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
"movaps %%xmm14, 5 * 16(%%rbx) \n\t" // store ( beta22 beta23 ) = xmm14 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
"movlpd %%xmm10, (%%rcx) \n\t" // store ( gamma20 ) = xmm10[0] movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
"movhpd %%xmm10, (%%rcx,%%rdi) \n\t" // store ( gamma21 ) = xmm10[1] movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1]
"movlpd %%xmm14, (%%rdx) \n\t" // store ( gamma22 ) = xmm14[0] movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0]
"movhpd %%xmm14, (%%rdx,%%rdi) \n\t" // store ( gamma23 ) = xmm14[1] movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
"subq %%rsi, %%rcx \n\t" // c11 -= rs_c sub(rsi, rcx) // c11 -= rs_c
"subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 2 // iteration 2
" \n\t"
"movddup (1+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = (1/alpha11) movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11)
"movddup (1+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha12 movddup(mem(1+2*4)*8(rax), xmm2) // load xmm2 = alpha12
"movddup (1+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha13 movddup(mem(1+3*4)*8(rax), xmm3) // load xmm3 = alpha13
" \n\t"
"movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2 movaps(xmm2, xmm6) // xmm6 = xmm2
"movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3 movaps(xmm3, xmm7) // xmm7 = xmm3
"mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha12 * ( beta20 beta21 ) mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 )
"mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha12 * ( beta22 beta23 ) mulpd(xmm14, xmm6) // xmm6 = alpha12 * ( beta22 beta23 )
"mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha13 * ( beta30 beta31 ) mulpd(xmm11, xmm3) // xmm3 = alpha13 * ( beta30 beta31 )
"mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha13 * ( beta32 beta33 ) mulpd(xmm15, xmm7) // xmm7 = alpha13 * ( beta32 beta33 )
"addpd %%xmm3, %%xmm2 \n\t" // xmm2 += xmm3; addpd(xmm3, xmm2) // xmm2 += xmm3;
"addpd %%xmm7, %%xmm6 \n\t" // xmm6 += xmm7; addpd(xmm7, xmm6) // xmm6 += xmm7;
"subpd %%xmm2, %%xmm9 \n\t" // xmm9 -= xmm2 subpd(xmm2, xmm9) // xmm9 -= xmm2
"subpd %%xmm6, %%xmm13 \n\t" // xmm13 -= xmm6 subpd(xmm6, xmm13) // xmm13 -= xmm6
"mulpd %%xmm1, %%xmm9 \n\t" // xmm9 *= (1/alpha11); mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11);
"mulpd %%xmm1, %%xmm13 \n\t" // xmm13 *= (1/alpha11); mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11);
" \n\t"
"movaps %%xmm9, 2 * 16(%%rbx) \n\t" // store ( beta10 beta11 ) = xmm9 movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
"movaps %%xmm13, 3 * 16(%%rbx) \n\t" // store ( beta12 beta13 ) = xmm13 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
"movlpd %%xmm9, (%%rcx) \n\t" // store ( gamma10 ) = xmm9[0] movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
"movhpd %%xmm9, (%%rcx,%%rdi) \n\t" // store ( gamma11 ) = xmm9[1] movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1]
"movlpd %%xmm13, (%%rdx) \n\t" // store ( gamma12 ) = xmm13[0] movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0]
"movhpd %%xmm13, (%%rdx,%%rdi) \n\t" // store ( gamma13 ) = xmm13[1] movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
"subq %%rsi, %%rcx \n\t" // c11 -= rs_c sub(rsi, rcx) // c11 -= rs_c
"subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 3 // iteration 3
" \n\t"
"movddup (0+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = (1/alpha00) movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00)
"movddup (0+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha01 movddup(mem(0+1*4)*8(rax), xmm1) // load xmm1 = alpha01
"movddup (0+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha02 movddup(mem(0+2*4)*8(rax), xmm2) // load xmm2 = alpha02
"movddup (0+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha03 movddup(mem(0+3*4)*8(rax), xmm3) // load xmm3 = alpha03
" \n\t"
"movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1 movaps(xmm1, xmm5) // xmm5 = xmm1
"movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2 movaps(xmm2, xmm6) // xmm6 = xmm2
"movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3 movaps(xmm3, xmm7) // xmm7 = xmm3
"mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha01 * ( beta10 beta11 ) mulpd(xmm9, xmm1) // xmm1 = alpha01 * ( beta10 beta11 )
"mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha01 * ( beta12 beta13 ) mulpd(xmm13, xmm5) // xmm5 = alpha01 * ( beta12 beta13 )
"mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha02 * ( beta20 beta21 ) mulpd(xmm10, xmm2) // xmm2 = alpha02 * ( beta20 beta21 )
"mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha02 * ( beta22 beta23 ) mulpd(xmm14, xmm6) // xmm6 = alpha02 * ( beta22 beta23 )
"mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha03 * ( beta30 beta31 ) mulpd(xmm11, xmm3) // xmm3 = alpha03 * ( beta30 beta31 )
"mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha03 * ( beta32 beta33 ) mulpd(xmm15, xmm7) // xmm7 = alpha03 * ( beta32 beta33 )
"addpd %%xmm2, %%xmm1 \n\t" // xmm1 += xmm2; addpd(xmm2, xmm1) // xmm1 += xmm2;
"addpd %%xmm6, %%xmm5 \n\t" // xmm5 += xmm6; addpd(xmm6, xmm5) // xmm5 += xmm6;
"addpd %%xmm3, %%xmm1 \n\t" // xmm1 += xmm3; addpd(xmm3, xmm1) // xmm1 += xmm3;
"addpd %%xmm7, %%xmm5 \n\t" // xmm5 += xmm7; addpd(xmm7, xmm5) // xmm5 += xmm7;
"subpd %%xmm1, %%xmm8 \n\t" // xmm8 -= xmm1 subpd(xmm1, xmm8) // xmm8 -= xmm1
"subpd %%xmm5, %%xmm12 \n\t" // xmm12 -= xmm5 subpd(xmm5, xmm12) // xmm12 -= xmm5
"mulpd %%xmm0, %%xmm8 \n\t" // xmm8 *= (1/alpha00); mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00);
"mulpd %%xmm0, %%xmm12 \n\t" // xmm12 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
" \n\t"
"movaps %%xmm8, 0 * 16(%%rbx) \n\t" // store ( beta00 beta01 ) = xmm8 movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
"movaps %%xmm12, 1 * 16(%%rbx) \n\t" // store ( beta02 beta03 ) = xmm12 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
"movlpd %%xmm8, (%%rcx) \n\t" // store ( gamma00 ) = xmm8[0] movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
"movhpd %%xmm8, (%%rcx,%%rdi) \n\t" // store ( gamma01 ) = xmm8[1] movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1]
"movlpd %%xmm12, (%%rdx) \n\t" // store ( gamma02 ) = xmm12[0] movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0]
"movhpd %%xmm12, (%%rdx,%%rdi) \n\t" // store ( gamma03 ) = xmm12[1] movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
" \n\t"
" \n\t"
" \n\t"
: // output operands (none) : // output operands (none)
: // input operands : // input operands
@@ -217,3 +220,4 @@ void bli_dtrsm_u_penryn_asm_4x4
} }

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -34,7 +34,8 @@
#include "blis.h" #include "blis.h"
#include "bli_avx512_macros.h" #define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"
#define A_L1_PREFETCH_DIST 4 //should be multiple of 2 #define A_L1_PREFETCH_DIST 4 //should be multiple of 2
@@ -305,8 +306,7 @@ void bli_dgemm_skx_asm_16x12_l2(
const int64_t rs_c = rs_c_; const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_; const int64_t cs_c = cs_c_;
__asm__ volatile BEGIN_ASM
(
VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
VMOVAPD(YMM( 7), YMM(8)) VMOVAPD(YMM( 7), YMM(8))
@@ -525,6 +525,7 @@ void bli_dgemm_skx_asm_16x12_l2(
VZEROUPPER() VZEROUPPER()
END_ASM(
: // output operands : // output operands
: // input operands : // input operands
[k] "m" (k), [k] "m" (k),
@@ -543,5 +544,5 @@ void bli_dgemm_skx_asm_16x12_l2(
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
"zmm30", "zmm31", "memory" "zmm30", "zmm31", "memory"
); )
} }

View File

@@ -34,7 +34,8 @@
#include "blis.h" #include "blis.h"
#include "bli_avx512_macros.h" #define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"
#define CACHELINE_SIZE 64 //size of cache line in bytes #define CACHELINE_SIZE 64 //size of cache line in bytes
@@ -335,8 +336,7 @@ void bli_sgemm_skx_asm_32x12_l2(
const int64_t rs_c = rs_c_; const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_; const int64_t cs_c = cs_c_;
__asm__ volatile BEGIN_ASM
(
VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
VMOVAPD(YMM( 7), YMM(8)) VMOVAPD(YMM( 7), YMM(8))
@@ -550,6 +550,7 @@ void bli_sgemm_skx_asm_32x12_l2(
VZEROUPPER() VZEROUPPER()
END_ASM(
: // output operands : // output operands
: // input operands : // input operands
[k] "m" (k), [k] "m" (k),
@@ -568,5 +569,5 @@ void bli_sgemm_skx_asm_32x12_l2(
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
"zmm30", "zmm31", "memory" "zmm30", "zmm31", "memory"
); )
} }

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff