Merge pull request #224 from devinamatthews/asm-macros

Asm macros
Committed by Devin Matthews on 2018-06-20 15:37:53 -05:00 (via GitHub)
20 changed files with 18702 additions and 17651 deletions


@@ -1,173 +0,0 @@
#ifndef BLIS_AVX512_MACROS_H
#define BLIS_AVX512_MACROS_H
//
// Assembly macros to make AVX-512 with AT&T syntax somewhat less painful
//
#define COMMENT_BEGIN "#"
#define COMMENT_END
#define STRINGIFY(...) #__VA_ARGS__
#define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t"
#define LABEL(label) STRINGIFY(label) ":\n\t"
#define XMM(x) %%xmm##x
#define YMM(x) %%ymm##x
#define ZMM(x) %%zmm##x
#define EAX %%eax
#define EBX %%ebx
#define ECX %%ecx
#define EDX %%edx
#define EBP %%ebp
#define EDI %%edi
#define ESI %%esi
#define RAX %%rax
#define RBX %%rbx
#define RCX %%rcx
#define RDX %%rdx
#define RBP %%rbp
#define RDI %%rdi
#define RSI %%rsi
#define K(x) %%k##x
#define R(x) %%r##x
#define R8 %%r8
#define R9 %%r9
#define R10 %%r10
#define R11 %%r11
#define R12 %%r12
#define R13 %%r13
#define R14 %%r14
#define R15 %%r15
#define RD(x) %%r##x##d
#define R8D %%r8d
#define R9D %%r9d
#define R10D %%r10d
#define R11D %%r11d
#define R12D %%r12d
#define R13D %%r13d
#define R14D %%r14d
#define R15D %%r15d
#define IMM(x) $##x
#define VAR(x) %[x]
#define MEM_4(reg,off,scale,disp) disp(reg,off,scale)
#define MEM_3(reg,off,scale) (reg,off,scale)
#define MEM_2(reg,disp) disp(reg)
#define MEM_1(reg) (reg)
#define MEM_1TO8_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to8%}
#define MEM_1TO8_3(reg,off,scale) MEM(reg,off,scale) %{1to8%}
#define MEM_1TO8_2(reg,disp) MEM(reg,disp) %{1to8%}
#define MEM_1TO8_1(reg) MEM(reg) %{1to8%}
#define MEM_1TO16_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to16%}
#define MEM_1TO16_3(reg,off,scale) MEM(reg,off,scale) %{1to16%}
#define MEM_1TO16_2(reg,disp) MEM(reg,disp) %{1to16%}
#define MEM_1TO16_1(reg) MEM(reg) %{1to16%}
#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME
#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1)(__VA_ARGS__)
#define MEM_1TO8(...) GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1)(__VA_ARGS__)
#define MEM_1TO16(...) GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1)(__VA_ARGS__)
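// (GET_MACRO dispatches on argument count: padding the argument list with
// the candidate names shifts the matching one into NAME, so MEM(...) picks
// MEM_1 through MEM_4 according to how many arguments were passed.)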
#define MASK_K(n) %{%%k##n%}
#define MASK_KZ(n) %{%%k##n%}%{z%}
#define KMOV(to,from) ASM(kmovw from, to)
#define JKNZD(kreg,label) \
ASM(kortestw kreg, kreg) \
ASM(jnz label)
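// (There is no direct "jump if mask register nonzero" instruction, so JKNZD
// emulates one: kortestw sets ZF iff the ORed mask bits are all zero, and
// jnz then branches whenever any lane bit is set.)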
#define KXNORW(_0, _1, _2) ASM(kxnorw _2, _1, _0)
#define KSHIFTRW(_0, _1, _2) ASM(kshiftrw _2, _1, _0)
#define ALIGN16 ASM(.p2align 4)
#define ALIGN32 ASM(.p2align 5)
#define RDTSC ASM(rdtsc)
#define MOV(_0, _1) ASM(mov _1, _0)
#define MOVD(_0, _1) ASM(movd _1, _0)
#define MOVL(_0, _1) ASM(movl _1, _0)
#define MOVQ(_0, _1) ASM(movq _1, _0)
#define VMOVD(_0, _1) ASM(vmovd _1, _0)
#define VMOVQ(_0, _1) ASM(vmovq _1, _0)
#define CMP(_0, _1) ASM(cmp _1, _0)
#define AND(_0, _1) ASM(and _1, _0)
#define ADD(_0, _1) ASM(add _1, _0)
#define SUB(_0, _1) ASM(sub _1, _0)
#define SAL(_0, _1) ASM(sal _1, _0)
#define SHLX(_0, _1, _2) ASM(shlx _2, _1, _0)
#define SAR(_0, _1) ASM(sar _1, _0)
#define SAL1(_0) ASM(sal _0)
#define SAR1(_0) ASM(sar _0)
#define LEA(_0, _1) ASM(lea _1, _0)
#define TEST(_0, _1) ASM(test _1, _0)
#define DEC(_0) ASM(dec _0)
#define JLE(_0) ASM(jle _0)
#define JL(_0) ASM(jl _0)
#define JNZ(_0) ASM(jnz _0)
#define JZ(_0) ASM(jz _0)
#define JNE(_0) ASM(jne _0)
#define JE(_0) ASM(je _0)
#define JNC(_0) ASM(jnc _0)
#define JC(_0) ASM(jc _0)
#define JMP(_0) ASM(jmp _0)
#define VCOMISS(_0, _1) ASM(vcomiss _1, _0)
#define VCOMISD(_0, _1) ASM(vcomisd _1, _0)
#define VGATHERDPS(_0, _1) ASM(vgatherdps _1, _0)
#define VSCATTERDPS(_0, _1) ASM(vscatterdps _1, _0)
#define VGATHERDPD(_0, _1) ASM(vgatherdpd _1, _0)
#define VSCATTERDPD(_0, _1) ASM(vscatterdpd _1, _0)
#define VGATHERQPS(_0, _1) ASM(vgatherqps _1, _0)
#define VSCATTERQPS(_0, _1) ASM(vscatterqps _1, _0)
#define VGATHERQPD(_0, _1) ASM(vgatherqpd _1, _0)
#define VSCATTERQPD(_0, _1) ASM(vscatterqpd _1, _0)
#define VMULSS(_0, _1, _2) ASM(vmulss _2, _1, _0)
#define VMULSD(_0, _1, _2) ASM(vmulsd _2, _1, _0)
#define VMULPS(_0, _1, _2) ASM(vmulps _2, _1, _0)
#define VMULPD(_0, _1, _2) ASM(vmulpd _2, _1, _0)
#define VPMULLD(_0, _1, _2) ASM(vpmulld _2, _1, _0)
#define VPMULLQ(_0, _1, _2) ASM(vpmullq _2, _1, _0)
#define VPADDD(_0, _1, _2) ASM(vpaddd _2, _1, _0)
#define VPSLLD(_0, _1, _2) ASM(vpslld _2, _1, _0)
#define VPXORD(_0, _1, _2) ASM(vpxord _2, _1, _0)
#define VXORPD(_0, _1, _2) ASM(vxorpd _2, _1, _0)
#define VFMADD132PS(_0, _1, _2) ASM(vfmadd132ps _2, _1, _0)
#define VFMADD213PS(_0, _1, _2) ASM(vfmadd213ps _2, _1, _0)
#define VFMADD231PS(_0, _1, _2) ASM(vfmadd231ps _2, _1, _0)
#define VFMADD132PD(_0, _1, _2) ASM(vfmadd132pd _2, _1, _0)
#define VFMADD213PD(_0, _1, _2) ASM(vfmadd213pd _2, _1, _0)
#define VFMADD231PD(_0, _1, _2) ASM(vfmadd231pd _2, _1, _0)
#define VMOVDQA(_0, _1) ASM(vmovdqa _1, _0)
#define VMOVDQA32(_0, _1) ASM(vmovdqa32 _1, _0)
#define VMOVDQA64(_0, _1) ASM(vmovdqa64 _1, _0)
#define VMOVSS(_0, _1) ASM(vmovss _1, _0)
#define VMOVSD(_0, _1) ASM(vmovsd _1, _0)
#define VMOVAPS(_0, _1) ASM(vmovaps _1, _0)
#define VMOVUPS(_0, _1) ASM(vmovups _1, _0)
#define VMOVAPD(_0, _1) ASM(vmovapd _1, _0)
#define VMOVUPD(_0, _1) ASM(vmovupd _1, _0)
#define VBROADCASTSS(_0, _1) ASM(vbroadcastss _1, _0)
#define VBROADCASTSD(_0, _1) ASM(vbroadcastsd _1, _0)
#define VPBROADCASTD(_0, _1) ASM(vpbroadcastd _1, _0)
#define VPBROADCASTQ(_0, _1) ASM(vpbroadcastq _1, _0)
#define VBROADCASTF64X4(_0, _1) ASM(vbroadcastf64x4 _1, _0)
#define VINSERTF64X4(_0, _1, _2, _3) ASM(vinsertf64x4 _3, _2, _1, _0)
#define VEXTRACTF64X4(_0, _1, _2) ASM(vextractf64x4 _2, _1, _0)
#define VINSERTF128(_0, _1, _2) ASM(vinsertf128 _2, _1, _0)
#define VEXTRACTF128(_0, _1, _2) ASM(vextractf128 _2, _1, _0)
#define VUNPCKLPD(_0, _1, _2) ASM(vunpcklpd _2, _1, _0)
#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0)
#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0)
#define VUNPCKLPS(_0, _1, _2) ASM(vunpcklps _2, _1, _0)
#define VUNPCKHPS(_0, _1, _2) ASM(vunpckhps _2, _1, _0)
#define VSHUFPS(_0, _1, _2, _3) ASM(vshufps _3, _2, _1, _0)
#define VPERM2F128(_0, _1, _2, _3) ASM(vperm2f128 _3, _2, _1, _0)
#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS)
#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS)
#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS)
#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS)
#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS)
#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS)
#define VSCATTERPFDPD(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dpd ADDRESS)
#define VZEROUPPER() ASM(vzeroupper)
#endif
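
For context on how these macros were consumed: each expands to a quoted AT&T instruction ending in "\n\t", and adjacent string literals concatenate into a single extended-asm template. A minimal sketch of the usage pattern (not from the patch; count_down and its labels are invented for illustration):

#include <stdint.h>
#include "bli_avx512_macros.h"

// Counts n down to zero in rax, then stores the result back to n.
// MOV(RAX, VAR(n)) expands to "mov %[n], %%rax\n\t", and so on; the bare
// labels are only safe if this asm body appears once in the object file.
static int64_t count_down(int64_t n)
{
    __asm__ volatile
    (
        MOV(RAX, VAR(n))
        TEST(RAX, RAX)      // ZF set iff n == 0
        JZ(COUNT_DONE)
        LABEL(COUNT_LOOP)
        SUB(RAX, IMM(1))
        JNZ(COUNT_LOOP)
        LABEL(COUNT_DONE)
        MOV(VAR(n), RAX)    // store rax back into the memory operand
        : [n] "+m" (n)      // in/out operand referenced by VAR(n)
        :
        : "rax", "cc", "memory"
    );
    return n;
}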

(File diff suppressed because it is too large.)

(File diff suppressed because it is too large.)


@@ -32,9 +32,11 @@
*/
#include "bli_avx512_macros.h"
#include "blis.h"
#define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
z0,z1,z2,z3,z4,z5,z6,z7) \
\
@@ -125,156 +127,157 @@ void bli_dpackm_knl_asm_8xk
const int64_t lda = lda_;
const int64_t ldp = ldp_;
__asm__ volatile
(
MOV(RSI, VAR(n))
MOV(RAX, VAR(a))
MOV(RBX, VAR(inca))
MOV(RCX, VAR(lda))
MOV(R14, VAR(p))
MOV(RDI, VAR(ldp))
BEGIN_ASM
TEST(RSI, RSI)
MOV(RSI, VAR(n))
MOV(RAX, VAR(a))
MOV(RBX, VAR(inca))
MOV(RCX, VAR(lda))
MOV(R14, VAR(p))
MOV(RDI, VAR(ldp))
TEST(RSI, RSI)
JZ(PACK8_DONE)
LEA(RBX, MEM(,RBX,8)) //inca in bytes
LEA(RCX, MEM(,RCX,8)) //lda in bytes
LEA(RDI, MEM(,RDI,8)) //ldp in bytes
LEA(R11, MEM(RDI,RDI,2)) //ldp*3
LEA(R12, MEM(RDI,RDI,4)) //ldp*5
LEA(R13, MEM(R11,RDI,4)) //ldp*7
VBROADCASTSD(ZMM(31), VAR(kappa))
CMP(RBX, IMM(8))
JNE(PACK8_T)
LABEL(PACK8_N)
MOV(RDX, RSI)
AND(RDX, IMM(7))
SAR(RSI, IMM(3))
JZ(PACK8_N_TAIL)
LEA(R8, MEM(RCX,RCX,2)) //lda*3
LEA(R9, MEM(RCX,RCX,4)) //lda*5
LEA(R10, MEM(R8 ,RCX,4)) //lda*7
LABEL(PACK8_N_LOOP)
LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7)
LEA(RAX, MEM(RAX,RCX,8))
LEA(R14, MEM(R14,RDI,8))
SUB(RSI, IMM(1))
JNZ(PACK8_N_LOOP)
TEST(RDX, RDX)
JZ(PACK8_DONE)
LEA(RBX, MEM(,RBX,8)) //inca in bytes
LEA(RCX, MEM(,RCX,8)) //lda in bytes
LEA(RDI, MEM(,RDI,8)) //ldp in bytes
LEA(R11, MEM(RDI,RDI,2)) //ldp*3
LEA(R12, MEM(RDI,RDI,4)) //ldp*5
LEA(R13, MEM(R11,RDI,4)) //ldp*7
LABEL(PACK8_N_TAIL)
VBROADCASTSD(ZMM(31), VAR(kappa))
VMULPD(ZMM(0), ZMM(31), MEM(RAX))
VMOVUPD(MEM(R14), ZMM(0))
CMP(RBX, IMM(8))
JNE(PACK8_T)
LEA(RAX, MEM(RAX,RCX,1))
LEA(R14, MEM(R14,RDI,1))
LABEL(PACK8_N)
SUB(RDX, IMM(1))
MOV(RDX, RSI)
AND(RDX, IMM(7))
SAR(RSI, IMM(3))
JZ(PACK8_N_TAIL)
JNZ(PACK8_N_TAIL)
LEA(R8, MEM(RCX,RCX,2)) //lda*3
LEA(R9, MEM(RCX,RCX,4)) //lda*5
LEA(R10, MEM(R8 ,RCX,4)) //lda*7
JMP(PACK8_DONE)
LABEL(PACK8_N_LOOP)
LABEL(PACK8_T)
LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7)
CMP(RCX, IMM(8))
JNE(PACK8_G)
LEA(RAX, MEM(RAX,RCX,8))
LEA(R14, MEM(R14,RDI,8))
LEA(R8, MEM(RBX,RBX,2)) //inca*3
LEA(R9, MEM(RBX,RBX,4)) //inca*5
LEA(R10, MEM(R8 ,RBX,4)) //inca*7
SUB(RSI, IMM(1))
MOV(RDX, RSI)
AND(RDX, IMM(7))
SAR(RSI, IMM(3))
JZ(PACK8_T_TAIL)
JNZ(PACK8_N_LOOP)
LABEL(PACK8_T_LOOP)
TEST(RDX, RDX)
JZ(PACK8_DONE)
LABEL(PACK8_N_TAIL)
VMULPD(ZMM(0), ZMM(31), MEM(RAX))
VMOVUPD(MEM(R14), ZMM(0))
LEA(RAX, MEM(RAX,RCX,1))
LEA(R14, MEM(R14,RDI,1))
SUB(RDX, IMM(1))
JNZ(PACK8_N_TAIL)
JMP(PACK8_DONE)
LABEL(PACK8_T)
CMP(RCX, IMM(8))
JNE(PACK8_G)
LEA(R8, MEM(RBX,RBX,2)) //inca*3
LEA(R9, MEM(RBX,RBX,4)) //inca*5
LEA(R10, MEM(R8 ,RBX,4)) //inca*7
MOV(RDX, RSI)
AND(RDX, IMM(7))
SAR(RSI, IMM(3))
JZ(PACK8_T_TAIL)
LABEL(PACK8_T_LOOP)
LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
16,17,18,19,20,21,22,23)
STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
LEA(RAX, MEM(RAX,RCX,8))
LEA(R14, MEM(R14,RDI,8))
SUB(RSI, IMM(1))
JNZ(PACK8_T_LOOP)
TEST(RDX, RDX)
JZ(PACK8_DONE)
LABEL(PACK8_T_TAIL)
MOV(RSI, IMM(1))
SHLX(RSI, RSI, RDX)
SUB(RSI, IMM(1))
KMOV(K(1), ESI) //mask for n%8 elements
LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7,1)
LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
8, 9,10,11,12,13,14,15)
16,17,18,19,20,21,22,23)
STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
VMOVUPD(MEM(R14 ), ZMM( 8))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,RDI,1), ZMM( 9))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,RDI,2), ZMM(10))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,R11,1), ZMM(11))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,RDI,4), ZMM(12))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,R12,1), ZMM(13))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,R11,2), ZMM(14))
LEA(RAX, MEM(RAX,RCX,8))
LEA(R14, MEM(R14,RDI,8))
JMP(PACK8_DONE)
SUB(RSI, IMM(1))
LABEL(PACK8_G)
JNZ(PACK8_T_LOOP)
VPBROADCASTD(ZMM(3), VAR(inca))
MOV(RBX, VAR(offsetPtr))
VPMULLD(YMM(0), YMM(3), MEM(RBX))
TEST(RDX, RDX)
JZ(PACK8_DONE)
LABEL(PACK8_G_LOOP)
LABEL(PACK8_T_TAIL)
KXNORW(K(1), K(0), K(0))
VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8))
VMULPD(ZMM(3), ZMM(3), ZMM(31))
VMOVUPD(MEM(R14), ZMM(3))
MOV(RSI, IMM(1))
SHLX(RSI, RSI, RDX)
SUB(RSI, IMM(1))
KMOVW(K(1), ESI) //mask for n%8 elements
LEA(RAX, MEM(RAX,RCX,1))
LEA(R14, MEM(R14,RDI,1))
LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7,1)
TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
8, 9,10,11,12,13,14,15)
SUB(RSI, IMM(1))
VMOVUPD(MEM(R14 ), ZMM( 8))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,RDI,1), ZMM( 9))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,RDI,2), ZMM(10))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,R11,1), ZMM(11))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,RDI,4), ZMM(12))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,R12,1), ZMM(13))
SUB(RDX, IMM(1))
JZ(PACK8_DONE)
VMOVUPD(MEM(R14,R11,2), ZMM(14))
JNZ(PACK8_G_LOOP)
JMP(PACK8_DONE)
LABEL(PACK8_DONE)
LABEL(PACK8_G)
VPBROADCASTD(ZMM(3), VAR(inca))
MOV(RBX, VAR(offsetPtr))
VPMULLD(YMM(0), YMM(3), MEM(RBX))
LABEL(PACK8_G_LOOP)
KXNORW(K(1), K(0), K(0))
VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8))
VMULPD(ZMM(3), ZMM(3), ZMM(31))
VMOVUPD(MEM(R14), ZMM(3))
LEA(RAX, MEM(RAX,RCX,1))
LEA(R14, MEM(R14,RDI,1))
SUB(RSI, IMM(1))
JNZ(PACK8_G_LOOP)
LABEL(PACK8_DONE)
END_ASM(
: //output operands
: //input operands
[n] "m" (n),
@@ -294,7 +297,7 @@ void bli_dpackm_knl_asm_8xk
"zmm30", "zmm31",
"rax", "rbx", "rcx", "rdx", "rdi", "rsi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
);
)
}
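
bli_x86_asm_macros.h is not displayed in this PR view, so the wrapper's exact definition is an assumption; judging from the hunk above, BEGIN_ASM opens the __asm__ volatile statement and END_ASM(...) receives the operand and clobber lists and closes it. A sketch of the resulting kernel shape:

// Sketch only: BEGIN_ASM/END_ASM come from the unshown header, and this
// fragment assumes they expand to __asm__ volatile ( ... ) as the code
// they replace did.
BEGIN_ASM
MOV(RSI, VAR(n))              // body keeps the same macro vocabulary
TEST(RSI, RSI)
JZ(PACK8_DONE)
// ... packing loops ...
LABEL(PACK8_DONE)
END_ASM(
: // output operands
: [n] "m" (n) // input operands
: "rsi", "memory" // clobbers
)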
void bli_dpackm_knl_asm_24xk
@@ -441,7 +444,7 @@ void bli_dpackm_knl_asm_24xk
MOV(R13, IMM(1))
SHLX(R13, R13, RSI)
SUB(R13, IMM(1))
KMOV(K(1), R13D) //mask for n%8 elements
KMOVW(K(1), R13D) //mask for n%8 elements
LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1)
LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1)
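
The four instructions above build a lane mask for the leftover r = n % 8 columns: (1 << r) - 1 has exactly the low r bits set, and KMOVW moves it into mask register k1 so the masked loads in LOADMUL8x8_MASK touch only those columns. The same computation in C (illustration only; tail_mask is an invented name):

#include <stdint.h>

// Mask built by the MOV/SHLX/SUB/KMOVW sequence: low r bits set, r = n % 8.
static uint16_t tail_mask(uint16_t r)
{
    return (uint16_t)((1u << r) - 1u);   // e.g. r == 3 -> 0x0007 (3 lanes)
}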


@@ -32,10 +32,10 @@
*/
#include "bli_avx512_macros.h"
#include "blis.h"
#include <stdio.h>
#define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
z0,z1,z2,z3,z4,z5,z6,z7) \


@@ -35,7 +35,8 @@
#include "blis.h"
#include <assert.h>
#include "bli_avx512_macros.h"
#define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"
#define UNROLL_K 32
@@ -212,8 +213,8 @@ void bli_dgemm_knl_asm_24x8
int tlooph, tloopl, blooph, bloopl;
#endif
__asm__ volatile
(
BEGIN_ASM
#ifdef MONITORS
RDTSC
MOV(VAR(topl), EAX)
@@ -380,7 +381,7 @@ void bli_dgemm_knl_asm_24x8
JNZ(MAIN_LOOP)
LABEL(REM_1)
SAR1(RDI)
SAR(RDI)
JNC(REM_2)
SUBITER(0,1,0,RAX)
@@ -389,7 +390,7 @@ void bli_dgemm_knl_asm_24x8
ADD(RBX, IMM( 8*8))
LABEL(REM_2)
SAR1(RDI)
SAR(RDI)
JNC(REM_4)
SUBITER(0,1,0,RAX)
@@ -398,7 +399,7 @@ void bli_dgemm_knl_asm_24x8
ADD(RBX, IMM(2* 8*8))
LABEL(REM_4)
SAR1(RDI)
SAR(RDI)
JNC(REM_8)
SUBITER(0,1,0,RAX)
@@ -409,7 +410,7 @@ void bli_dgemm_knl_asm_24x8
ADD(RBX, IMM(4* 8*8))
LABEL(REM_8)
SAR1(RDI)
SAR(RDI)
JNC(REM_16)
SUBITER(0,1,0,RAX )
@@ -424,7 +425,7 @@ void bli_dgemm_knl_asm_24x8
ADD(RBX, IMM(8* 8*8))
LABEL(REM_16)
SAR1(RDI)
SAR(RDI)
JNC(AFTER_LOOP)
SUBITER( 0,1,0,RAX )
@@ -570,7 +571,7 @@ void bli_dgemm_knl_asm_24x8
JNE(SCATTEREDUPDATE)
VMOVQ(RDX, XMM(1))
SAL1(RDX) //shift out sign bit
SAL(RDX) //shift out sign bit
JZ(COLSTORBZ)
UPDATE_C_FOUR_ROWS( 8, 9,10,11)
@@ -602,7 +603,7 @@ void bli_dgemm_knl_asm_24x8
VPMULLD(ZMM(2), ZMM(3), ZMM(2))
VMOVQ(RDX, XMM(1))
SAL1(RDX) //shift out sign bit
SAL(RDX) //shift out sign bit
JZ(SCATTERBZ)
UPDATE_C_ROW_SCATTERED( 8)
@@ -666,6 +667,8 @@ void bli_dgemm_knl_asm_24x8
MOV(VAR(botl), EAX)
MOV(VAR(both), EDX)
#endif
END_ASM(
: // output operands
#ifdef MONITORS
[topl] "=m" (topl),
@@ -696,7 +699,7 @@ void bli_dgemm_knl_asm_24x8
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
"zmm30", "zmm31", "memory"
);
)
#ifdef LOOPMON
printf("looptime = \t%d\n", bloopl - tloopl);


@@ -35,7 +35,8 @@
#include "blis.h"
#include <assert.h>
#include "bli_avx512_macros.h"
#define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"
#define UNROLL_K 32
@@ -377,7 +378,7 @@ void bli_sgemm_knl_asm_24x16
JNZ(MAIN_LOOP)
LABEL(REM_1)
SAR1(RDI)
SAR(RDI)
JNC(REM_2)
SUBITER(0,1,0,RAX)
@@ -386,7 +387,7 @@ void bli_sgemm_knl_asm_24x16
ADD(RBX, IMM(16*4))
LABEL(REM_2)
SAR1(RDI)
SAR(RDI)
JNC(REM_4)
SUBITER(0,1,0,RAX)
@@ -395,7 +396,7 @@ void bli_sgemm_knl_asm_24x16
ADD(RBX, IMM(2*16*4))
LABEL(REM_4)
SAR1(RDI)
SAR(RDI)
JNC(REM_8)
SUBITER(0,1,0,RAX)
@@ -406,7 +407,7 @@ void bli_sgemm_knl_asm_24x16
ADD(RBX, IMM(4*16*4))
LABEL(REM_8)
SAR1(RDI)
SAR(RDI)
JNC(REM_16)
SUBITER(0,1,0,RAX )
@@ -421,7 +422,7 @@ void bli_sgemm_knl_asm_24x16
ADD(RBX, IMM(8*16*4))
LABEL(REM_16)
SAR1(RDI)
SAR(RDI)
JNC(AFTER_LOOP)
SUBITER( 0,1,0,RAX )
@@ -567,7 +568,7 @@ void bli_sgemm_knl_asm_24x16
JNE(SCATTEREDUPDATE)
VMOVD(EDX, XMM(1))
SAL1(EDX) //shift out sign bit
SAL(EDX) //shift out sign bit
JZ(COLSTORBZ)
UPDATE_C_FOUR_ROWS( 8, 9,10,11)
@@ -599,7 +600,7 @@ void bli_sgemm_knl_asm_24x16
VPMULLD(ZMM(2), ZMM(3), ZMM(2))
VMOVD(EDX, XMM(1))
SAL1(EDX) //shift out sign bit
SAL(EDX) //shift out sign bit
JZ(SCATTERBZ)
UPDATE_C_ROW_SCATTERED( 8)

(File diff suppressed because it is too large.)


@@ -34,6 +34,9 @@
#include "blis.h"
#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"
#if 0
void bli_sgemmtrsm_l_penryn_asm_8x4
(
@@ -75,446 +78,446 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
__asm__ volatile
(
" \n\t"
"movq %2, %%rax \n\t" // load address of a10.
"movq %4, %%rbx \n\t" // load address of b01.
//"movq %10, %%r9 \n\t" // load address of b_next.
" \n\t"
"subq $-8 * 16, %%rax \n\t" // increment pointers to allow byte
"subq $-8 * 16, %%rbx \n\t" // offsets in the unrolled iterations.
" \n\t"
"movaps -8 * 16(%%rax), %%xmm0 \n\t" // initialize loop by pre-loading elements
"movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b.
"movaps -8 * 16(%%rbx), %%xmm2 \n\t"
" \n\t"
//"movq %6, %%rcx \n\t" // load address of c11
//"movq %9, %%rdi \n\t" // load cs_c
//"leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double)
//"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // load address of c + 2*cs_c;
" \n\t"
//"prefetcht2 0 * 8(%%r9) \n\t" // prefetch b_next
" \n\t"
"xorpd %%xmm3, %%xmm3 \n\t"
"xorpd %%xmm4, %%xmm4 \n\t"
"xorpd %%xmm5, %%xmm5 \n\t"
"xorpd %%xmm6, %%xmm6 \n\t"
" \n\t"
//"prefetcht2 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c
"xorpd %%xmm8, %%xmm8 \n\t"
"movaps %%xmm8, %%xmm9 \n\t"
//"prefetcht2 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
"movaps %%xmm8, %%xmm10 \n\t"
"movaps %%xmm8, %%xmm11 \n\t"
//"prefetcht2 3 * 8(%%rdx) \n\t" // prefetch c + 2*cs_c
"movaps %%xmm8, %%xmm12 \n\t"
"movaps %%xmm8, %%xmm13 \n\t"
//"prefetcht2 3 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 3*cs_c
"movaps %%xmm8, %%xmm14 \n\t"
"movaps %%xmm8, %%xmm15 \n\t"
" \n\t"
" \n\t"
" \n\t"
"movq %0, %%rsi \n\t" // i = k_iter;
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
"je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that
" \n\t" // contains the k_left loop.
" \n\t"
" \n\t"
".LOOPKITER: \n\t" // MAIN LOOP
" \n\t"
//"prefetcht0 1264(%%rax) \n\t"
"prefetcht0 (4*35+1) * 8(%%rax) \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 0
"movaps -7 * 16(%%rbx), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"movaps %%xmm2, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t"
"mulpd %%xmm0, %%xmm2 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
"movaps %%xmm7, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm7 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t"
"movaps -6 * 16(%%rbx), %%xmm2 \n\t"
"addpd %%xmm4, %%xmm13 \n\t"
"movaps %%xmm3, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t"
"mulpd %%xmm0, %%xmm3 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t"
"addpd %%xmm6, %%xmm12 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm5 \n\t"
"movaps -6 * 16(%%rax), %%xmm0 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
"movaps -5 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 1
"movaps -5 * 16(%%rbx), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"movaps %%xmm2, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t"
"mulpd %%xmm0, %%xmm2 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
"movaps %%xmm7, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm7 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t"
"movaps -4 * 16(%%rbx), %%xmm2 \n\t"
"addpd %%xmm4, %%xmm13 \n\t"
"movaps %%xmm3, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t"
"mulpd %%xmm0, %%xmm3 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t"
"addpd %%xmm6, %%xmm12 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm5 \n\t"
"movaps -4 * 16(%%rax), %%xmm0 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
"movaps -3 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
//"prefetcht0 1328(%%rax) \n\t"
"prefetcht0 (4*37+1) * 8(%%rax) \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 2
"movaps -3 * 16(%%rbx), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"movaps %%xmm2, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t"
"mulpd %%xmm0, %%xmm2 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
"movaps %%xmm7, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm7 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t"
"movaps -2 * 16(%%rbx), %%xmm2 \n\t"
"addpd %%xmm4, %%xmm13 \n\t"
"movaps %%xmm3, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t"
"mulpd %%xmm0, %%xmm3 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t"
"addpd %%xmm6, %%xmm12 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm5 \n\t"
"movaps -2 * 16(%%rax), %%xmm0 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
"movaps -1 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 3
"movaps -1 * 16(%%rbx), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"movaps %%xmm2, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t"
"mulpd %%xmm0, %%xmm2 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"subq $-4 * 4 * 8, %%rax \n\t" // a += 4*4 (unroll x mr)
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
"movaps %%xmm7, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm7 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
" \n\t"
//"subq $-4 * 4 * 8, %%r9 \n\t" // b_next += 4*4 (unroll x nr)
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t"
"movaps 0 * 16(%%rbx), %%xmm2 \n\t"
"addpd %%xmm4, %%xmm13 \n\t"
"movaps %%xmm3, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t"
"mulpd %%xmm0, %%xmm3 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"subq $-4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr)
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t"
"addpd %%xmm6, %%xmm12 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm5 \n\t"
"movaps -8 * 16(%%rax), %%xmm0 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
"movaps -7 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
//"prefetcht2 0 * 8(%%r9) \n\t" // prefetch b_next[0]
//"prefetcht2 8 * 8(%%r9) \n\t" // prefetch b_next[8]
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1;
"jne .LOOPKITER \n\t" // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".CONSIDERKLEFT: \n\t"
" \n\t"
"movq %1, %%rsi \n\t" // i = k_left;
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
"je .POSTACCUM \n\t" // if i == 0, we're done; jump to end.
" \n\t" // else, we prepare to enter k_left loop.
" \n\t"
" \n\t"
".LOOPKLEFT: \n\t" // EDGE LOOP
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 0
"movaps -7 * 16(%%rbx), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"movaps %%xmm2, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t"
"mulpd %%xmm0, %%xmm2 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
"movaps %%xmm7, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm7 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t"
"movaps -6 * 16(%%rbx), %%xmm2 \n\t"
"addpd %%xmm4, %%xmm13 \n\t"
"movaps %%xmm3, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t"
"mulpd %%xmm0, %%xmm3 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t"
"addpd %%xmm6, %%xmm12 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm5 \n\t"
"movaps -6 * 16(%%rax), %%xmm0 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
"movaps -5 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
" \n\t"
"subq $-4 * 1 * 8, %%rax \n\t" // a += 4 (1 x mr)
"subq $-4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr)
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1;
"jne .LOOPKLEFT \n\t" // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".POSTACCUM: \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
" \n\t"
" \n\t"
" \n\t"
"movq %5, %%rbx \n\t" // load address of b11.
" \n\t"
" \n\t" // xmm8: xmm9: xmm10: xmm11:
" \n\t" // ( ab01 ( ab00 ( ab03 ( ab02
" \n\t" // ab10 ) ab11 ) ab12 ) ab13 )
" \n\t" //
" \n\t" // xmm12: xmm13: xmm14: xmm15:
" \n\t" // ( ab21 ( ab20 ( ab23 ( ab22
" \n\t" // ab30 ) ab31 ) ab32 ) ab33 )
"movaps %%xmm9, %%xmm0 \n\t"
"movaps %%xmm8, %%xmm1 \n\t"
"unpcklpd %%xmm8, %%xmm0 \n\t"
"unpckhpd %%xmm9, %%xmm1 \n\t"
" \n\t"
"movaps %%xmm11, %%xmm4 \n\t"
"movaps %%xmm10, %%xmm5 \n\t"
"unpcklpd %%xmm10, %%xmm4 \n\t"
"unpckhpd %%xmm11, %%xmm5 \n\t"
" \n\t"
"movaps %%xmm13, %%xmm2 \n\t"
"movaps %%xmm12, %%xmm3 \n\t"
"unpcklpd %%xmm12, %%xmm2 \n\t"
"unpckhpd %%xmm13, %%xmm3 \n\t"
" \n\t"
"movaps %%xmm15, %%xmm6 \n\t"
"movaps %%xmm14, %%xmm7 \n\t"
"unpcklpd %%xmm14, %%xmm6 \n\t"
"unpckhpd %%xmm15, %%xmm7 \n\t"
" \n\t"
" \n\t" // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 )
" \n\t" // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 )
" \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
" \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
" \n\t"
"movq %9, %%rax \n\t" // load address of alpha
"movddup (%%rax), %%xmm15 \n\t" // load alpha and duplicate
" \n\t"
"movaps 0 * 16(%%rbx), %%xmm8 \n\t"
"movaps 1 * 16(%%rbx), %%xmm12 \n\t"
"mulpd %%xmm15, %%xmm8 \n\t" // xmm8 = alpha * ( beta00 beta01 )
"mulpd %%xmm15, %%xmm12 \n\t" // xmm12 = alpha * ( beta02 beta03 )
"movaps 2 * 16(%%rbx), %%xmm9 \n\t"
"movaps 3 * 16(%%rbx), %%xmm13 \n\t"
"mulpd %%xmm15, %%xmm9 \n\t" // xmm9 = alpha * ( beta10 beta11 )
"mulpd %%xmm15, %%xmm13 \n\t" // xmm13 = alpha * ( beta12 beta13 )
"movaps 4 * 16(%%rbx), %%xmm10 \n\t"
"movaps 5 * 16(%%rbx), %%xmm14 \n\t"
"mulpd %%xmm15, %%xmm10 \n\t" // xmm10 = alpha * ( beta20 beta21 )
"mulpd %%xmm15, %%xmm14 \n\t" // xmm14 = alpha * ( beta22 beta23 )
"movaps 6 * 16(%%rbx), %%xmm11 \n\t"
"mulpd %%xmm15, %%xmm11 \n\t" // xmm11 = alpha * ( beta30 beta31 )
"mulpd 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = alpha * ( beta32 beta33 )
" \n\t"
" \n\t" // (Now scaled by alpha:)
" \n\t" // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 )
" \n\t" // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 )
" \n\t" // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 )
" \n\t" // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 )
" \n\t"
"subpd %%xmm0, %%xmm8 \n\t" // xmm8 -= xmm0
"subpd %%xmm1, %%xmm9 \n\t" // xmm9 -= xmm1
"subpd %%xmm2, %%xmm10 \n\t" // xmm10 -= xmm2
"subpd %%xmm3, %%xmm11 \n\t" // xmm11 -= xmm3
"subpd %%xmm4, %%xmm12 \n\t" // xmm12 -= xmm4
"subpd %%xmm5, %%xmm13 \n\t" // xmm13 -= xmm5
"subpd %%xmm6, %%xmm14 \n\t" // xmm14 -= xmm6
"subpd %%xmm7, %%xmm15 \n\t" // xmm15 -= xmm7
" \n\t"
" \n\t"
" \n\t"
".TRSM: \n\t"
" \n\t"
" \n\t"
"movq %3, %%rax \n\t" // load address of a11
"movq %6, %%rcx \n\t" // load address of c11
" \n\t"
"movq %7, %%rsi \n\t" // load rs_c
"movq %8, %%rdi \n\t" // load cs_c
"salq $3, %%rsi \n\t" // rs_c *= sizeof( double )
"salq $3, %%rdi \n\t" // cs_c *= sizeof( double )
" \n\t"
"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // c11_2 = c11 + 2*cs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 0
" \n\t"
"movddup (0+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = (1/alpha00)
" \n\t"
"mulpd %%xmm0, %%xmm8 \n\t" // xmm8 *= (1/alpha00);
"mulpd %%xmm0, %%xmm12 \n\t" // xmm12 *= (1/alpha00);
" \n\t"
"movaps %%xmm8, 0 * 16(%%rbx) \n\t" // store ( beta00 beta01 ) = xmm8
"movaps %%xmm12, 1 * 16(%%rbx) \n\t" // store ( beta02 beta03 ) = xmm12
"movlpd %%xmm8, (%%rcx) \n\t" // store ( gamma00 ) = xmm8[0]
"movhpd %%xmm8, (%%rcx,%%rdi) \n\t" // store ( gamma01 ) = xmm8[1]
"movlpd %%xmm12, (%%rdx) \n\t" // store ( gamma02 ) = xmm12[0]
"movhpd %%xmm12, (%%rdx,%%rdi) \n\t" // store ( gamma03 ) = xmm12[1]
"addq %%rsi, %%rcx \n\t" // c11 += rs_c
"addq %%rsi, %%rdx \n\t" // c11_2 += rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 1
" \n\t"
"movddup (1+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha10
"movddup (1+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = (1/alpha11)
" \n\t"
"movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0
"mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha10 * ( beta00 beta01 )
"mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha10 * ( beta02 beta03 )
"subpd %%xmm0, %%xmm9 \n\t" // xmm9 -= xmm0
"subpd %%xmm4, %%xmm13 \n\t" // xmm13 -= xmm4
"mulpd %%xmm1, %%xmm9 \n\t" // xmm9 *= (1/alpha11);
"mulpd %%xmm1, %%xmm13 \n\t" // xmm13 *= (1/alpha11);
" \n\t"
"movaps %%xmm9, 2 * 16(%%rbx) \n\t" // store ( beta10 beta11 ) = xmm9
"movaps %%xmm13, 3 * 16(%%rbx) \n\t" // store ( beta12 beta13 ) = xmm13
"movlpd %%xmm9, (%%rcx) \n\t" // store ( gamma10 ) = xmm9[0]
"movhpd %%xmm9, (%%rcx,%%rdi) \n\t" // store ( gamma11 ) = xmm9[1]
"movlpd %%xmm13, (%%rdx) \n\t" // store ( gamma12 ) = xmm13[0]
"movhpd %%xmm13, (%%rdx,%%rdi) \n\t" // store ( gamma13 ) = xmm13[1]
"addq %%rsi, %%rcx \n\t" // c11 += rs_c
"addq %%rsi, %%rdx \n\t" // c11_2 += rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 2
" \n\t"
"movddup (2+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha20
"movddup (2+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha21
"movddup (2+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = (1/alpha22)
" \n\t"
"movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0
"movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1
"mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha20 * ( beta00 beta01 )
"mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha20 * ( beta02 beta03 )
"mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha21 * ( beta10 beta11 )
"mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha21 * ( beta12 beta13 )
"addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1;
"addpd %%xmm5, %%xmm4 \n\t" // xmm4 += xmm5;
"subpd %%xmm0, %%xmm10 \n\t" // xmm10 -= xmm0
"subpd %%xmm4, %%xmm14 \n\t" // xmm14 -= xmm4
"mulpd %%xmm2, %%xmm10 \n\t" // xmm10 *= (1/alpha22);
"mulpd %%xmm2, %%xmm14 \n\t" // xmm14 *= (1/alpha22);
" \n\t"
"movaps %%xmm10, 4 * 16(%%rbx) \n\t" // store ( beta20 beta21 ) = xmm10
"movaps %%xmm14, 5 * 16(%%rbx) \n\t" // store ( beta22 beta23 ) = xmm14
"movlpd %%xmm10, (%%rcx) \n\t" // store ( gamma20 ) = xmm10[0]
"movhpd %%xmm10, (%%rcx,%%rdi) \n\t" // store ( gamma21 ) = xmm10[1]
"movlpd %%xmm14, (%%rdx) \n\t" // store ( gamma22 ) = xmm14[0]
"movhpd %%xmm14, (%%rdx,%%rdi) \n\t" // store ( gamma23 ) = xmm14[1]
"addq %%rsi, %%rcx \n\t" // c11 += rs_c
"addq %%rsi, %%rdx \n\t" // c11_2 += rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 3
" \n\t"
"movddup (3+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha30
"movddup (3+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha31
"movddup (3+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha32
"movddup (3+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = (1/alpha33)
" \n\t"
"movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0
"movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1
"movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2
"mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha30 * ( beta00 beta01 )
"mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha30 * ( beta02 beta03 )
"mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha31 * ( beta10 beta11 )
"mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha31 * ( beta12 beta13 )
"mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha32 * ( beta20 beta21 )
"mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha32 * ( beta22 beta23 )
"addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1;
"addpd %%xmm5, %%xmm4 \n\t" // xmm4 += xmm5;
"addpd %%xmm2, %%xmm0 \n\t" // xmm0 += xmm2;
"addpd %%xmm6, %%xmm4 \n\t" // xmm4 += xmm6;
"subpd %%xmm0, %%xmm11 \n\t" // xmm11 -= xmm0
"subpd %%xmm4, %%xmm15 \n\t" // xmm15 -= xmm4
"mulpd %%xmm3, %%xmm11 \n\t" // xmm11 *= (1/alpha33);
"mulpd %%xmm3, %%xmm15 \n\t" // xmm15 *= (1/alpha33);
" \n\t"
"movaps %%xmm11, 6 * 16(%%rbx) \n\t" // store ( beta30 beta31 ) = xmm11
"movaps %%xmm15, 7 * 16(%%rbx) \n\t" // store ( beta32 beta33 ) = xmm15
"movlpd %%xmm11, (%%rcx) \n\t" // store ( gamma30 ) = xmm11[0]
"movhpd %%xmm11, (%%rcx,%%rdi) \n\t" // store ( gamma31 ) = xmm11[1]
"movlpd %%xmm15, (%%rdx) \n\t" // store ( gamma32 ) = xmm15[0]
"movhpd %%xmm15, (%%rdx,%%rdi) \n\t" // store ( gamma33 ) = xmm15[1]
" \n\t"
" \n\t"
" \n\t"
mov(%2, rax) // load address of a10.
mov(%4, rbx) // load address of b01.
//mov(%10, r9) // load address of b_next.
sub(imm(0-8*16), rax) // increment pointers to allow byte
sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.
movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements
movaps(mem(rax, -7*16), xmm1) // of a and b.
movaps(mem(rbx, -8*16), xmm2)
//mov(%6, rcx) // load address of c11
//mov(%9, rdi) // load cs_c
//lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
//lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*cs_c;
//prefetch(2, mem(r9, 0*8)) // prefetch b_next
xorpd(xmm3, xmm3)
xorpd(xmm4, xmm4)
xorpd(xmm5, xmm5)
xorpd(xmm6, xmm6)
//prefetch(2, mem(rcx, 3*8)) // prefetch c + 0*cs_c
xorpd(xmm8, xmm8)
movaps(xmm8, xmm9)
//prefetch(2, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c
movaps(xmm8, xmm10)
movaps(xmm8, xmm11)
//prefetch(2, mem(rdx, 3*8)) // prefetch c + 2*cs_c
movaps(xmm8, xmm12)
movaps(xmm8, xmm13)
//prefetch(2, mem(rdx, rdi, 1, 3*8)) // prefetch c + 3*cs_c
movaps(xmm8, xmm14)
movaps(xmm8, xmm15)
mov(%0, rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.CONSIDERKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
label(.LOOPKITER) // MAIN LOOP
//prefetch(0, mem(rax, 1264))
prefetch(0, mem(4*35+1)*8(rax))
addpd(xmm3, xmm11) // iteration 0
movaps(mem(rbx, -7*16), xmm3)
addpd(xmm4, xmm15)
movaps(xmm2, xmm4)
pshufd(imm(0x4e), xmm2, xmm7)
mulpd(xmm0, xmm2)
mulpd(xmm1, xmm4)
addpd(xmm5, xmm10)
addpd(xmm6, xmm14)
movaps(xmm7, xmm6)
mulpd(xmm0, xmm7)
mulpd(xmm1, xmm6)
addpd(xmm2, xmm9)
movaps(mem(rbx, -6*16), xmm2)
addpd(xmm4, xmm13)
movaps(xmm3, xmm4)
pshufd(imm(0x4e), xmm3, xmm5)
mulpd(xmm0, xmm3)
mulpd(xmm1, xmm4)
addpd(xmm7, xmm8)
addpd(xmm6, xmm12)
movaps(xmm5, xmm6)
mulpd(xmm0, xmm5)
movaps(mem(rax, -6*16), xmm0)
mulpd(xmm1, xmm6)
movaps(mem(rax, -5*16), xmm1)
addpd(xmm3, xmm11) // iteration 1
movaps(mem(rbx, -5*16), xmm3)
addpd(xmm4, xmm15)
movaps(xmm2, xmm4)
pshufd(imm(0x4e), xmm2, xmm7)
mulpd(xmm0, xmm2)
mulpd(xmm1, xmm4)
addpd(xmm5, xmm10)
addpd(xmm6, xmm14)
movaps(xmm7, xmm6)
mulpd(xmm0, xmm7)
mulpd(xmm1, xmm6)
addpd(xmm2, xmm9)
movaps(mem(rbx, -4*16), xmm2)
addpd(xmm4, xmm13)
movaps(xmm3, xmm4)
pshufd(imm(0x4e), xmm3, xmm5)
mulpd(xmm0, xmm3)
mulpd(xmm1, xmm4)
addpd(xmm7, xmm8)
addpd(xmm6, xmm12)
movaps(xmm5, xmm6)
mulpd(xmm0, xmm5)
movaps(mem(rax, -4*16), xmm0)
mulpd(xmm1, xmm6)
movaps(mem(rax, -3*16), xmm1)
//prefetch(0, mem(rax, 1328))
prefetch(0, mem(4*37+1)*8(rax))
addpd(xmm3, xmm11) // iteration 2
movaps(mem(rbx, -3*16), xmm3)
addpd(xmm4, xmm15)
movaps(xmm2, xmm4)
pshufd(imm(0x4e), xmm2, xmm7)
mulpd(xmm0, xmm2)
mulpd(xmm1, xmm4)
addpd(xmm5, xmm10)
addpd(xmm6, xmm14)
movaps(xmm7, xmm6)
mulpd(xmm0, xmm7)
mulpd(xmm1, xmm6)
addpd(xmm2, xmm9)
movaps(mem(rbx, -2*16), xmm2)
addpd(xmm4, xmm13)
movaps(xmm3, xmm4)
pshufd(imm(0x4e), xmm3, xmm5)
mulpd(xmm0, xmm3)
mulpd(xmm1, xmm4)
addpd(xmm7, xmm8)
addpd(xmm6, xmm12)
movaps(xmm5, xmm6)
mulpd(xmm0, xmm5)
movaps(mem(rax, -2*16), xmm0)
mulpd(xmm1, xmm6)
movaps(mem(rax, -1*16), xmm1)
addpd(xmm3, xmm11) // iteration 3
movaps(mem(rbx, -1*16), xmm3)
addpd(xmm4, xmm15)
movaps(xmm2, xmm4)
pshufd(imm(0x4e), xmm2, xmm7)
mulpd(xmm0, xmm2)
mulpd(xmm1, xmm4)
sub(imm(0-4*4*8), rax) // a += 4*4 (unroll x mr)
addpd(xmm5, xmm10)
addpd(xmm6, xmm14)
movaps(xmm7, xmm6)
mulpd(xmm0, xmm7)
mulpd(xmm1, xmm6)
//sub(imm(-4*4*8), r9) // b_next += 4*4 (unroll x nr)
addpd(xmm2, xmm9)
movaps(mem(rbx, 0*16), xmm2)
addpd(xmm4, xmm13)
movaps(xmm3, xmm4)
pshufd(imm(0x4e), xmm3, xmm5)
mulpd(xmm0, xmm3)
mulpd(xmm1, xmm4)
sub(imm(0-4*4*8), rbx) // b += 4*4 (unroll x nr)
addpd(xmm7, xmm8)
addpd(xmm6, xmm12)
movaps(xmm5, xmm6)
mulpd(xmm0, xmm5)
movaps(mem(rax, -8*16), xmm0)
mulpd(xmm1, xmm6)
movaps(mem(rax, -7*16), xmm1)
//prefetch(2, mem(r9, 0*8)) // prefetch b_next[0]
//prefetch(2, mem(r9, 8*8)) // prefetch b_next[8]
dec(rsi) // i -= 1;
jne(.LOOPKITER) // iterate again if i != 0.
label(.CONSIDERKLEFT)
mov(%1, rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.POSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
label(.LOOPKLEFT) // EDGE LOOP
addpd(xmm3, xmm11) // iteration 0
movaps(mem(rbx, -7*16), xmm3)
addpd(xmm4, xmm15)
movaps(xmm2, xmm4)
pshufd(imm(0x4e), xmm2, xmm7)
mulpd(xmm0, xmm2)
mulpd(xmm1, xmm4)
addpd(xmm5, xmm10)
addpd(xmm6, xmm14)
movaps(xmm7, xmm6)
mulpd(xmm0, xmm7)
mulpd(xmm1, xmm6)
addpd(xmm2, xmm9)
movaps(mem(rbx, -6*16), xmm2)
addpd(xmm4, xmm13)
movaps(xmm3, xmm4)
pshufd(imm(0x4e), xmm3, xmm5)
mulpd(xmm0, xmm3)
mulpd(xmm1, xmm4)
addpd(xmm7, xmm8)
addpd(xmm6, xmm12)
movaps(xmm5, xmm6)
mulpd(xmm0, xmm5)
movaps(mem(rax, -6*16), xmm0)
mulpd(xmm1, xmm6)
movaps(mem(rax, -5*16), xmm1)
sub(imm(0-4*1*8), rax) // a += 4 (1 x mr)
sub(imm(0-4*1*8), rbx) // b += 4 (1 x nr)
dec(rsi) // i -= 1;
jne(.LOOPKLEFT) // iterate again if i != 0.
label(.POSTACCUM)
addpd(xmm3, xmm11)
addpd(xmm4, xmm15)
addpd(xmm5, xmm10)
addpd(xmm6, xmm14)
mov(%5, rbx) // load address of b11.
// xmm8: xmm9: xmm10: xmm11:
// ( ab01 ( ab00 ( ab03 ( ab02
// ab10 ) ab11 ) ab12 ) ab13 )
//
// xmm12: xmm13: xmm14: xmm15:
// ( ab21 ( ab20 ( ab23 ( ab22
// ab30 ) ab31 ) ab32 ) ab33 )
movaps(xmm9, xmm0)
movaps(xmm8, xmm1)
unpcklpd(xmm8, xmm0)
unpckhpd(xmm9, xmm1)
movaps(xmm11, xmm4)
movaps(xmm10, xmm5)
unpcklpd(xmm10, xmm4)
unpckhpd(xmm11, xmm5)
movaps(xmm13, xmm2)
movaps(xmm12, xmm3)
unpcklpd(xmm12, xmm2)
unpckhpd(xmm13, xmm3)
movaps(xmm15, xmm6)
movaps(xmm14, xmm7)
unpcklpd(xmm14, xmm6)
unpckhpd(xmm15, xmm7)
// xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 )
// xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 )
// xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
// xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
mov(%9, rax) // load address of alpha
movddup(mem(rax), xmm15) // load alpha and duplicate
movaps(mem(rbx, 0*16), xmm8)
movaps(mem(rbx, 1*16), xmm12)
mulpd(xmm15, xmm8) // xmm8 = alpha * ( beta00 beta01 )
mulpd(xmm15, xmm12) // xmm12 = alpha * ( beta02 beta03 )
movaps(mem(rbx, 2*16), xmm9)
movaps(mem(rbx, 3*16), xmm13)
mulpd(xmm15, xmm9) // xmm9 = alpha * ( beta10 beta11 )
mulpd(xmm15, xmm13) // xmm13 = alpha * ( beta12 beta13 )
movaps(mem(rbx, 4*16), xmm10)
movaps(mem(rbx, 5*16), xmm14)
mulpd(xmm15, xmm10) // xmm10 = alpha * ( beta20 beta21 )
mulpd(xmm15, xmm14) // xmm14 = alpha * ( beta22 beta23 )
movaps(mem(rbx, 6*16), xmm11)
mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 )
mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 )
// (Now scaled by alpha:)
// xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 )
// xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 )
// xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 )
// xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 )
subpd(xmm0, xmm8) // xmm8 -= xmm0
subpd(xmm1, xmm9) // xmm9 -= xmm1
subpd(xmm2, xmm10) // xmm10 -= xmm2
subpd(xmm3, xmm11) // xmm11 -= xmm3
subpd(xmm4, xmm12) // xmm12 -= xmm4
subpd(xmm5, xmm13) // xmm13 -= xmm5
subpd(xmm6, xmm14) // xmm14 -= xmm6
subpd(xmm7, xmm15) // xmm15 -= xmm7
label(.TRSM)
mov(%3, rax) // load address of a11
mov(%6, rcx) // load address of c11
mov(%7, rsi) // load rs_c
mov(%8, rdi) // load cs_c
sal(imm(3), rsi) // rs_c *= sizeof( double )
sal(imm(3), rdi) // cs_c *= sizeof( double )
lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c
// iteration 0
movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00)
mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00);
mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1]
movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0]
movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
add(rsi, rcx) // c11 += rs_c
add(rsi, rdx) // c11_2 += rs_c
// iteration 1
movddup(mem(1+0*4)*8(rax), xmm0) // load xmm0 = alpha10
movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11)
movaps(xmm0, xmm4) // xmm4 = xmm0
mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 )
mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 )
subpd(xmm0, xmm9) // xmm9 -= xmm0
subpd(xmm4, xmm13) // xmm13 -= xmm4
mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11);
mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11);
movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1]
movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0]
movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
add(rsi, rcx) // c11 += rs_c
add(rsi, rdx) // c11_2 += rs_c
// iteration 2
movddup(mem(2+0*4)*8(rax), xmm0) // load xmm0 = alpha20
movddup(mem(2+1*4)*8(rax), xmm1) // load xmm1 = alpha21
movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22)
movaps(xmm0, xmm4) // xmm4 = xmm0
movaps(xmm1, xmm5) // xmm5 = xmm1
mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 )
mulpd(xmm12, xmm4) // xmm4 = alpha20 * ( beta02 beta03 )
mulpd(xmm9, xmm1) // xmm1 = alpha21 * ( beta10 beta11 )
mulpd(xmm13, xmm5) // xmm5 = alpha21 * ( beta12 beta13 )
addpd(xmm1, xmm0) // xmm0 += xmm1;
addpd(xmm5, xmm4) // xmm4 += xmm5;
subpd(xmm0, xmm10) // xmm10 -= xmm0
subpd(xmm4, xmm14) // xmm14 -= xmm4
mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22);
mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22);
movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1]
movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0]
movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
add(rsi, rcx) // c11 += rs_c
add(rsi, rdx) // c11_2 += rs_c
// iteration 3
movddup(mem(3+0*4)*8(rax), xmm0) // load xmm0 = alpha30
movddup(mem(3+1*4)*8(rax), xmm1) // load xmm1 = alpha31
movddup(mem(3+2*4)*8(rax), xmm2) // load xmm2 = alpha32
movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33)
movaps(xmm0, xmm4) // xmm4 = xmm0
movaps(xmm1, xmm5) // xmm5 = xmm1
movaps(xmm2, xmm6) // xmm6 = xmm2
mulpd(xmm8, xmm0) // xmm0 = alpha30 * ( beta00 beta01 )
mulpd(xmm12, xmm4) // xmm4 = alpha30 * ( beta02 beta03 )
mulpd(xmm9, xmm1) // xmm1 = alpha31 * ( beta10 beta11 )
mulpd(xmm13, xmm5) // xmm5 = alpha31 * ( beta12 beta13 )
mulpd(xmm10, xmm2) // xmm2 = alpha32 * ( beta20 beta21 )
mulpd(xmm14, xmm6) // xmm6 = alpha32 * ( beta22 beta23 )
addpd(xmm1, xmm0) // xmm0 += xmm1;
addpd(xmm5, xmm4) // xmm4 += xmm5;
addpd(xmm2, xmm0) // xmm0 += xmm2;
addpd(xmm6, xmm4) // xmm4 += xmm6;
subpd(xmm0, xmm11) // xmm11 -= xmm0
subpd(xmm4, xmm15) // xmm15 -= xmm4
mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1]
movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0]
movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
: // output operands (none)
: // input operands
@@ -540,3 +543,4 @@ void bli_dgemmtrsm_l_penryn_asm_4x4
}
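
The hunk above shows the same kernel body twice: the raw-string version on its way out and the macro version coming in. A few representative pairs, with AT&T source-first operand order preserved since this file selects BLIS_ASM_SYNTAX_ATT:

// before (raw GAS string)                   ->  after (macro form)
// "movq %2, %%rax \n\t"                     ->  mov(%2, rax)
// "movaps -8 * 16(%%rax), %%xmm0 \n\t"      ->  movaps(mem(rax, -8*16), xmm0)
// "pshufd $0x4e, %%xmm2, %%xmm7 \n\t"       ->  pshufd(imm(0x4e), xmm2, xmm7)
// ".LOOPKITER: \n\t"                        ->  label(.LOOPKITER)
// "decq %%rsi \n\t"                         ->  dec(rsi)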

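For reference, each .TRSM iteration in the kernel above performs one step of forward substitution on the 4x4 triangular block, with the diagonal stored pre-inverted (hence the (1/alpha00) ... (1/alpha33) comments): for row i = 0, 1, 2, 3,

\beta_{i\ast} \leftarrow \alpha_{ii}^{-1} \left( \beta_{i\ast} - \sum_{j<i} \alpha_{ij}\, \beta_{j\ast} \right)

where \beta_{i\ast} is the 1x4 row of b11 held in an xmm register pair. The upper-triangular kernel below runs the same update as back substitution, starting at row 3 with the sum taken over j > i.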

@@ -34,6 +34,9 @@
#include "blis.h"
#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"
#if 0
void bli_sgemmtrsm_u_penryn_asm_8x4
(
@@ -75,432 +78,432 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
__asm__ volatile
(
" \n\t"
"movq %2, %%rax \n\t" // load address of a12.
"movq %4, %%rbx \n\t" // load address of b21.
//"movq %10, %%r9 \n\t" // load address of b_next.
" \n\t"
"addq $8 * 16, %%rax \n\t" // increment pointers to allow byte
"addq $8 * 16, %%rbx \n\t" // offsets in the unrolled iterations.
" \n\t"
"movaps -8 * 16(%%rax), %%xmm0 \n\t" // initialize loop by pre-loading elements
"movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b.
"movaps -8 * 16(%%rbx), %%xmm2 \n\t"
" \n\t"
"xorpd %%xmm3, %%xmm3 \n\t"
"xorpd %%xmm4, %%xmm4 \n\t"
"xorpd %%xmm5, %%xmm5 \n\t"
"xorpd %%xmm6, %%xmm6 \n\t"
" \n\t"
"xorpd %%xmm8, %%xmm8 \n\t"
"movaps %%xmm8, %%xmm9 \n\t"
"movaps %%xmm8, %%xmm10 \n\t"
"movaps %%xmm8, %%xmm11 \n\t"
"movaps %%xmm8, %%xmm12 \n\t"
"movaps %%xmm8, %%xmm13 \n\t"
"movaps %%xmm8, %%xmm14 \n\t"
"movaps %%xmm8, %%xmm15 \n\t"
" \n\t"
" \n\t"
" \n\t"
"movq %0, %%rsi \n\t" // i = k_iter;
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
"je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that
" \n\t" // contains the k_left loop.
" \n\t"
" \n\t"
".LOOPKITER: \n\t" // MAIN LOOP
" \n\t"
"prefetcht0 1264(%%rax) \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 0
"movaps -7 * 16(%%rbx), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"movaps %%xmm2, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t"
"mulpd %%xmm0, %%xmm2 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
"movaps %%xmm7, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm7 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t"
"movaps -6 * 16(%%rbx), %%xmm2 \n\t"
"addpd %%xmm4, %%xmm13 \n\t"
"movaps %%xmm3, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t"
"mulpd %%xmm0, %%xmm3 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t"
"addpd %%xmm6, %%xmm12 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm5 \n\t"
"movaps -6 * 16(%%rax), %%xmm0 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
"movaps -5 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 1
"movaps -5 * 16(%%rbx), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"movaps %%xmm2, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t"
"mulpd %%xmm0, %%xmm2 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
"movaps %%xmm7, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm7 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t"
"movaps -4 * 16(%%rbx), %%xmm2 \n\t"
"addpd %%xmm4, %%xmm13 \n\t"
"movaps %%xmm3, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t"
"mulpd %%xmm0, %%xmm3 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t"
"addpd %%xmm6, %%xmm12 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm5 \n\t"
"movaps -4 * 16(%%rax), %%xmm0 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
"movaps -3 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
"prefetcht0 1328(%%rax) \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 2
"movaps -3 * 16(%%rbx), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"movaps %%xmm2, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t"
"mulpd %%xmm0, %%xmm2 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
"movaps %%xmm7, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm7 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t"
"movaps -2 * 16(%%rbx), %%xmm2 \n\t"
"addpd %%xmm4, %%xmm13 \n\t"
"movaps %%xmm3, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t"
"mulpd %%xmm0, %%xmm3 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t"
"addpd %%xmm6, %%xmm12 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm5 \n\t"
"movaps -2 * 16(%%rax), %%xmm0 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
"movaps -1 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 3
"movaps -1 * 16(%%rbx), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"movaps %%xmm2, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t"
"mulpd %%xmm0, %%xmm2 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
"movaps %%xmm7, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm7 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
" \n\t"
"addq $4 * 4 * 8, %%rax \n\t" // a += 4*4 (unroll x mr)
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t"
"movaps 0 * 16(%%rbx), %%xmm2 \n\t"
"addpd %%xmm4, %%xmm13 \n\t"
"movaps %%xmm3, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t"
"mulpd %%xmm0, %%xmm3 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr)
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t"
"addpd %%xmm6, %%xmm12 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm5 \n\t"
"movaps -8 * 16(%%rax), %%xmm0 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
"movaps -7 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1;
"jne .LOOPKITER \n\t" // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".CONSIDERKLEFT: \n\t"
" \n\t"
"movq %1, %%rsi \n\t" // i = k_left;
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
"je .POSTACCUM \n\t" // if i == 0, we're done; jump to end.
" \n\t" // else, we prepare to enter k_left loop.
" \n\t"
" \n\t"
".LOOPKLEFT: \n\t" // EDGE LOOP
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t" // iteration 0
"movaps -7 * 16(%%rbx), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"movaps %%xmm2, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t"
"mulpd %%xmm0, %%xmm2 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
"movaps %%xmm7, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm7 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t"
"movaps -6 * 16(%%rbx), %%xmm2 \n\t"
"addpd %%xmm4, %%xmm13 \n\t"
"movaps %%xmm3, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t"
"mulpd %%xmm0, %%xmm3 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t"
"addpd %%xmm6, %%xmm12 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm5 \n\t"
"movaps -6 * 16(%%rax), %%xmm0 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
"movaps -5 * 16(%%rax), %%xmm1 \n\t"
" \n\t"
" \n\t"
"addq $4 * 1 * 8, %%rax \n\t" // a += 4 (1 x mr)
"addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr)
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1;
"jne .LOOPKLEFT \n\t" // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
".POSTACCUM: \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
" \n\t"
" \n\t"
" \n\t"
"movq %5, %%rbx \n\t" // load address of b11.
" \n\t"
" \n\t" // xmm8: xmm9: xmm10: xmm11:
" \n\t" // ( ab01 ( ab00 ( ab03 ( ab02
" \n\t" // ab10 ) ab11 ) ab12 ) ab13 )
" \n\t" //
" \n\t" // xmm12: xmm13: xmm14: xmm15:
" \n\t" // ( ab21 ( ab20 ( ab23 ( ab22
" \n\t" // ab30 ) ab31 ) ab32 ) ab33 )
"movaps %%xmm9, %%xmm0 \n\t"
"movaps %%xmm8, %%xmm1 \n\t"
"unpcklpd %%xmm8, %%xmm0 \n\t"
"unpckhpd %%xmm9, %%xmm1 \n\t"
" \n\t"
"movaps %%xmm11, %%xmm4 \n\t"
"movaps %%xmm10, %%xmm5 \n\t"
"unpcklpd %%xmm10, %%xmm4 \n\t"
"unpckhpd %%xmm11, %%xmm5 \n\t"
" \n\t"
"movaps %%xmm13, %%xmm2 \n\t"
"movaps %%xmm12, %%xmm3 \n\t"
"unpcklpd %%xmm12, %%xmm2 \n\t"
"unpckhpd %%xmm13, %%xmm3 \n\t"
" \n\t"
"movaps %%xmm15, %%xmm6 \n\t"
"movaps %%xmm14, %%xmm7 \n\t"
"unpcklpd %%xmm14, %%xmm6 \n\t"
"unpckhpd %%xmm15, %%xmm7 \n\t"
" \n\t"
" \n\t" // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 )
" \n\t" // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 )
" \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
" \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
" \n\t"
"movq %9, %%rax \n\t" // load address of alpha
"movddup (%%rax), %%xmm15 \n\t" // load alpha and duplicate
" \n\t"
"movaps 0 * 16(%%rbx), %%xmm8 \n\t"
"movaps 1 * 16(%%rbx), %%xmm12 \n\t"
"mulpd %%xmm15, %%xmm8 \n\t" // xmm8 = alpha * ( beta00 beta01 )
"mulpd %%xmm15, %%xmm12 \n\t" // xmm12 = alpha * ( beta02 beta03 )
"movaps 2 * 16(%%rbx), %%xmm9 \n\t"
"movaps 3 * 16(%%rbx), %%xmm13 \n\t"
"mulpd %%xmm15, %%xmm9 \n\t" // xmm9 = alpha * ( beta10 beta11 )
"mulpd %%xmm15, %%xmm13 \n\t" // xmm13 = alpha * ( beta12 beta13 )
"movaps 4 * 16(%%rbx), %%xmm10 \n\t"
"movaps 5 * 16(%%rbx), %%xmm14 \n\t"
"mulpd %%xmm15, %%xmm10 \n\t" // xmm10 = alpha * ( beta20 beta21 )
"mulpd %%xmm15, %%xmm14 \n\t" // xmm14 = alpha * ( beta22 beta23 )
"movaps 6 * 16(%%rbx), %%xmm11 \n\t"
"mulpd %%xmm15, %%xmm11 \n\t" // xmm11 = alpha * ( beta30 beta31 )
"mulpd 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = alpha * ( beta32 beta33 )
" \n\t"
" \n\t" // (Now scaled by alpha:)
" \n\t" // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 )
" \n\t" // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 )
" \n\t" // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 )
" \n\t" // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 )
" \n\t"
"subpd %%xmm0, %%xmm8 \n\t" // xmm8 -= xmm0
"subpd %%xmm1, %%xmm9 \n\t" // xmm9 -= xmm1
"subpd %%xmm2, %%xmm10 \n\t" // xmm10 -= xmm2
"subpd %%xmm3, %%xmm11 \n\t" // xmm11 -= xmm3
"subpd %%xmm4, %%xmm12 \n\t" // xmm12 -= xmm4
"subpd %%xmm5, %%xmm13 \n\t" // xmm13 -= xmm5
"subpd %%xmm6, %%xmm14 \n\t" // xmm14 -= xmm6
"subpd %%xmm7, %%xmm15 \n\t" // xmm15 -= xmm7
" \n\t"
" \n\t"
" \n\t"
".TRSM: \n\t"
" \n\t"
" \n\t"
"movq %3, %%rax \n\t" // load address of a11
"movq %6, %%rcx \n\t" // load address of c11
" \n\t"
"movq %7, %%rsi \n\t" // load rs_c
"movq %8, %%rdi \n\t" // load cs_c
"salq $3, %%rsi \n\t" // rs_c *= sizeof( double )
"salq $3, %%rdi \n\t" // cs_c *= sizeof( double )
" \n\t"
"addq %%rsi, %%rcx \n\t" // c11 += (4-1)*rs_c
"addq %%rsi, %%rcx \n\t"
"addq %%rsi, %%rcx \n\t"
"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // c11_2 = c11 + 2*cs_c;
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 0
" \n\t"
"movddup (3+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = (1/alpha33)
" \n\t"
"mulpd %%xmm3, %%xmm11 \n\t" // xmm11 *= (1/alpha33);
"mulpd %%xmm3, %%xmm15 \n\t" // xmm15 *= (1/alpha33);
" \n\t"
"movaps %%xmm11, 6 * 16(%%rbx) \n\t" // store ( beta30 beta31 ) = xmm11
"movaps %%xmm15, 7 * 16(%%rbx) \n\t" // store ( beta32 beta33 ) = xmm15
"movlpd %%xmm11, (%%rcx) \n\t" // store ( gamma30 ) = xmm11[0]
"movhpd %%xmm11, (%%rcx,%%rdi) \n\t" // store ( gamma31 ) = xmm11[1]
"movlpd %%xmm15, (%%rdx) \n\t" // store ( gamma32 ) = xmm15[0]
"movhpd %%xmm15, (%%rdx,%%rdi) \n\t" // store ( gamma33 ) = xmm15[1]
"subq %%rsi, %%rcx \n\t" // c11 -= rs_c
"subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 1
" \n\t"
"movddup (2+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = (1/alpha22)
"movddup (2+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha23
" \n\t"
"movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3
"mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha23 * ( beta30 beta31 )
"mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha23 * ( beta32 beta33 )
"subpd %%xmm3, %%xmm10 \n\t" // xmm10 -= xmm3
"subpd %%xmm7, %%xmm14 \n\t" // xmm14 -= xmm7
"mulpd %%xmm2, %%xmm10 \n\t" // xmm10 *= (1/alpha22);
"mulpd %%xmm2, %%xmm14 \n\t" // xmm14 *= (1/alpha22);
" \n\t"
"movaps %%xmm10, 4 * 16(%%rbx) \n\t" // store ( beta20 beta21 ) = xmm10
"movaps %%xmm14, 5 * 16(%%rbx) \n\t" // store ( beta22 beta23 ) = xmm14
"movlpd %%xmm10, (%%rcx) \n\t" // store ( gamma20 ) = xmm10[0]
"movhpd %%xmm10, (%%rcx,%%rdi) \n\t" // store ( gamma21 ) = xmm10[1]
"movlpd %%xmm14, (%%rdx) \n\t" // store ( gamma22 ) = xmm14[0]
"movhpd %%xmm14, (%%rdx,%%rdi) \n\t" // store ( gamma23 ) = xmm14[1]
"subq %%rsi, %%rcx \n\t" // c11 -= rs_c
"subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 2
" \n\t"
"movddup (1+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = (1/alpha11)
"movddup (1+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha12
"movddup (1+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha13
" \n\t"
"movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2
"movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3
"mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha12 * ( beta20 beta21 )
"mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha12 * ( beta22 beta23 )
"mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha13 * ( beta30 beta31 )
"mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha13 * ( beta32 beta33 )
"addpd %%xmm3, %%xmm2 \n\t" // xmm2 += xmm3;
"addpd %%xmm7, %%xmm6 \n\t" // xmm6 += xmm7;
"subpd %%xmm2, %%xmm9 \n\t" // xmm9 -= xmm2
"subpd %%xmm6, %%xmm13 \n\t" // xmm13 -= xmm6
"mulpd %%xmm1, %%xmm9 \n\t" // xmm9 *= (1/alpha11);
"mulpd %%xmm1, %%xmm13 \n\t" // xmm13 *= (1/alpha11);
" \n\t"
"movaps %%xmm9, 2 * 16(%%rbx) \n\t" // store ( beta10 beta11 ) = xmm9
"movaps %%xmm13, 3 * 16(%%rbx) \n\t" // store ( beta12 beta13 ) = xmm13
"movlpd %%xmm9, (%%rcx) \n\t" // store ( gamma10 ) = xmm9[0]
"movhpd %%xmm9, (%%rcx,%%rdi) \n\t" // store ( gamma11 ) = xmm9[1]
"movlpd %%xmm13, (%%rdx) \n\t" // store ( gamma12 ) = xmm13[0]
"movhpd %%xmm13, (%%rdx,%%rdi) \n\t" // store ( gamma13 ) = xmm13[1]
"subq %%rsi, %%rcx \n\t" // c11 -= rs_c
"subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 3
" \n\t"
"movddup (0+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = (1/alpha00)
"movddup (0+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha01
"movddup (0+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha02
"movddup (0+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha03
" \n\t"
"movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1
"movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2
"movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3
"mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha01 * ( beta10 beta11 )
"mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha01 * ( beta12 beta13 )
"mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha02 * ( beta20 beta21 )
"mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha02 * ( beta22 beta23 )
"mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha03 * ( beta30 beta31 )
"mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha03 * ( beta32 beta33 )
"addpd %%xmm2, %%xmm1 \n\t" // xmm1 += xmm2;
"addpd %%xmm6, %%xmm5 \n\t" // xmm5 += xmm6;
"addpd %%xmm3, %%xmm1 \n\t" // xmm1 += xmm3;
"addpd %%xmm7, %%xmm5 \n\t" // xmm5 += xmm7;
"subpd %%xmm1, %%xmm8 \n\t" // xmm8 -= xmm1
"subpd %%xmm5, %%xmm12 \n\t" // xmm12 -= xmm5
"mulpd %%xmm0, %%xmm8 \n\t" // xmm8 *= (1/alpha00);
"mulpd %%xmm0, %%xmm12 \n\t" // xmm12 *= (1/alpha00);
" \n\t"
"movaps %%xmm8, 0 * 16(%%rbx) \n\t" // store ( beta00 beta01 ) = xmm8
"movaps %%xmm12, 1 * 16(%%rbx) \n\t" // store ( beta02 beta03 ) = xmm12
"movlpd %%xmm8, (%%rcx) \n\t" // store ( gamma00 ) = xmm8[0]
"movhpd %%xmm8, (%%rcx,%%rdi) \n\t" // store ( gamma01 ) = xmm8[1]
"movlpd %%xmm12, (%%rdx) \n\t" // store ( gamma02 ) = xmm12[0]
"movhpd %%xmm12, (%%rdx,%%rdi) \n\t" // store ( gamma03 ) = xmm12[1]
" \n\t"
" \n\t"
" \n\t"
mov(%2, rax) // load address of a12.
mov(%4, rbx) // load address of b21.
//mov(%10, r9) // load address of b_next.
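// (Macro-form note: this kernel presumably defines BLIS_ASM_SYNTAX_ATT, as
// the trsm kernels later in this diff do, so the lowercase macros keep AT&T
// operand order -- source first, destination last -- and mem(reg, disp)
// stands for the AT&T memory operand disp(reg); e.g. mov(%2, rax) above
// corresponds to a former "movq %2, %%rax" string.)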
add(imm(8*16), rax) // increment pointers to allow byte
add(imm(8*16), rbx) // offsets in the unrolled iterations.
movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements
movaps(mem(rax, -7*16), xmm1) // of a and b.
movaps(mem(rbx, -8*16), xmm2)
xorpd(xmm3, xmm3)
xorpd(xmm4, xmm4)
xorpd(xmm5, xmm5)
xorpd(xmm6, xmm6)
xorpd(xmm8, xmm8)
movaps(xmm8, xmm9)
movaps(xmm8, xmm10)
movaps(xmm8, xmm11)
movaps(xmm8, xmm12)
movaps(xmm8, xmm13)
movaps(xmm8, xmm14)
movaps(xmm8, xmm15)
mov(%0, rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.CONSIDERKLEFT) // if i == 0, jump to code that
// contains the k_left loop.
label(.LOOPKITER) // MAIN LOOP
prefetch(0, mem(rax, 1264))
addpd(xmm3, xmm11) // iteration 0
movaps(mem(rbx, -7*16), xmm3)
addpd(xmm4, xmm15)
movaps(xmm2, xmm4)
pshufd(imm(0x4e), xmm2, xmm7)
mulpd(xmm0, xmm2)
mulpd(xmm1, xmm4)
addpd(xmm5, xmm10)
addpd(xmm6, xmm14)
movaps(xmm7, xmm6)
mulpd(xmm0, xmm7)
mulpd(xmm1, xmm6)
addpd(xmm2, xmm9)
movaps(mem(rbx, -6*16), xmm2)
addpd(xmm4, xmm13)
movaps(xmm3, xmm4)
pshufd(imm(0x4e), xmm3, xmm5)
mulpd(xmm0, xmm3)
mulpd(xmm1, xmm4)
addpd(xmm7, xmm8)
addpd(xmm6, xmm12)
movaps(xmm5, xmm6)
mulpd(xmm0, xmm5)
movaps(mem(rax, -6*16), xmm0)
mulpd(xmm1, xmm6)
movaps(mem(rax, -5*16), xmm1)
addpd(xmm3, xmm11) // iteration 1
movaps(mem(rbx, -5*16), xmm3)
addpd(xmm4, xmm15)
movaps(xmm2, xmm4)
pshufd(imm(0x4e), xmm2, xmm7)
mulpd(xmm0, xmm2)
mulpd(xmm1, xmm4)
addpd(xmm5, xmm10)
addpd(xmm6, xmm14)
movaps(xmm7, xmm6)
mulpd(xmm0, xmm7)
mulpd(xmm1, xmm6)
addpd(xmm2, xmm9)
movaps(mem(rbx, -4*16), xmm2)
addpd(xmm4, xmm13)
movaps(xmm3, xmm4)
pshufd(imm(0x4e), xmm3, xmm5)
mulpd(xmm0, xmm3)
mulpd(xmm1, xmm4)
addpd(xmm7, xmm8)
addpd(xmm6, xmm12)
movaps(xmm5, xmm6)
mulpd(xmm0, xmm5)
movaps(mem(rax, -4*16), xmm0)
mulpd(xmm1, xmm6)
movaps(mem(rax, -3*16), xmm1)
prefetch(0, mem(rax, 1328))
addpd(xmm3, xmm11) // iteration 2
movaps(mem(rbx, -3*16), xmm3)
addpd(xmm4, xmm15)
movaps(xmm2, xmm4)
pshufd(imm(0x4e), xmm2, xmm7)
mulpd(xmm0, xmm2)
mulpd(xmm1, xmm4)
addpd(xmm5, xmm10)
addpd(xmm6, xmm14)
movaps(xmm7, xmm6)
mulpd(xmm0, xmm7)
mulpd(xmm1, xmm6)
addpd(xmm2, xmm9)
movaps(mem(rbx, -2*16), xmm2)
addpd(xmm4, xmm13)
movaps(xmm3, xmm4)
pshufd(imm(0x4e), xmm3, xmm5)
mulpd(xmm0, xmm3)
mulpd(xmm1, xmm4)
addpd(xmm7, xmm8)
addpd(xmm6, xmm12)
movaps(xmm5, xmm6)
mulpd(xmm0, xmm5)
movaps(mem(rax, -2*16), xmm0)
mulpd(xmm1, xmm6)
movaps(mem(rax, -1*16), xmm1)
addpd(xmm3, xmm11) // iteration 3
movaps(mem(rbx, -1*16), xmm3)
addpd(xmm4, xmm15)
movaps(xmm2, xmm4)
pshufd(imm(0x4e), xmm2, xmm7)
mulpd(xmm0, xmm2)
mulpd(xmm1, xmm4)
addpd(xmm5, xmm10)
addpd(xmm6, xmm14)
movaps(xmm7, xmm6)
mulpd(xmm0, xmm7)
mulpd(xmm1, xmm6)
add(imm(4*4*8), rax) // a += 4*4 (unroll x mr)
addpd(xmm2, xmm9)
movaps(mem(rbx, 0*16), xmm2)
addpd(xmm4, xmm13)
movaps(xmm3, xmm4)
pshufd(imm(0x4e), xmm3, xmm5)
mulpd(xmm0, xmm3)
mulpd(xmm1, xmm4)
add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr)
addpd(xmm7, xmm8)
addpd(xmm6, xmm12)
movaps(xmm5, xmm6)
mulpd(xmm0, xmm5)
movaps(mem(rax, -8*16), xmm0)
mulpd(xmm1, xmm6)
movaps(mem(rax, -7*16), xmm1)
dec(rsi) // i -= 1;
jne(.LOOPKITER) // iterate again if i != 0.
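// (Each unrolled iteration above is one rank-1 update of the 4x4 block:
// pshufd with immediate 0x4e swaps the two doubles of each b register, and
// the swapped and unswapped copies together form the 16 a*b products of
// one k step using eight mulpd.)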
label(.CONSIDERKLEFT)
mov(%1, rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.POSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.
label(.LOOPKLEFT) // EDGE LOOP
addpd(xmm3, xmm11) // iteration 0
movaps(mem(rbx, -7*16), xmm3)
addpd(xmm4, xmm15)
movaps(xmm2, xmm4)
pshufd(imm(0x4e), xmm2, xmm7)
mulpd(xmm0, xmm2)
mulpd(xmm1, xmm4)
addpd(xmm5, xmm10)
addpd(xmm6, xmm14)
movaps(xmm7, xmm6)
mulpd(xmm0, xmm7)
mulpd(xmm1, xmm6)
addpd(xmm2, xmm9)
movaps(mem(rbx, -6*16), xmm2)
addpd(xmm4, xmm13)
movaps(xmm3, xmm4)
pshufd(imm(0x4e), xmm3, xmm5)
mulpd(xmm0, xmm3)
mulpd(xmm1, xmm4)
addpd(xmm7, xmm8)
addpd(xmm6, xmm12)
movaps(xmm5, xmm6)
mulpd(xmm0, xmm5)
movaps(mem(rax, -6*16), xmm0)
mulpd(xmm1, xmm6)
movaps(mem(rax, -5*16), xmm1)
add(imm(4*1*8), rax) // a += 4 (1 x mr)
add(imm(4*1*8), rbx) // b += 4 (1 x nr)
dec(rsi) // i -= 1;
jne(.LOOPKLEFT) // iterate again if i != 0.
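// (The edge loop repeats the same rank-1 update one k step at a time,
// covering the k_left remainder -- presumably k % 4 -- of the unroll-by-4
// main loop above.)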
label(.POSTACCUM)
addpd(xmm3, xmm11)
addpd(xmm4, xmm15)
addpd(xmm5, xmm10)
addpd(xmm6, xmm14)
mov(%5, rbx) // load address of b11.
// xmm8: xmm9: xmm10: xmm11:
// ( ab01 ( ab00 ( ab03 ( ab02
// ab10 ) ab11 ) ab12 ) ab13 )
//
// xmm12: xmm13: xmm14: xmm15:
// ( ab21 ( ab20 ( ab23 ( ab22
// ab30 ) ab31 ) ab32 ) ab33 )
movaps(xmm9, xmm0)
movaps(xmm8, xmm1)
unpcklpd(xmm8, xmm0)
unpckhpd(xmm9, xmm1)
movaps(xmm11, xmm4)
movaps(xmm10, xmm5)
unpcklpd(xmm10, xmm4)
unpckhpd(xmm11, xmm5)
movaps(xmm13, xmm2)
movaps(xmm12, xmm3)
unpcklpd(xmm12, xmm2)
unpckhpd(xmm13, xmm3)
movaps(xmm15, xmm6)
movaps(xmm14, xmm7)
unpcklpd(xmm14, xmm6)
unpckhpd(xmm15, xmm7)
// xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 )
// xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 )
// xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
// xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
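// (unpcklpd/unpckhpd semantics, in the AT&T operand order used above:
//   unpcklpd src, dst : dst = { dst[0], src[0] }
//   unpckhpd src, dst : dst = { dst[1], src[1] }
// hence each unpack destination is first saved with movaps, since both
// unpack results need both source registers intact.)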
mov(%9, rax) // load address of alpha
movddup(mem(rax), xmm15) // load alpha and duplicate
movaps(mem(rbx, 0*16), xmm8)
movaps(mem(rbx, 1*16), xmm12)
mulpd(xmm15, xmm8) // xmm8 = alpha * ( beta00 beta01 )
mulpd(xmm15, xmm12) // xmm12 = alpha * ( beta02 beta03 )
movaps(mem(rbx, 2*16), xmm9)
movaps(mem(rbx, 3*16), xmm13)
mulpd(xmm15, xmm9) // xmm9 = alpha * ( beta10 beta11 )
mulpd(xmm15, xmm13) // xmm13 = alpha * ( beta12 beta13 )
movaps(mem(rbx, 4*16), xmm10)
movaps(mem(rbx, 5*16), xmm14)
mulpd(xmm15, xmm10) // xmm10 = alpha * ( beta20 beta21 )
mulpd(xmm15, xmm14) // xmm14 = alpha * ( beta22 beta23 )
movaps(mem(rbx, 6*16), xmm11)
mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 )
mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 )
// (Now scaled by alpha:)
// xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 )
// xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 )
// xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 )
// xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 )
subpd(xmm0, xmm8) // xmm8 -= xmm0
subpd(xmm1, xmm9) // xmm9 -= xmm1
subpd(xmm2, xmm10) // xmm10 -= xmm2
subpd(xmm3, xmm11) // xmm11 -= xmm3
subpd(xmm4, xmm12) // xmm12 -= xmm4
subpd(xmm5, xmm13) // xmm13 -= xmm5
subpd(xmm6, xmm14) // xmm14 -= xmm6
subpd(xmm7, xmm15) // xmm15 -= xmm7
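// (At this point the kernel has formed B11 := alpha*B11 - A12*B21, the
// GEMM part of gemmtrsm; the triangular solve with A11 follows.)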
label(.TRSM)
mov(%3, rax) // load address of a11
mov(%6, rcx) // load address of c11
mov(%7, rsi) // load rs_c
mov(%8, rdi) // load cs_c
sal(imm(3), rsi) // rs_c *= sizeof( double )
sal(imm(3), rdi) // cs_c *= sizeof( double )
add(rsi, rcx) // c11 += (4-1)*rs_c
add(rsi, rcx)
add(rsi, rcx)
lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c;
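// (The four iterations below are back substitution for the upper
// triangular solve: row i of B11 becomes
//   beta(i,:) = ( beta(i,:) - sum_{j>i} alpha(i,j)*beta(j,:) ) * (1/alpha(i,i)),
// working upward from row 3 to row 0; the diagonal entries are already
// stored as reciprocals, per the (1/alpha..) loads.)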
// iteration 0
movddup(mem(rax, (3+3*4)*8), xmm3) // load xmm3 = (1/alpha33)
mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1]
movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0]
movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
sub(rsi, rcx) // c11 -= rs_c
sub(rsi, rdx) // c11_2 -= rs_c
// iteration 1
movddup(mem(rax, (2+2*4)*8), xmm2) // load xmm2 = (1/alpha22)
movddup(mem(rax, (2+3*4)*8), xmm3) // load xmm3 = alpha23
movaps(xmm3, xmm7) // xmm7 = xmm3
mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 )
mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 )
subpd(xmm3, xmm10) // xmm10 -= xmm3
subpd(xmm7, xmm14) // xmm14 -= xmm7
mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22);
mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22);
movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1]
movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0]
movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
sub(rsi, rcx) // c11 -= rs_c
sub(rsi, rdx) // c11_2 -= rs_c
// iteration 2
movddup(mem(rax, (1+1*4)*8), xmm1) // load xmm1 = (1/alpha11)
movddup(mem(rax, (1+2*4)*8), xmm2) // load xmm2 = alpha12
movddup(mem(rax, (1+3*4)*8), xmm3) // load xmm3 = alpha13
movaps(xmm2, xmm6) // xmm6 = xmm2
movaps(xmm3, xmm7) // xmm7 = xmm3
mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 )
mulpd(xmm14, xmm6) // xmm6 = alpha12 * ( beta22 beta23 )
mulpd(xmm11, xmm3) // xmm3 = alpha13 * ( beta30 beta31 )
mulpd(xmm15, xmm7) // xmm7 = alpha13 * ( beta32 beta33 )
addpd(xmm3, xmm2) // xmm2 += xmm3;
addpd(xmm7, xmm6) // xmm6 += xmm7;
subpd(xmm2, xmm9) // xmm9 -= xmm2
subpd(xmm6, xmm13) // xmm13 -= xmm6
mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11);
mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11);
movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1]
movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0]
movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
sub(rsi, rcx) // c11 -= rs_c
sub(rsi, rdx) // c11_2 -= rs_c
// iteration 3
movddup(mem(rax, (0+0*4)*8), xmm0) // load xmm0 = (1/alpha00)
movddup(mem(rax, (0+1*4)*8), xmm1) // load xmm1 = alpha01
movddup(mem(rax, (0+2*4)*8), xmm2) // load xmm2 = alpha02
movddup(mem(rax, (0+3*4)*8), xmm3) // load xmm3 = alpha03
movaps(xmm1, xmm5) // xmm5 = xmm1
movaps(xmm2, xmm6) // xmm6 = xmm2
movaps(xmm3, xmm7) // xmm7 = xmm3
mulpd(xmm9, xmm1) // xmm1 = alpha01 * ( beta10 beta11 )
mulpd(xmm13, xmm5) // xmm5 = alpha01 * ( beta12 beta13 )
mulpd(xmm10, xmm2) // xmm2 = alpha02 * ( beta20 beta21 )
mulpd(xmm14, xmm6) // xmm6 = alpha02 * ( beta22 beta23 )
mulpd(xmm11, xmm3) // xmm3 = alpha03 * ( beta30 beta31 )
mulpd(xmm15, xmm7) // xmm7 = alpha03 * ( beta32 beta33 )
addpd(xmm2, xmm1) // xmm1 += xmm2;
addpd(xmm6, xmm5) // xmm5 += xmm6;
addpd(xmm3, xmm1) // xmm1 += xmm3;
addpd(xmm7, xmm5) // xmm5 += xmm7;
subpd(xmm1, xmm8) // xmm8 -= xmm1
subpd(xmm5, xmm12) // xmm12 -= xmm5
mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00);
mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1]
movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0]
movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
: // output operands (none)
: // input operands
@@ -526,3 +529,4 @@ void bli_dgemmtrsm_u_penryn_asm_4x4
}


@@ -34,6 +34,9 @@
#include "blis.h"
#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"
#if 0
void bli_strsm_l_penryn_asm_8x4
(
@@ -63,138 +66,138 @@ void bli_dtrsm_l_penryn_asm_4x4
__asm__ volatile
(
" \n\t"
"movq %1, %%rbx \n\t" // load address of b11.
" \n\t"
"movaps 0 * 16(%%rbx), %%xmm8 \n\t" // xmm8 = ( beta00 beta01 )
"movaps 1 * 16(%%rbx), %%xmm12 \n\t" // xmm12 = ( beta02 beta03 )
"movaps 2 * 16(%%rbx), %%xmm9 \n\t" // xmm9 = ( beta10 beta11 )
"movaps 3 * 16(%%rbx), %%xmm13 \n\t" // xmm13 = ( beta12 beta13 )
"movaps 4 * 16(%%rbx), %%xmm10 \n\t" // xmm10 = ( beta20 beta21 )
"movaps 5 * 16(%%rbx), %%xmm14 \n\t" // xmm14 = ( beta22 beta23 )
"movaps 6 * 16(%%rbx), %%xmm11 \n\t" // xmm11 = ( beta30 beta31 )
"movaps 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = ( beta32 beta33 )
" \n\t"
" \n\t"
" \n\t"
"movq %0, %%rax \n\t" // load address of a11
"movq %2, %%rcx \n\t" // load address of c11
" \n\t"
"movq %3, %%rsi \n\t" // load rs_c
"movq %4, %%rdi \n\t" // load cs_c
"salq $3, %%rsi \n\t" // rs_c *= sizeof( double )
"salq $3, %%rdi \n\t" // cs_c *= sizeof( double )
" \n\t"
"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // c11_2 = c11 + 2*cs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 0
" \n\t"
"movddup (0+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = (1/alpha00)
" \n\t"
"mulpd %%xmm0, %%xmm8 \n\t" // xmm8 *= (1/alpha00);
"mulpd %%xmm0, %%xmm12 \n\t" // xmm12 *= (1/alpha00);
" \n\t"
"movaps %%xmm8, 0 * 16(%%rbx) \n\t" // store ( beta00 beta01 ) = xmm8
"movaps %%xmm12, 1 * 16(%%rbx) \n\t" // store ( beta02 beta03 ) = xmm12
"movlpd %%xmm8, (%%rcx) \n\t" // store ( gamma00 ) = xmm8[0]
"movhpd %%xmm8, (%%rcx,%%rdi) \n\t" // store ( gamma01 ) = xmm8[1]
"movlpd %%xmm12, (%%rdx) \n\t" // store ( gamma02 ) = xmm12[0]
"movhpd %%xmm12, (%%rdx,%%rdi) \n\t" // store ( gamma03 ) = xmm12[1]
"addq %%rsi, %%rcx \n\t" // c11 += rs_c
"addq %%rsi, %%rdx \n\t" // c11_2 += rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 1
" \n\t"
"movddup (1+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha10
"movddup (1+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = (1/alpha11)
" \n\t"
"movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0
"mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha10 * ( beta00 beta01 )
"mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha10 * ( beta02 beta03 )
"subpd %%xmm0, %%xmm9 \n\t" // xmm9 -= xmm0
"subpd %%xmm4, %%xmm13 \n\t" // xmm13 -= xmm4
"mulpd %%xmm1, %%xmm9 \n\t" // xmm9 *= (1/alpha11);
"mulpd %%xmm1, %%xmm13 \n\t" // xmm13 *= (1/alpha11);
" \n\t"
"movaps %%xmm9, 2 * 16(%%rbx) \n\t" // store ( beta10 beta11 ) = xmm9
"movaps %%xmm13, 3 * 16(%%rbx) \n\t" // store ( beta12 beta13 ) = xmm13
"movlpd %%xmm9, (%%rcx) \n\t" // store ( gamma10 ) = xmm9[0]
"movhpd %%xmm9, (%%rcx,%%rdi) \n\t" // store ( gamma11 ) = xmm9[1]
"movlpd %%xmm13, (%%rdx) \n\t" // store ( gamma12 ) = xmm13[0]
"movhpd %%xmm13, (%%rdx,%%rdi) \n\t" // store ( gamma13 ) = xmm13[1]
"addq %%rsi, %%rcx \n\t" // c11 += rs_c
"addq %%rsi, %%rdx \n\t" // c11_2 += rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 2
" \n\t"
"movddup (2+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha20
"movddup (2+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha21
"movddup (2+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = (1/alpha22)
" \n\t"
"movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0
"movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1
"mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha20 * ( beta00 beta01 )
"mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha20 * ( beta02 beta03 )
"mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha21 * ( beta10 beta11 )
"mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha21 * ( beta12 beta13 )
"addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1;
"addpd %%xmm5, %%xmm4 \n\t" // xmm4 += xmm5;
"subpd %%xmm0, %%xmm10 \n\t" // xmm10 -= xmm0
"subpd %%xmm4, %%xmm14 \n\t" // xmm14 -= xmm4
"mulpd %%xmm2, %%xmm10 \n\t" // xmm10 *= (1/alpha22);
"mulpd %%xmm2, %%xmm14 \n\t" // xmm14 *= (1/alpha22);
" \n\t"
"movaps %%xmm10, 4 * 16(%%rbx) \n\t" // store ( beta20 beta21 ) = xmm10
"movaps %%xmm14, 5 * 16(%%rbx) \n\t" // store ( beta22 beta23 ) = xmm14
"movlpd %%xmm10, (%%rcx) \n\t" // store ( gamma20 ) = xmm10[0]
"movhpd %%xmm10, (%%rcx,%%rdi) \n\t" // store ( gamma21 ) = xmm10[1]
"movlpd %%xmm14, (%%rdx) \n\t" // store ( gamma22 ) = xmm14[0]
"movhpd %%xmm14, (%%rdx,%%rdi) \n\t" // store ( gamma23 ) = xmm14[1]
"addq %%rsi, %%rcx \n\t" // c11 += rs_c
"addq %%rsi, %%rdx \n\t" // c11_2 += rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 3
" \n\t"
"movddup (3+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = alpha30
"movddup (3+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha31
"movddup (3+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha32
"movddup (3+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = (1/alpha33)
" \n\t"
"movaps %%xmm0, %%xmm4 \n\t" // xmm4 = xmm0
"movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1
"movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2
"mulpd %%xmm8, %%xmm0 \n\t" // xmm0 = alpha30 * ( beta00 beta01 )
"mulpd %%xmm12, %%xmm4 \n\t" // xmm4 = alpha30 * ( beta02 beta03 )
"mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha31 * ( beta10 beta11 )
"mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha31 * ( beta12 beta13 )
"mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha32 * ( beta20 beta21 )
"mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha32 * ( beta22 beta23 )
"addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1;
"addpd %%xmm5, %%xmm4 \n\t" // xmm4 += xmm5;
"addpd %%xmm2, %%xmm0 \n\t" // xmm0 += xmm2;
"addpd %%xmm6, %%xmm4 \n\t" // xmm4 += xmm6;
"subpd %%xmm0, %%xmm11 \n\t" // xmm11 -= xmm0
"subpd %%xmm4, %%xmm15 \n\t" // xmm15 -= xmm4
"mulpd %%xmm3, %%xmm11 \n\t" // xmm11 *= (1/alpha33);
"mulpd %%xmm3, %%xmm15 \n\t" // xmm15 *= (1/alpha33);
" \n\t"
"movaps %%xmm11, 6 * 16(%%rbx) \n\t" // store ( beta30 beta31 ) = xmm11
"movaps %%xmm15, 7 * 16(%%rbx) \n\t" // store ( beta32 beta33 ) = xmm15
"movlpd %%xmm11, (%%rcx) \n\t" // store ( gamma30 ) = xmm11[0]
"movhpd %%xmm11, (%%rcx,%%rdi) \n\t" // store ( gamma31 ) = xmm11[1]
"movlpd %%xmm15, (%%rdx) \n\t" // store ( gamma32 ) = xmm15[0]
"movhpd %%xmm15, (%%rdx,%%rdi) \n\t" // store ( gamma33 ) = xmm15[1]
" \n\t"
" \n\t"
" \n\t"
mov(%1, rbx) // load address of b11.
movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 )
movaps(mem(rbx, 1*16), xmm12) // xmm12 = ( beta02 beta03 )
movaps(mem(rbx, 2*16), xmm9) // xmm9 = ( beta10 beta11 )
movaps(mem(rbx, 3*16), xmm13) // xmm13 = ( beta12 beta13 )
movaps(mem(rbx, 4*16), xmm10) // xmm10 = ( beta20 beta21 )
movaps(mem(rbx, 5*16), xmm14) // xmm14 = ( beta22 beta23 )
movaps(mem(rbx, 6*16), xmm11) // xmm11 = ( beta30 beta31 )
movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 )
mov(%0, rax) // load address of a11
mov(%2, rcx) // load address of c11
mov(%3, rsi) // load rs_c
mov(%4, rdi) // load cs_c
sal(imm(3), rsi) // rs_c *= sizeof( double )
sal(imm(3), rdi) // cs_c *= sizeof( double )
lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c
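// (The four iterations below are forward substitution for the lower
// triangular solve: row i of B11 becomes
//   beta(i,:) = ( beta(i,:) - sum_{j<i} alpha(i,j)*beta(j,:) ) * (1/alpha(i,i)),
// working downward from row 0 to row 3.)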
// iteration 0
movddup(mem(rax, (0+0*4)*8), xmm0) // load xmm0 = (1/alpha00)
mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00);
mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1]
movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0]
movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
add(rsi, rcx) // c11 += rs_c
add(rsi, rdx) // c11_2 += rs_c
// iteration 1
movddup(mem(rax, (1+0*4)*8), xmm0) // load xmm0 = alpha10
movddup(mem(rax, (1+1*4)*8), xmm1) // load xmm1 = (1/alpha11)
movaps(xmm0, xmm4) // xmm4 = xmm0
mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 )
mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 )
subpd(xmm0, xmm9) // xmm9 -= xmm0
subpd(xmm4, xmm13) // xmm13 -= xmm4
mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11);
mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11);
movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1]
movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0]
movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
add(rsi, rcx) // c11 += rs_c
add(rsi, rdx) // c11_2 += rs_c
// iteration 2
movddup(mem(rax, (2+0*4)*8), xmm0) // load xmm0 = alpha20
movddup(mem(rax, (2+1*4)*8), xmm1) // load xmm1 = alpha21
movddup(mem(rax, (2+2*4)*8), xmm2) // load xmm2 = (1/alpha22)
movaps(xmm0, xmm4) // xmm4 = xmm0
movaps(xmm1, xmm5) // xmm5 = xmm1
mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 )
mulpd(xmm12, xmm4) // xmm4 = alpha20 * ( beta02 beta03 )
mulpd(xmm9, xmm1) // xmm1 = alpha21 * ( beta10 beta11 )
mulpd(xmm13, xmm5) // xmm5 = alpha21 * ( beta12 beta13 )
addpd(xmm1, xmm0) // xmm0 += xmm1;
addpd(xmm5, xmm4) // xmm4 += xmm5;
subpd(xmm0, xmm10) // xmm10 -= xmm0
subpd(xmm4, xmm14) // xmm14 -= xmm4
mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22);
mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22);
movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1]
movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0]
movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
add(rsi, rcx) // c11 += rs_c
add(rsi, rdx) // c11_2 += rs_c
// iteration 3
movddup(mem(rax, (3+0*4)*8), xmm0) // load xmm0 = alpha30
movddup(mem(rax, (3+1*4)*8), xmm1) // load xmm1 = alpha31
movddup(mem(rax, (3+2*4)*8), xmm2) // load xmm2 = alpha32
movddup(mem(rax, (3+3*4)*8), xmm3) // load xmm3 = (1/alpha33)
movaps(xmm0, xmm4) // xmm4 = xmm0
movaps(xmm1, xmm5) // xmm5 = xmm1
movaps(xmm2, xmm6) // xmm6 = xmm2
mulpd(xmm8, xmm0) // xmm0 = alpha30 * ( beta00 beta01 )
mulpd(xmm12, xmm4) // xmm4 = alpha30 * ( beta02 beta03 )
mulpd(xmm9, xmm1) // xmm1 = alpha31 * ( beta10 beta11 )
mulpd(xmm13, xmm5) // xmm5 = alpha31 * ( beta12 beta13 )
mulpd(xmm10, xmm2) // xmm2 = alpha32 * ( beta20 beta21 )
mulpd(xmm14, xmm6) // xmm6 = alpha32 * ( beta22 beta23 )
addpd(xmm1, xmm0) // xmm0 += xmm1;
addpd(xmm5, xmm4) // xmm4 += xmm5;
addpd(xmm2, xmm0) // xmm0 += xmm2;
addpd(xmm6, xmm4) // xmm4 += xmm6;
subpd(xmm0, xmm11) // xmm11 -= xmm0
subpd(xmm4, xmm15) // xmm15 -= xmm4
mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1]
movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0]
movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
: // output operands (none)
: // input operands
@@ -214,3 +217,4 @@ void bli_dtrsm_l_penryn_asm_4x4
}


@@ -34,6 +34,9 @@
#include "blis.h"
#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"
#if 0
void bli_strsm_u_penryn_asm_8x4
(
@@ -63,141 +66,141 @@ void bli_dtrsm_u_penryn_asm_4x4
__asm__ volatile
(
" \n\t"
"movq %1, %%rbx \n\t" // load address of b11.
" \n\t"
"movaps 0 * 16(%%rbx), %%xmm8 \n\t" // xmm8 = ( beta00 beta01 )
"movaps 1 * 16(%%rbx), %%xmm12 \n\t" // xmm12 = ( beta02 beta03 )
"movaps 2 * 16(%%rbx), %%xmm9 \n\t" // xmm9 = ( beta10 beta11 )
"movaps 3 * 16(%%rbx), %%xmm13 \n\t" // xmm13 = ( beta12 beta13 )
"movaps 4 * 16(%%rbx), %%xmm10 \n\t" // xmm10 = ( beta20 beta21 )
"movaps 5 * 16(%%rbx), %%xmm14 \n\t" // xmm14 = ( beta22 beta23 )
"movaps 6 * 16(%%rbx), %%xmm11 \n\t" // xmm11 = ( beta30 beta31 )
"movaps 7 * 16(%%rbx), %%xmm15 \n\t" // xmm15 = ( beta32 beta33 )
" \n\t"
" \n\t"
" \n\t"
"movq %0, %%rax \n\t" // load address of a11
"movq %2, %%rcx \n\t" // load address of c11
" \n\t"
"movq %3, %%rsi \n\t" // load rs_c
"movq %4, %%rdi \n\t" // load cs_c
"salq $3, %%rsi \n\t" // rs_c *= sizeof( double )
"salq $3, %%rdi \n\t" // cs_c *= sizeof( double )
" \n\t"
"addq %%rsi, %%rcx \n\t" // c11 += (4-1)*rs_c
"addq %%rsi, %%rcx \n\t"
"addq %%rsi, %%rcx \n\t"
"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // c11_2 = c11 + 2*cs_c;
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 0
" \n\t"
"movddup (3+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = (1/alpha33)
" \n\t"
"mulpd %%xmm3, %%xmm11 \n\t" // xmm11 *= (1/alpha33);
"mulpd %%xmm3, %%xmm15 \n\t" // xmm15 *= (1/alpha33);
" \n\t"
"movaps %%xmm11, 6 * 16(%%rbx) \n\t" // store ( beta30 beta31 ) = xmm11
"movaps %%xmm15, 7 * 16(%%rbx) \n\t" // store ( beta32 beta33 ) = xmm15
"movlpd %%xmm11, (%%rcx) \n\t" // store ( gamma30 ) = xmm11[0]
"movhpd %%xmm11, (%%rcx,%%rdi) \n\t" // store ( gamma31 ) = xmm11[1]
"movlpd %%xmm15, (%%rdx) \n\t" // store ( gamma32 ) = xmm15[0]
"movhpd %%xmm15, (%%rdx,%%rdi) \n\t" // store ( gamma33 ) = xmm15[1]
"subq %%rsi, %%rcx \n\t" // c11 -= rs_c
"subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 1
" \n\t"
"movddup (2+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = (1/alpha22)
"movddup (2+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha23
" \n\t"
"movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3
"mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha23 * ( beta30 beta31 )
"mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha23 * ( beta32 beta33 )
"subpd %%xmm3, %%xmm10 \n\t" // xmm10 -= xmm3
"subpd %%xmm7, %%xmm14 \n\t" // xmm14 -= xmm7
"mulpd %%xmm2, %%xmm10 \n\t" // xmm10 *= (1/alpha22);
"mulpd %%xmm2, %%xmm14 \n\t" // xmm14 *= (1/alpha22);
" \n\t"
"movaps %%xmm10, 4 * 16(%%rbx) \n\t" // store ( beta20 beta21 ) = xmm10
"movaps %%xmm14, 5 * 16(%%rbx) \n\t" // store ( beta22 beta23 ) = xmm14
"movlpd %%xmm10, (%%rcx) \n\t" // store ( gamma20 ) = xmm10[0]
"movhpd %%xmm10, (%%rcx,%%rdi) \n\t" // store ( gamma21 ) = xmm10[1]
"movlpd %%xmm14, (%%rdx) \n\t" // store ( gamma22 ) = xmm14[0]
"movhpd %%xmm14, (%%rdx,%%rdi) \n\t" // store ( gamma23 ) = xmm14[1]
"subq %%rsi, %%rcx \n\t" // c11 -= rs_c
"subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 2
" \n\t"
"movddup (1+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = (1/alpha11)
"movddup (1+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha12
"movddup (1+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha13
" \n\t"
"movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2
"movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3
"mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha12 * ( beta20 beta21 )
"mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha12 * ( beta22 beta23 )
"mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha13 * ( beta30 beta31 )
"mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha13 * ( beta32 beta33 )
"addpd %%xmm3, %%xmm2 \n\t" // xmm2 += xmm3;
"addpd %%xmm7, %%xmm6 \n\t" // xmm6 += xmm7;
"subpd %%xmm2, %%xmm9 \n\t" // xmm9 -= xmm2
"subpd %%xmm6, %%xmm13 \n\t" // xmm13 -= xmm6
"mulpd %%xmm1, %%xmm9 \n\t" // xmm9 *= (1/alpha11);
"mulpd %%xmm1, %%xmm13 \n\t" // xmm13 *= (1/alpha11);
" \n\t"
"movaps %%xmm9, 2 * 16(%%rbx) \n\t" // store ( beta10 beta11 ) = xmm9
"movaps %%xmm13, 3 * 16(%%rbx) \n\t" // store ( beta12 beta13 ) = xmm13
"movlpd %%xmm9, (%%rcx) \n\t" // store ( gamma10 ) = xmm9[0]
"movhpd %%xmm9, (%%rcx,%%rdi) \n\t" // store ( gamma11 ) = xmm9[1]
"movlpd %%xmm13, (%%rdx) \n\t" // store ( gamma12 ) = xmm13[0]
"movhpd %%xmm13, (%%rdx,%%rdi) \n\t" // store ( gamma13 ) = xmm13[1]
"subq %%rsi, %%rcx \n\t" // c11 -= rs_c
"subq %%rsi, %%rdx \n\t" // c11_2 -= rs_c
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 3
" \n\t"
"movddup (0+0*4)*8(%%rax), %%xmm0 \n\t" // load xmm0 = (1/alpha00)
"movddup (0+1*4)*8(%%rax), %%xmm1 \n\t" // load xmm1 = alpha01
"movddup (0+2*4)*8(%%rax), %%xmm2 \n\t" // load xmm2 = alpha02
"movddup (0+3*4)*8(%%rax), %%xmm3 \n\t" // load xmm3 = alpha03
" \n\t"
"movaps %%xmm1, %%xmm5 \n\t" // xmm5 = xmm1
"movaps %%xmm2, %%xmm6 \n\t" // xmm6 = xmm2
"movaps %%xmm3, %%xmm7 \n\t" // xmm7 = xmm3
"mulpd %%xmm9, %%xmm1 \n\t" // xmm1 = alpha01 * ( beta10 beta11 )
"mulpd %%xmm13, %%xmm5 \n\t" // xmm5 = alpha01 * ( beta12 beta13 )
"mulpd %%xmm10, %%xmm2 \n\t" // xmm2 = alpha02 * ( beta20 beta21 )
"mulpd %%xmm14, %%xmm6 \n\t" // xmm6 = alpha02 * ( beta22 beta23 )
"mulpd %%xmm11, %%xmm3 \n\t" // xmm3 = alpha03 * ( beta30 beta31 )
"mulpd %%xmm15, %%xmm7 \n\t" // xmm7 = alpha03 * ( beta32 beta33 )
"addpd %%xmm2, %%xmm1 \n\t" // xmm1 += xmm2;
"addpd %%xmm6, %%xmm5 \n\t" // xmm5 += xmm6;
"addpd %%xmm3, %%xmm1 \n\t" // xmm1 += xmm3;
"addpd %%xmm7, %%xmm5 \n\t" // xmm5 += xmm7;
"subpd %%xmm1, %%xmm8 \n\t" // xmm8 -= xmm1
"subpd %%xmm5, %%xmm12 \n\t" // xmm12 -= xmm5
"mulpd %%xmm0, %%xmm8 \n\t" // xmm8 *= (1/alpha00);
"mulpd %%xmm0, %%xmm12 \n\t" // xmm12 *= (1/alpha00);
" \n\t"
"movaps %%xmm8, 0 * 16(%%rbx) \n\t" // store ( beta00 beta01 ) = xmm8
"movaps %%xmm12, 1 * 16(%%rbx) \n\t" // store ( beta02 beta03 ) = xmm12
"movlpd %%xmm8, (%%rcx) \n\t" // store ( gamma00 ) = xmm8[0]
"movhpd %%xmm8, (%%rcx,%%rdi) \n\t" // store ( gamma01 ) = xmm8[1]
"movlpd %%xmm12, (%%rdx) \n\t" // store ( gamma02 ) = xmm12[0]
"movhpd %%xmm12, (%%rdx,%%rdi) \n\t" // store ( gamma03 ) = xmm12[1]
" \n\t"
" \n\t"
" \n\t"
mov(%1, rbx) // load address of b11.
movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 )
movaps(mem(rbx, 1*16), xmm12) // xmm12 = ( beta02 beta03 )
movaps(mem(rbx, 2*16), xmm9) // xmm9 = ( beta10 beta11 )
movaps(mem(rbx, 3*16), xmm13) // xmm13 = ( beta12 beta13 )
movaps(mem(rbx, 4*16), xmm10) // xmm10 = ( beta20 beta21 )
movaps(mem(rbx, 5*16), xmm14) // xmm14 = ( beta22 beta23 )
movaps(mem(rbx, 6*16), xmm11) // xmm11 = ( beta30 beta31 )
movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 )
mov(%0, rax) // load address of a11
mov(%2, rcx) // load address of c11
mov(%3, rsi) // load rs_c
mov(%4, rdi) // load cs_c
sal(imm(3), rsi) // rs_c *= sizeof( double )
sal(imm(3), rdi) // cs_c *= sizeof( double )
add(rsi, rcx) // c11 += (4-1)*rs_c
add(rsi, rcx)
add(rsi, rcx)
lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c;
// iteration 0
movddup(mem(rax, (3+3*4)*8), xmm3) // load xmm3 = (1/alpha33)
mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33);
mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33);
movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11
movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15
movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0]
movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1]
movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0]
movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1]
sub(rsi, rcx) // c11 -= rs_c
sub(rsi, rdx) // c11_2 -= rs_c
// iteration 1
movddup(mem(rax, (2+2*4)*8), xmm2) // load xmm2 = (1/alpha22)
movddup(mem(rax, (2+3*4)*8), xmm3) // load xmm3 = alpha23
movaps(xmm3, xmm7) // xmm7 = xmm3
mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 )
mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 )
subpd(xmm3, xmm10) // xmm10 -= xmm3
subpd(xmm7, xmm14) // xmm14 -= xmm7
mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22);
mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22);
movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10
movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14
movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0]
movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1]
movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0]
movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1]
sub(rsi, rcx) // c11 -= rs_c
sub(rsi, rdx) // c11_2 -= rs_c
// iteration 2
movddup(mem(rax, (1+1*4)*8), xmm1) // load xmm1 = (1/alpha11)
movddup(mem(rax, (1+2*4)*8), xmm2) // load xmm2 = alpha12
movddup(mem(rax, (1+3*4)*8), xmm3) // load xmm3 = alpha13
movaps(xmm2, xmm6) // xmm6 = xmm2
movaps(xmm3, xmm7) // xmm7 = xmm3
mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 )
mulpd(xmm14, xmm6) // xmm6 = alpha12 * ( beta22 beta23 )
mulpd(xmm11, xmm3) // xmm3 = alpha13 * ( beta30 beta31 )
mulpd(xmm15, xmm7) // xmm7 = alpha13 * ( beta32 beta33 )
addpd(xmm3, xmm2) // xmm2 += xmm3;
addpd(xmm7, xmm6) // xmm6 += xmm7;
subpd(xmm2, xmm9) // xmm9 -= xmm2
subpd(xmm6, xmm13) // xmm13 -= xmm6
mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11);
mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11);
movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9
movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13
movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0]
movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1]
movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0]
movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1]
sub(rsi, rcx) // c11 -= rs_c
sub(rsi, rdx) // c11_2 -= rs_c
// iteration 3
movddup(mem(rax, (0+0*4)*8), xmm0) // load xmm0 = (1/alpha00)
movddup(mem(rax, (0+1*4)*8), xmm1) // load xmm1 = alpha01
movddup(mem(rax, (0+2*4)*8), xmm2) // load xmm2 = alpha02
movddup(mem(rax, (0+3*4)*8), xmm3) // load xmm3 = alpha03
movaps(xmm1, xmm5) // xmm5 = xmm1
movaps(xmm2, xmm6) // xmm6 = xmm2
movaps(xmm3, xmm7) // xmm7 = xmm3
mulpd(xmm9, xmm1) // xmm1 = alpha01 * ( beta10 beta11 )
mulpd(xmm13, xmm5) // xmm5 = alpha01 * ( beta12 beta13 )
mulpd(xmm10, xmm2) // xmm2 = alpha02 * ( beta20 beta21 )
mulpd(xmm14, xmm6) // xmm6 = alpha02 * ( beta22 beta23 )
mulpd(xmm11, xmm3) // xmm3 = alpha03 * ( beta30 beta31 )
mulpd(xmm15, xmm7) // xmm7 = alpha03 * ( beta32 beta33 )
addpd(xmm2, xmm1) // xmm1 += xmm2;
addpd(xmm6, xmm5) // xmm5 += xmm6;
addpd(xmm3, xmm1) // xmm1 += xmm3;
addpd(xmm7, xmm5) // xmm5 += xmm7;
subpd(xmm1, xmm8) // xmm8 -= xmm1
subpd(xmm5, xmm12) // xmm12 -= xmm5
mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00);
mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00);
movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8
movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12
movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0]
movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1]
movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0]
movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1]
: // output operands (none)
: // input operands
@@ -217,3 +220,4 @@ void bli_dtrsm_u_penryn_asm_4x4
}

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -34,7 +34,8 @@
#include "blis.h"
#include "bli_avx512_macros.h"
#define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"
#define A_L1_PREFETCH_DIST 4 //should be multiple of 2
@@ -305,8 +306,7 @@ void bli_dgemm_skx_asm_16x12_l2(
const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_;
__asm__ volatile
(
BEGIN_ASM
VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
VMOVAPD(YMM( 7), YMM(8))
@@ -525,6 +525,7 @@ void bli_dgemm_skx_asm_16x12_l2(
VZEROUPPER()
END_ASM(
: // output operands
: // input operands
[k] "m" (k),
@@ -543,5 +544,5 @@ void bli_dgemm_skx_asm_16x12_l2(
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
"zmm30", "zmm31", "memory"
);
)
}
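A plausible reading of the BEGIN_ASM/END_ASM pair used above, assuming it
simply wraps the old inline-asm boilerplate (a sketch only; the actual
definitions are in bli_x86_asm_macros.h, whose diff is suppressed below):

    #define BEGIN_ASM    __asm__ volatile (
    #define END_ASM(...) __VA_ARGS__ );

Under that assumption the operand and clobber lists become arguments of
END_ASM, which is why the old closing ");" becomes a bare ")". Note also
that these skx kernels define BLIS_ASM_SYNTAX_INTEL, so their uppercase
macros take operands in Intel order (destination first), unlike the
AT&T-order penryn conversions earlier in this diff.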


@@ -34,7 +34,8 @@
#include "blis.h"
#include "bli_avx512_macros.h"
#define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"
#define CACHELINE_SIZE 64 //size of cache line in bytes
@@ -335,8 +336,7 @@ void bli_sgemm_skx_asm_32x12_l2(
const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_;
__asm__ volatile
(
BEGIN_ASM
VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
VMOVAPD(YMM( 7), YMM(8))
@@ -550,6 +550,7 @@ void bli_sgemm_skx_asm_32x12_l2(
VZEROUPPER()
END_ASM(
: // output operands
: // input operands
[k] "m" (k),
@@ -568,5 +569,5 @@ void bli_sgemm_skx_asm_32x12_l2(
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
"zmm30", "zmm31", "memory"
);
)
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large